From c0f89eba6ebdec08460bd796fc62d6aef674d141 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 21 Nov 2019 11:29:49 -0800 Subject: Import and structure cleanup. PiperOrigin-RevId: 281795269 --- pkg/sentry/fsimpl/memfs/BUILD | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index 04d667273..952b20c51 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -1,10 +1,9 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -load("//tools/go_generics:defs.bzl", "go_template_instance") - go_template_instance( name = "dentry_list", out = "dentry_list.go", -- cgit v1.2.3 From 128948d6ae94009c6ad13a0bd96e03e45a560477 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 25 Nov 2019 15:20:25 -0800 Subject: Implement basic umounting for vfs2. This is required to test filesystems with a non-trivial implementation of FilesystemImpl.Release(). Propagation isn't handled yet, and umount isn't yet plumbed out to VirtualFilesystem.UmountAt(), but otherwise the implementation of umount is believed to be correct. - Move entering mountTable.seq writer critical sections to callers of mountTable.{insert,remove}Seqed. This is required since umount(2) must ensure that no new references are taken on the candidate mount after checking that it isn't busy, which is only possible by entering a vfs.mountTable.seq writer critical section before the check and remaining in it until after VFS.umountRecursiveLocked() is complete. (Linux does the same thing: fs/namespace.c:do_umount() => lock_mount_hash(), fs/pnode.c:propagate_mount_busy(), umount_tree(), unlock_mount_hash().) - It's not possible for dentry deletion to umount while only holding VFS.mountMu for reading, but it's also very unappealing to hold VFS.mountMu exclusively around e.g. gofer unlink RPCs. Introduce dentry.mu to avoid these problems. This means that VFS.mountMu is never acquired for reading, so change it to a sync.Mutex. PiperOrigin-RevId: 282444343 --- pkg/sentry/fsimpl/memfs/BUILD | 4 +- pkg/sentry/fsimpl/memfs/benchmark_test.go | 22 ++- pkg/sentry/vfs/README.md | 4 +- pkg/sentry/vfs/dentry.go | 128 +++++++++--- pkg/sentry/vfs/mount.go | 319 +++++++++++++++++++++--------- pkg/sentry/vfs/mount_test.go | 34 ++-- pkg/sentry/vfs/mount_unsafe.go | 60 +++--- pkg/sentry/vfs/resolving_path.go | 8 +- pkg/sentry/vfs/syscalls.go | 2 + pkg/sentry/vfs/vfs.go | 11 +- 10 files changed, 423 insertions(+), 169 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index 952b20c51..bc5c0b591 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -1,9 +1,10 @@ load("//tools/go_stateify:defs.bzl", "go_library") load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) +load("//tools/go_generics:defs.bzl", "go_template_instance") + go_template_instance( name = "dentry_list", out = "dentry_list.go", @@ -48,6 +49,7 @@ go_test( deps = [ ":memfs", "//pkg/abi/linux", + "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index a94b17db6..23a846c08 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -21,6 +21,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -160,6 +161,8 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { b.Fatalf("stat(%q) failed: %v", filePath, err) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -177,6 +180,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } + defer mntns.DecRef(vfsObj) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -186,7 +190,6 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { defer root.DecRef() vd := root vd.IncRef() - defer vd.DecRef() for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ @@ -219,6 +222,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) + vd.DecRef() + vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } @@ -243,6 +248,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { b.Fatalf("got wrong permissions (%0o)", stat.Mode) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -343,6 +350,8 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { b.Fatalf("stat(%q) failed: %v", filePath, err) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } @@ -360,6 +369,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } + defer mntns.DecRef(vfsObj) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -395,7 +405,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer vd.DecRef() for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ @@ -435,6 +444,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) + vd.DecRef() if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } @@ -459,6 +469,14 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { b.Fatalf("got wrong permissions (%0o)", stat.Mode) } } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() }) } } + +func init() { + // Turn off reference leak checking for a fair comparison between vfs1 and + // vfs2. + refs.SetLeakMode(refs.NoLeakChecking) +} diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 7847854bc..9aa133bcb 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -39,8 +39,8 @@ Mount references are held by: - Mount: Each referenced Mount holds a reference on its parent, which is the mount containing its mount point. -- VirtualFilesystem: A reference is held on all Mounts that are attached - (reachable by Mount traversal). +- VirtualFilesystem: A reference is held on each Mount that has not been + umounted. MountNamespace and FileDescription references are held by users of VFS. The expectation is that each `kernel.Task` holds a reference on its corresponding diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 45912fc58..09ed5a70e 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -16,6 +16,7 @@ package vfs import ( "fmt" + "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/syserror" @@ -50,7 +51,7 @@ import ( // and not inodes. Furthermore, when parties outside the scope of VFS can // rename inodes on such filesystems, VFS generally cannot "follow" the rename, // both due to synchronization issues and because it may not even be able to -// name the destination path; this implies that it would in fact be *incorrect* +// name the destination path; this implies that it would in fact be incorrect // for Dentries to be associated with inodes on such filesystems. Consequently, // operations that are inode operations in Linux are FilesystemImpl methods // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do @@ -84,6 +85,9 @@ type Dentry struct { // mounts is accessed using atomic memory operations. mounts uint32 + // mu synchronizes disowning and mounting over this Dentry. + mu sync.Mutex + // children are child Dentries. children map[string]*Dentry @@ -228,36 +232,48 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent panic("d is already disowned") } } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[d]; ok { - vfs.mountMu.RUnlock() + vfs.mountMu.Lock() + if mntns.mountpoints[d] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } - // Return with vfs.mountMu locked, which will be unlocked by - // AbortDeleteDentry or CommitDeleteDentry. + d.mu.Lock() + vfs.mountMu.Unlock() + // Return with d.mu locked to block attempts to mount over it; it will be + // unlocked by AbortDeleteDentry or CommitDeleteDentry. return nil } // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. -func (vfs *VirtualFilesystem) AbortDeleteDentry() { - vfs.mountMu.RUnlock() +func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { + d.mu.Unlock() } // CommitDeleteDentry must be called after the file represented by d is // deleted, and causes d to become disowned. // +// CommitDeleteDentry is a mutator of d and d.Parent(). +// // Preconditions: PrepareDeleteDentry was previously called on d. func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { - delete(d.parent.children, d.name) + if d.parent != nil { + delete(d.parent.children, d.name) + } d.setDisowned() - // TODO: lazily unmount mounts at d - vfs.mountMu.RUnlock() + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDisownedMountpoint(d) + } } // DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as // appropriate for in-memory filesystems that don't need to ensure that some // external state change succeeds before committing the deletion. +// +// DeleteDentry is a mutator of d and d.Parent(). +// +// Preconditions: d is a child Dentry. func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { return err @@ -266,6 +282,27 @@ func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) err return nil } +// ForceDeleteDentry causes d to become disowned. It should only be used in +// cases where VFS has no ability to stop the deletion (e.g. d represents the +// local state of a file on a remote filesystem on which the file has already +// been deleted). +// +// ForceDeleteDentry is a mutator of d and d.Parent(). +// +// Preconditions: d is a child Dentry. +func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { + if checkInvariants { + if d.parent == nil { + panic("d is independent") + } + if d.IsDisowned() { + panic("d is already disowned") + } + } + d.mu.Lock() + vfs.CommitDeleteDentry(d) +} + // PrepareRenameDentry must be called before attempting to rename the file // represented by from. If to is not nil, it represents the file that will be // replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the @@ -291,18 +328,21 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t } } } - vfs.mountMu.RLock() - if _, ok := mntns.mountpoints[from]; ok { - vfs.mountMu.RUnlock() + vfs.mountMu.Lock() + if mntns.mountpoints[from] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } if to != nil { - if _, ok := mntns.mountpoints[to]; ok { - vfs.mountMu.RUnlock() + if mntns.mountpoints[to] != 0 { + vfs.mountMu.Unlock() return syserror.EBUSY } + to.mu.Lock() } - // Return with vfs.mountMu locked, which will be unlocked by + from.mu.Lock() + vfs.mountMu.Unlock() + // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. return nil @@ -310,38 +350,76 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. -func (vfs *VirtualFilesystem) AbortRenameDentry() { - vfs.mountMu.RUnlock() +func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { + from.mu.Unlock() + if to != nil { + to.mu.Unlock() + } } // CommitRenameReplaceDentry must be called after the file represented by from // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. // +// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and +// to.Parent(). +// // Preconditions: PrepareRenameDentry was previously called on from and to. // newParent.Child(newName) == to. func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { - if to != nil { - to.setDisowned() - // TODO: lazily unmount mounts at d - } if newParent.children == nil { newParent.children = make(map[string]*Dentry) } newParent.children[newName] = from from.parent = newParent from.name = newName - vfs.mountMu.RUnlock() + from.mu.Unlock() + if to != nil { + to.setDisowned() + to.mu.Unlock() + if to.isMounted() { + vfs.forgetDisownedMountpoint(to) + } + } } // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // +// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and +// to.Parent(). +// // Preconditions: PrepareRenameDentry was previously called on from and to. func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.parent, to.parent = to.parent, from.parent from.name, to.name = to.name, from.name from.parent.children[from.name] = from to.parent.children[to.name] = to - vfs.mountMu.RUnlock() + from.mu.Unlock() + to.mu.Unlock() +} + +// forgetDisownedMountpoint is called when a mount point is deleted to umount +// all mounts using it in all other mount namespaces. +// +// forgetDisownedMountpoint is analogous to Linux's +// fs/namespace.c:__detach_mounts(). +func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) { + var ( + vdsToDecRef []VirtualDentry + mountsToDecRef []*Mount + ) + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + for mnt := range vfs.mountpoints[d] { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef) + } + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.decRef() + } } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 11702f720..198fb8067 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -38,16 +38,12 @@ import ( // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) type Mount struct { - // The lower 63 bits of refs are a reference count. The MSB of refs is set - // if the Mount has been eagerly unmounted, as by umount(2) without the - // MNT_DETACH flag. refs is accessed using atomic memory operations. - refs int64 - - // The lower 63 bits of writers is the number of calls to - // Mount.CheckBeginWrite() that have not yet been paired with a call to - // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. - // writers is accessed using atomic memory operations. - writers int64 + // vfs, fs, and root are immutable. References are held on fs and root. + // + // Invariant: root belongs to fs. + vfs *VirtualFilesystem + fs *Filesystem + root *Dentry // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on @@ -57,13 +53,29 @@ type Mount struct { // key.parent.fs. key mountKey - // fs, root, and ns are immutable. References are held on fs and root (but - // not ns). - // - // Invariant: root belongs to fs. - fs *Filesystem - root *Dentry - ns *MountNamespace + // ns is the namespace in which this Mount was mounted. ns is protected by + // VirtualFilesystem.mountMu. + ns *MountNamespace + + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly umounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // children is the set of all Mounts for which Mount.key.parent is this + // Mount. children is protected by VirtualFilesystem.mountMu. + children map[*Mount]struct{} + + // umounted is true if VFS.umountRecursiveLocked() has been called on this + // Mount. VirtualFilesystem does not hold a reference on Mounts for which + // umounted is true. umounted is protected by VirtualFilesystem.mountMu. + umounted bool + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 } // A MountNamespace is a collection of Mounts. @@ -73,13 +85,16 @@ type Mount struct { // // MountNamespace is analogous to Linux's struct mnt_namespace. type MountNamespace struct { - refs int64 // accessed using atomic memory operations - // root is the MountNamespace's root mount. root is immutable. root *Mount - // mountpoints contains all Dentries which are mount points in this - // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // mountpoints maps all Dentries which are mount points in this namespace + // to the number of Mounts for which they are mount points. mountpoints is + // protected by VirtualFilesystem.mountMu. // // mountpoints is used to determine if a Dentry can be moved or removed // (which requires that the Dentry is not a mount point in the calling @@ -89,7 +104,7 @@ type MountNamespace struct { // MountNamespace; this is required to ensure that // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate // correctly on unreferenced MountNamespaces. - mountpoints map[*Dentry]struct{} + mountpoints map[*Dentry]uint32 } // NewMountNamespace returns a new mount namespace with a root filesystem @@ -106,9 +121,10 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth } mntns := &MountNamespace{ refs: 1, - mountpoints: make(map[*Dentry]struct{}), + mountpoints: make(map[*Dentry]uint32), } mntns.root = &Mount{ + vfs: vfs, fs: fs, root: root, ns: mntns, @@ -136,8 +152,10 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti return err } vfs.mountMu.Lock() + vd.dentry.mu.Lock() for { if vd.dentry.IsDisowned() { + vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() root.decRef(fs) @@ -153,36 +171,208 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti if nextmnt == nil { break } - nextmnt.incRef() + // It's possible that nextmnt has been umounted but not disconnected, + // in which case vfs no longer holds a reference on it, and the last + // reference may be concurrently dropped even though we're holding + // vfs.mountMu. + if !nextmnt.tryIncMountedRef() { + break + } + // This can't fail since we're holding vfs.mountMu. nextmnt.root.incRef(nextmnt.fs) + vd.dentry.mu.Unlock() vd.DecRef() vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, } + vd.dentry.mu.Lock() } // TODO: Linux requires that either both the mount point and the mount root // are directories, or neither are, and returns ENOTDIR if this is not the // case. mntns := vd.mount.ns mnt := &Mount{ + vfs: vfs, fs: fs, root: root, ns: mntns, refs: 1, } - mnt.storeKey(vd.mount, vd.dentry) + vfs.mounts.seq.BeginWrite() + vfs.connectLocked(mnt, vd, mntns) + vfs.mounts.seq.EndWrite() + vd.dentry.mu.Unlock() + vfs.mountMu.Unlock() + return nil +} + +type umountRecursiveOptions struct { + // If eager is true, ensure that future calls to Mount.tryIncMountedRef() + // on umounted mounts fail. + // + // eager is analogous to Linux's UMOUNT_SYNC. + eager bool + + // If disconnectHierarchy is true, Mounts that are umounted hierarchically + // should be disconnected from their parents. (Mounts whose parents are not + // umounted, which in most cases means the Mount passed to the initial call + // to umountRecursiveLocked, are unconditionally disconnected for + // consistency with Linux.) + // + // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. + disconnectHierarchy bool +} + +// umountRecursiveLocked marks mnt and its descendants as umounted. It does not +// release mount or dentry references; instead, it appends VirtualDentries and +// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef +// respectively, and returns updated slices. (This is necessary because +// filesystem locks possibly taken by DentryImpl.DecRef() may precede +// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.) +// +// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. +func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { + if !mnt.umounted { + mnt.umounted = true + mountsToDecRef = append(mountsToDecRef, mnt) + if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) { + vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt)) + } + } + if opts.eager { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs < 0 { + break + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) { + break + } + } + } + for child := range mnt.children { + vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef) + } + return vdsToDecRef, mountsToDecRef +} + +// connectLocked makes vd the mount parent/point for mnt. It consumes +// references held by vd. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. d.mu must be locked. mnt.parent() == nil. +func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { + mnt.storeKey(vd) + if vd.mount.children == nil { + vd.mount.children = make(map[*Mount]struct{}) + } + vd.mount.children[mnt] = struct{}{} atomic.AddUint32(&vd.dentry.mounts, 1) - mntns.mountpoints[vd.dentry] = struct{}{} + mntns.mountpoints[vd.dentry]++ + vfs.mounts.insertSeqed(mnt) vfsmpmounts, ok := vfs.mountpoints[vd.dentry] if !ok { vfsmpmounts = make(map[*Mount]struct{}) vfs.mountpoints[vd.dentry] = vfsmpmounts } vfsmpmounts[mnt] = struct{}{} - vfs.mounts.Insert(mnt) - vfs.mountMu.Unlock() - return nil +} + +// disconnectLocked makes vd have no mount parent/point and returns its old +// mount parent/point with a reference held. +// +// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a +// writer critical section. mnt.parent() != nil. +func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { + vd := mnt.loadKey() + mnt.storeKey(VirtualDentry{}) + delete(vd.mount.children, mnt) + atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 + mnt.ns.mountpoints[vd.dentry]-- + if mnt.ns.mountpoints[vd.dentry] == 0 { + delete(mnt.ns.mountpoints, vd.dentry) + } + vfs.mounts.removeSeqed(mnt) + vfsmpmounts := vfs.mountpoints[vd.dentry] + delete(vfsmpmounts, mnt) + if len(vfsmpmounts) == 0 { + delete(vfs.mountpoints, vd.dentry) + } + return vd +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly umounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +func (mnt *Mount) incRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +func (mnt *Mount) decRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + var vd VirtualDentry + if mnt.parent() != nil { + mnt.vfs.mountMu.Lock() + mnt.vfs.mounts.seq.BeginWrite() + vd = mnt.vfs.disconnectLocked(mnt) + mnt.vfs.mounts.seq.EndWrite() + mnt.vfs.mountMu.Unlock() + } + mnt.root.decRef(mnt.fs) + mnt.fs.decRef() + if vd.Ok() { + vd.DecRef() + } + } +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { + if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { + vfs.mountMu.Lock() + vfs.mounts.seq.BeginWrite() + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{ + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.decRef() + } + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } } // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes @@ -231,12 +421,12 @@ retryFirst: } // getMountpointAt returns the mount point for the stack of Mounts including -// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// mnt. It takes a reference on the returned VirtualDentry. If no such mount // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). // // Preconditions: References are held on mnt and root. vfsroot is not (mnt, // mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -246,12 +436,12 @@ func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) // - We don't drop the caller's reference on mnt. retryFirst: epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() + parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryFirst } if parent == nil { - return nil, nil + return VirtualDentry{} } if !parent.tryIncMountedRef() { // Raced with umount. @@ -263,6 +453,11 @@ retryFirst: parent.decRef() goto retryFirst } + if !vfs.mounts.seq.ReadOk(epoch) { + point.decRef(parent.fs) + parent.decRef() + goto retryFirst + } mnt = parent d := point for { @@ -274,7 +469,7 @@ retryFirst: } retryNotFirst: epoch := vfs.mounts.seq.BeginRead() - parent, point := mnt.loadKey() + parent, point := mnt.parent(), mnt.point() if !vfs.mounts.seq.ReadOk(epoch) { goto retryNotFirst } @@ -301,43 +496,7 @@ retryFirst: mnt = parent d = point } - return mnt, d -} - -// tryIncMountedRef increments mnt's reference count and returns true. If mnt's -// reference count is already zero, or has been eagerly unmounted, -// tryIncMountedRef does nothing and returns false. -// -// tryIncMountedRef does not require that a reference is held on mnt. -func (mnt *Mount) tryIncMountedRef() bool { - for { - refs := atomic.LoadInt64(&mnt.refs) - if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted - return false - } - if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { - return true - } - } -} - -func (mnt *Mount) incRef() { - // In general, negative values for mnt.refs are valid because the MSB is - // the eager-unmount bit. - atomic.AddInt64(&mnt.refs, 1) -} - -func (mnt *Mount) decRef() { - refs := atomic.AddInt64(&mnt.refs, -1) - if refs&^math.MinInt64 == 0 { // mask out MSB - parent, point := mnt.loadKey() - if point != nil { - point.decRef(parent.fs) - parent.decRef() - } - mnt.root.decRef(mnt.fs) - mnt.fs.decRef() - } + return VirtualDentry{mnt, d} } // CheckBeginWrite increments the counter of in-progress write operations on @@ -360,7 +519,7 @@ func (mnt *Mount) EndWrite() { atomic.AddInt64(&mnt.writers, -1) } -// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +// Preconditions: VirtualFilesystem.mountMu must be locked. func (mnt *Mount) setReadOnlyLocked(ro bool) error { if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { return nil @@ -383,22 +542,6 @@ func (mnt *Mount) Filesystem() *Filesystem { return mnt.fs } -// IncRef increments mntns' reference count. -func (mntns *MountNamespace) IncRef() { - if atomic.AddInt64(&mntns.refs, 1) <= 1 { - panic("MountNamespace.IncRef() called without holding a reference") - } -} - -// DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef() { - if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { - // TODO: unmount mntns.root - } else if refs < 0 { - panic("MountNamespace.DecRef() called without holding a reference") - } -} - // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index f394d7483..adff0b94b 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -37,7 +37,7 @@ func TestMountTableInsertLookup(t *testing.T) { mt.Init() mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) mt.Insert(mount) if m := mt.Lookup(mount.parent(), mount.point()); m != mount { @@ -78,18 +78,10 @@ const enableComparativeBenchmarks = false func newBenchMount() *Mount { mount := &Mount{} - mount.storeKey(&Mount{}, &Dentry{}) + mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) return mount } -func vdkey(mnt *Mount) VirtualDentry { - parent, point := mnt.loadKey() - return VirtualDentry{ - mount: parent, - dentry: point, - } -} - func BenchmarkMountTableParallelLookup(b *testing.B) { for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { for _, numMounts := range benchNumMounts { @@ -101,7 +93,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) { for i := 0; i < numMounts; i++ { mount := newBenchMount() mt.Insert(mount) - keys = append(keys, vdkey(mount)) + keys = append(keys, mount.loadKey()) } var ready sync.WaitGroup @@ -153,7 +145,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := vdkey(mount) + key := mount.loadKey() ms[key] = mount keys = append(keys, key) } @@ -208,7 +200,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) { keys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { mount := newBenchMount() - key := vdkey(mount) + key := mount.loadKey() ms.Store(key, mount) keys = append(keys, key) } @@ -290,7 +282,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -325,7 +317,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { var ms sync.Map for i := 0; i < numMounts; i++ { mount := newBenchMount() - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } negkeys := make([]VirtualDentry, 0, numMounts) for i := 0; i < numMounts; i++ { @@ -379,7 +371,7 @@ func BenchmarkMountMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } } @@ -399,7 +391,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) { b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } } @@ -432,13 +424,13 @@ func BenchmarkMountMapRemove(b *testing.B) { ms := make(map[VirtualDentry]*Mount) for i := range mounts { mount := mounts[i] - ms[vdkey(mount)] = mount + ms[mount.loadKey()] = mount } b.ResetTimer() for i := range mounts { mount := mounts[i] - delete(ms, vdkey(mount)) + delete(ms, mount.loadKey()) } } @@ -454,12 +446,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) { var ms sync.Map for i := range mounts { mount := mounts[i] - ms.Store(vdkey(mount), mount) + ms.Store(mount.loadKey(), mount) } b.ResetTimer() for i := range mounts { mount := mounts[i] - ms.Delete(vdkey(mount)) + ms.Delete(mount.loadKey()) } } diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index c98b42f91..ab13fa461 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -38,16 +38,6 @@ type mountKey struct { point unsafe.Pointer // *Dentry } -// Invariant: mnt.key's fields are nil. parent and point are non-nil. -func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { - atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) - atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) -} - -func (mnt *Mount) loadKey() (*Mount, *Dentry) { - return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) -} - func (mnt *Mount) parent() *Mount { return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) } @@ -56,6 +46,19 @@ func (mnt *Mount) point() *Dentry { return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) } +func (mnt *Mount) loadKey() VirtualDentry { + return VirtualDentry{ + mount: mnt.parent(), + dentry: mnt.point(), + } +} + +// Invariant: mnt.key.parent == nil. vd.Ok(). +func (mnt *Mount) storeKey(vd VirtualDentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry)) +} + // mountTable maps (mount parent, mount point) pairs to mounts. It supports // efficient concurrent lookup, even in the presence of concurrent mutators // (provided mutation is sufficiently uncommon). @@ -201,9 +204,19 @@ loop: // Insert inserts the given mount into mt. // -// Preconditions: There are no concurrent mutators of mt. mt must not already -// contain a Mount with the same mount point and parent. +// Preconditions: mt must not already contain a Mount with the same mount point +// and parent. func (mt *mountTable) Insert(mount *Mount) { + mt.seq.BeginWrite() + mt.insertSeqed(mount) + mt.seq.EndWrite() +} + +// insertSeqed inserts the given mount into mt. +// +// Preconditions: mt.seq must be in a writer critical section. mt must not +// already contain a Mount with the same mount point and parent. +func (mt *mountTable) insertSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) // We're under the maximum load factor if: @@ -215,10 +228,8 @@ func (mt *mountTable) Insert(mount *Mount) { tcap := uintptr(1) << order if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { // Atomically insert the new element into the table. - mt.seq.BeginWrite() atomic.AddUint64(&mt.size, mtSizeLenOne) mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) - mt.seq.EndWrite() return } @@ -241,8 +252,6 @@ func (mt *mountTable) Insert(mount *Mount) { for { oldSlot := (*mountSlot)(oldCur) if oldSlot.value != nil { - // Don't need to lock mt.seq yet since newSlots isn't visible - // to readers. mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) } if oldCur == oldLast { @@ -252,11 +261,9 @@ func (mt *mountTable) Insert(mount *Mount) { } // Insert the new element into the new table. mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) - // Atomically switch to the new table. - mt.seq.BeginWrite() + // Switch to the new table. atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) atomic.StorePointer(&mt.slots, newSlots) - mt.seq.EndWrite() } // Preconditions: There are no concurrent mutators of the table (slots, cap). @@ -294,9 +301,18 @@ func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, has // Remove removes the given mount from mt. // -// Preconditions: There are no concurrent mutators of mt. mt must contain -// mount. +// Preconditions: mt must contain mount. func (mt *mountTable) Remove(mount *Mount) { + mt.seq.BeginWrite() + mt.removeSeqed(mount) + mt.seq.EndWrite() +} + +// removeSeqed removes the given mount from mt. +// +// Preconditions: mt.seq must be in a writer critical section. mt must contain +// mount. +func (mt *mountTable) removeSeqed(mount *Mount) { hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) tcap := uintptr(1) << (mt.size & mtSizeOrderMask) mask := tcap - 1 @@ -311,7 +327,6 @@ func (mt *mountTable) Remove(mount *Mount) { // backward until we either find an empty slot, or an element that // is already in its first-probed slot. (This is backward shift // deletion.) - mt.seq.BeginWrite() for { nextOff := (off + mountSlotBytes) & offmask nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) @@ -330,7 +345,6 @@ func (mt *mountTable) Remove(mount *Mount) { } atomic.StorePointer(&slot.value, nil) atomic.AddUint64(&mt.size, mtSizeLenNegOne) - mt.seq.EndWrite() return } if checkInvariants && slotValue == nil { diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8d05c8583..61bce6426 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -269,11 +269,11 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { parent = d } else if d == rp.mount.root { // At mount root ... - mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) - if mnt != nil { + vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + if vd.Ok() { // ... of non-root mount. - rp.nextMount = mnt - rp.nextStart = mntpt + rp.nextMount = vd.mount + rp.nextStart = vd.dentry return nil, resolveMountRootError{} } // ... of root mount. diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go index abde0feaa..49952b2cc 100644 --- a/pkg/sentry/vfs/syscalls.go +++ b/pkg/sentry/vfs/syscalls.go @@ -230,6 +230,8 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) err // // - VFS.SymlinkAt() // +// - VFS.UmountAt() +// // - VFS.UnlinkAt() // // - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 4a8a69540..88e865d86 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -16,9 +16,14 @@ // // Lock order: // -// Filesystem implementation locks +// FilesystemImpl/FileDescriptionImpl locks // VirtualFilesystem.mountMu +// Dentry.mu +// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.fsTypesMu +// +// Locking Dentry.mu in multiple Dentries requires holding +// VirtualFilesystem.mountMu. package vfs import ( @@ -33,7 +38,7 @@ type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. - mountMu sync.RWMutex + mountMu sync.Mutex // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly @@ -52,7 +57,7 @@ type VirtualFilesystem struct { // mountpoints maps mount points to mounts at those points in all // namespaces. mountpoints is protected by mountMu. // - // mountpoints is used to find mounts that must be unmounted due to + // mountpoints is used to find mounts that must be umounted due to // removal of a mount point Dentry from another mount namespace. ("A file // or directory that is a mount point in one namespace that is not a mount // point in another namespace, may be renamed, unlinked, or removed -- cgit v1.2.3 From b72e1b3c0873ea29d031db42e39ca053923eecff Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 25 Nov 2019 18:09:15 -0800 Subject: Minor VFS2 interface changes. - Remove the Filesystem argument from DentryImpl.*Ref(); in general DentryImpls that need the Filesystem for reference counting will probably also need it for other interface methods that don't plumb Filesystem, so it's easier to just store a pointer to the filesystem in the DentryImpl. - Add a pointer to the VirtualFilesystem to Filesystem, which is needed by the gofer client to disown dentries for cache eviction triggered by dentry reference count changes. - Rename FilesystemType.NewFilesystem to GetFilesystem; in some cases (e.g. sysfs, cgroupfs) it's much cleaner for there to be only one Filesystem that is used by all mounts, and in at least one case (devtmpfs) it's visibly incorrect not to do so, so NewFilesystem doesn't always actually create and return a *new* Filesystem. - Require callers of FileDescription.Init() to increment Mount/Dentry references. This is because the gofer client may, in the OpenAt() path, take a reference on a dentry with 0 references, which is safe due to synchronization that is outside the scope of this CL, and it would be safer to still have its implementation of DentryImpl.IncRef() check for an increment for 0 references in other cases. - Add FileDescription.TryIncRef. This is used by the gofer client to take references on "special file descriptions" (FDs for files such as pipes, sockets, and devices), which use per-FD handles (fids) instead of dentry-shared handles, for sync() and syncfs(). PiperOrigin-RevId: 282473364 --- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 4 +- pkg/sentry/fsimpl/ext/block_map_file.go | 4 +- pkg/sentry/fsimpl/ext/block_map_test.go | 4 +- pkg/sentry/fsimpl/ext/dentry.go | 10 ++-- pkg/sentry/fsimpl/ext/ext.go | 10 ++-- pkg/sentry/fsimpl/ext/ext_test.go | 14 +++--- pkg/sentry/fsimpl/ext/extent_file.go | 6 +-- pkg/sentry/fsimpl/ext/extent_test.go | 4 +- pkg/sentry/fsimpl/ext/file_description.go | 4 +- pkg/sentry/fsimpl/ext/inode.go | 28 +++++++----- pkg/sentry/fsimpl/memfs/benchmark_test.go | 6 +-- pkg/sentry/fsimpl/memfs/filesystem.go | 15 ++++-- pkg/sentry/fsimpl/memfs/memfs.go | 16 +++---- pkg/sentry/fsimpl/memfs/named_pipe.go | 5 +- pkg/sentry/fsimpl/memfs/pipe_test.go | 2 +- pkg/sentry/vfs/dentry.go | 24 ++++++---- pkg/sentry/vfs/file_description.go | 33 +++++++++++-- pkg/sentry/vfs/file_description_impl_util_test.go | 2 +- pkg/sentry/vfs/filesystem.go | 20 ++++++-- pkg/sentry/vfs/filesystem_type.go | 10 ++-- pkg/sentry/vfs/mount.go | 56 ++++++++++++----------- pkg/sentry/vfs/resolving_path.go | 8 ++-- pkg/sentry/vfs/syscalls.go | 2 +- pkg/sentry/vfs/testutil.go | 12 ++--- pkg/sentry/vfs/vfs.go | 8 ++-- 25 files changed, 186 insertions(+), 121 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 10a8083a0..94cd74095 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -50,7 +50,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err @@ -81,7 +81,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil { + if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } return func() { diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index cea89bcd9..a2d8c3ad6 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -154,7 +154,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds toRead = len(dst) } - n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff)) if n < toRead { return n, syserror.EIO } @@ -174,7 +174,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds curChildOff := relFileOff % childCov for i := startIdx; i < endIdx; i++ { var childPhyBlk uint32 - err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) + err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk) if err != nil { return read, err } diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 213aa3919..181727ef7 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -87,12 +87,14 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) regFile := regularFile{ inode: inode{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, diskInode: &disklayout.InodeNew{ InodeOld: disklayout.InodeOld{ SizeLo: getMockBMFileFize(), }, }, - dev: bytes.NewReader(mockDisk), blkSize: uint64(mockBMBlkSize), }, } diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 054fb42b6..a080cb189 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -41,16 +41,18 @@ func newDentry(in *inode) *dentry { } // IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { +func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { +func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { - d.inode.decRef(vfsfs.Impl().(*filesystem)) +func (d *dentry) DecRef() { + // FIXME(b/134676337): filesystem.mu may not be locked as required by + // inode.decRef(). + d.inode.decRef() } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index f10accafc..4b7d17dc6 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -40,14 +40,14 @@ var _ vfs.FilesystemType = (*FilesystemType)(nil) // Currently there are two ways of mounting an ext(2/3/4) fs: // 1. Specify a mount with our internal special MountType in the OCI spec. // 2. Expose the device to the container and mount it from application layer. -func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) { +func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) { if opts.InternalData == nil { // User mount call. // TODO(b/134676337): Open the device specified by `source` and return that. panic("unimplemented") } - // NewFilesystem call originated from within the sentry. + // GetFilesystem call originated from within the sentry. devFd, ok := opts.InternalData.(int) if !ok { return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device") @@ -91,8 +91,8 @@ func isCompatible(sb disklayout.SuperBlock) bool { return true } -// NewFilesystem implements vfs.FilesystemType.NewFilesystem. -func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. @@ -103,7 +103,7 @@ func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials } fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { return nil, nil, err diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 1aa2bd6a4..307e4d68c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -66,7 +66,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() return nil, nil, nil, nil, err @@ -509,27 +509,27 @@ func TestIterDirents(t *testing.T) { } wantDirents := []vfs.Dirent{ - vfs.Dirent{ + { Name: ".", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "..", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "lost+found", Type: linux.DT_DIR, }, - vfs.Dirent{ + { Name: "file.txt", Type: linux.DT_REG, }, - vfs.Dirent{ + { Name: "bigfile.txt", Type: linux.DT_REG, }, - vfs.Dirent{ + { Name: "symlink.txt", Type: linux.DT_LNK, }, diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 38b68a2d3..3d3ebaca6 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -99,7 +99,7 @@ func (f *extentFile) buildExtTree() error { func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) { var header disklayout.ExtentHeader off := entry.PhysicalBlock() * f.regFile.inode.blkSize - err := readFromDisk(f.regFile.inode.dev, int64(off), &header) + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header) if err != nil { return nil, err } @@ -115,7 +115,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla curEntry = &disklayout.ExtentIdx{} } - err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry) + err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry) if err != nil { return nil, err } @@ -229,7 +229,7 @@ func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byt toRead = len(dst) } - n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart)) + n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart)) if n < toRead { return n, syserror.EIO } diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index 42d0a484b..a2382daa3 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -180,13 +180,15 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] mockExtentFile := &extentFile{ regFile: regularFile{ inode: inode{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, diskInode: &disklayout.InodeNew{ InodeOld: disklayout.InodeOld{ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), }, }, blkSize: mockExtentBlkSize, - dev: bytes.NewReader(mockDisk), }, }, } diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 4d18b28cb..5eca2b83f 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -36,11 +36,11 @@ type fileDescription struct { } func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode + return fd.vfsfd.Dentry().Impl().(*dentry).inode } // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index e6c847a71..24249525c 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -16,7 +16,6 @@ package ext import ( "fmt" - "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -42,13 +41,13 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory operations. refs int64 + // fs is the containing filesystem. + fs *filesystem + // inodeNum is the inode number of this inode on disk. This is used to // identify inodes within the ext filesystem. inodeNum uint32 - // dev represents the underlying device. Same as filesystem.dev. - dev io.ReaderAt - // blkSize is the fs data block size. Same as filesystem.sb.BlockSize(). blkSize uint64 @@ -81,10 +80,10 @@ func (in *inode) tryIncRef() bool { // decRef decrements the inode ref count and releases the inode resources if // the ref count hits 0. // -// Precondition: Must have locked fs.mu. -func (in *inode) decRef(fs *filesystem) { +// Precondition: Must have locked filesystem.mu. +func (in *inode) decRef() { if refs := atomic.AddInt64(&in.refs, -1); refs == 0 { - delete(fs.inodeCache, in.inodeNum) + delete(in.fs.inodeCache, in.inodeNum) } else if refs < 0 { panic("ext.inode.decRef() called without holding a reference") } @@ -117,8 +116,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { // Build the inode based on its type. inode := inode{ + fs: fs, inodeNum: inodeNum, - dev: fs.dev, blkSize: blkSize, diskInode: diskInode, } @@ -154,11 +153,14 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + mnt := rp.Mount() switch in.impl.(type) { case *regularFile: var fd regularFileFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -167,8 +169,10 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) fd.flags = flags + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -177,7 +181,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v } var fd symlinkFD fd.flags = flags - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index 23a846c08..ea6417ce7 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -176,7 +176,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -365,7 +365,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } @@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { } defer mountPoint.DecRef() // Create and mount the submount. - if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil { + if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index f006c40cd..08a9cb8ef 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -159,7 +159,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op return nil, err } } - inode.incRef() // vfsd.IncRef(&fs.vfsfs) + inode.incRef() return vfsd, nil } @@ -379,6 +379,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr return nil, err } } + mnt := rp.Mount() switch impl := i.impl.(type) { case *regularFile: var fd regularFileFD @@ -386,12 +387,14 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) if fd.writable { - if err := rp.Mount().CheckBeginWrite(); err != nil { + if err := mnt.CheckBeginWrite(); err != nil { return nil, err } - // Mount.EndWrite() is called by regularFileFD.Release(). + // mnt.EndWrite() is called by regularFileFD.Release(). } - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data = impl.data[:0] @@ -405,7 +408,9 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) fd.flags = flags return &fd.vfsfd, nil case *symlink: diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index 64c851c1a..4cb2a4e0f 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -52,10 +52,10 @@ type filesystem struct { nextInoMinusOne uint64 // accessed using atomic memory operations } -// NewFilesystem implements vfs.FilesystemType.NewFilesystem. -func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { var fs filesystem - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil } @@ -99,17 +99,17 @@ func (fs *filesystem) newDentry(inode *inode) *dentry { } // IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef(vfsfs *vfs.Filesystem) { +func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { +func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef(vfsfs *vfs.Filesystem) { +func (d *dentry) DecRef() { d.inode.decRef() } @@ -266,11 +266,11 @@ type fileDescription struct { } func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem) + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { - return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode + return fd.vfsfd.Dentry().Impl().(*dentry).inode } // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go index 732ed7c58..91cb4b1fc 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -54,6 +54,9 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v if err != nil { return nil, err } - fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + mnt := rp.Mount() + mnt.IncRef() + vfsd.IncRef() + fd.vfsfd.Init(&fd, mnt, vfsd) return &fd.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go index 0674b81a3..a3a870571 100644 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ b/pkg/sentry/fsimpl/memfs/pipe_test.go @@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := vfs.New() vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 09ed5a70e..40f4c1d09 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -118,7 +118,7 @@ func (d *Dentry) Impl() DentryImpl { type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. - IncRef(fs *Filesystem) + IncRef() // TryIncRef increments the Dentry's reference count and returns true. If // the Dentry's reference count is zero, TryIncRef may do nothing and @@ -126,10 +126,10 @@ type DentryImpl interface { // guarantee that the Dentry is coherent with the state of the filesystem.) // // TryIncRef does not require that a reference is held on the Dentry. - TryIncRef(fs *Filesystem) bool + TryIncRef() bool // DecRef decrements the Dentry's reference count. - DecRef(fs *Filesystem) + DecRef() } // IsDisowned returns true if d is disowned. @@ -146,16 +146,20 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } -func (d *Dentry) incRef(fs *Filesystem) { - d.impl.IncRef(fs) +// IncRef increments d's reference count. +func (d *Dentry) IncRef() { + d.impl.IncRef() } -func (d *Dentry) tryIncRef(fs *Filesystem) bool { - return d.impl.TryIncRef(fs) +// TryIncRef increments d's reference count and returns true. If d's reference +// count is zero, TryIncRef may instead do nothing and return false. +func (d *Dentry) TryIncRef() bool { + return d.impl.TryIncRef() } -func (d *Dentry) decRef(fs *Filesystem) { - d.impl.DecRef(fs) +// DecRef decrements d's reference count. +func (d *Dentry) DecRef() { + d.impl.DecRef() } // These functions are exported so that filesystem implementations can use @@ -420,6 +424,6 @@ func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) { vd.DecRef() } for _, mnt := range mountsToDecRef { - mnt.decRef() + mnt.DecRef() } } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 3a9665800..34007eb57 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -47,15 +47,14 @@ type FileDescription struct { impl FileDescriptionImpl } -// Init must be called before first use of fd. It takes references on mnt and -// d. +// Init must be called before first use of fd. It takes ownership of references +// on mnt and d held by the caller. func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { fd.refs = 1 fd.vd = VirtualDentry{ mount: mnt, dentry: d, } - fd.vd.IncRef() fd.impl = impl } @@ -64,6 +63,18 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl } +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + // VirtualDentry returns the location at which fd was opened. It does not take // a reference on the returned VirtualDentry. func (fd *FileDescription) VirtualDentry() VirtualDentry { @@ -75,6 +86,22 @@ func (fd *FileDescription) IncRef() { atomic.AddInt64(&fd.refs, 1) } +// TryIncRef increments fd's reference count and returns true. If fd's +// reference count is already zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fd. +func (fd *FileDescription) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fd.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) { + return true + } + } +} + // DecRef decrements fd's reference count. func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 511b829fc..a5561dcbe 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -90,7 +90,7 @@ func TestGenCountFD(t *testing.T) { vfsObj := New() // vfs.New() vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create testfs root mount: %v", err) } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 7a074b718..76ff8cf51 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -33,29 +33,41 @@ type Filesystem struct { // operations. refs int64 + // vfs is the VirtualFilesystem that uses this Filesystem. vfs is + // immutable. + vfs *VirtualFilesystem + // impl is the FilesystemImpl associated with this Filesystem. impl is // immutable. This should be the last field in Dentry. impl FilesystemImpl } // Init must be called before first use of fs. -func (fs *Filesystem) Init(impl FilesystemImpl) { +func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) { fs.refs = 1 + fs.vfs = vfsObj fs.impl = impl } +// VirtualFilesystem returns the containing VirtualFilesystem. +func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem { + return fs.vfs +} + // Impl returns the FilesystemImpl associated with fs. func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } -func (fs *Filesystem) incRef() { +// IncRef increments fs' reference count. +func (fs *Filesystem) IncRef() { if atomic.AddInt64(&fs.refs, 1) <= 1 { - panic("Filesystem.incRef() called without holding a reference") + panic("Filesystem.IncRef() called without holding a reference") } } -func (fs *Filesystem) decRef() { +// DecRef decrements fs' reference count. +func (fs *Filesystem) DecRef() { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { fs.impl.Release() } else if refs < 0 { diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index f401ad7f3..c335e206d 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -25,21 +25,21 @@ import ( // // FilesystemType is analogous to Linux's struct file_system_type. type FilesystemType interface { - // NewFilesystem returns a Filesystem configured by the given options, + // GetFilesystem returns a Filesystem configured by the given options, // along with its mount root. A reference is taken on the returned // Filesystem and Dentry. - NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) + GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) } -// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. -type NewFilesystemOptions struct { +// GetFilesystemOptions contains options to FilesystemType.GetFilesystem. +type GetFilesystemOptions struct { // Data is the string passed as the 5th argument to mount(2), which is // usually a comma-separated list of filesystem-specific mount options. Data string // InternalData holds opaque FilesystemType-specific data. There is // intentionally no way for applications to specify InternalData; if it is - // not nil, the call to NewFilesystem originates from within the sentry. + // not nil, the call to GetFilesystem originates from within the sentry. InternalData interface{} } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 198fb8067..1c3b2e987 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -110,12 +110,12 @@ type MountNamespace struct { // NewMountNamespace returns a new mount namespace with a root filesystem // configured by the given arguments. A reference is taken on the returned // MountNamespace. -func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { fsType := vfs.getFilesystemType(fsTypeName) if fsType == nil { return nil, syserror.ENODEV } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) if err != nil { return nil, err } @@ -133,13 +133,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return mntns, nil } -// NewMount creates and mounts a new Filesystem. -func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { +// NewMount creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error { fsType := vfs.getFilesystemType(fsTypeName) if fsType == nil { return syserror.ENODEV } - fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) if err != nil { return err } @@ -147,8 +147,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) if err != nil { - root.decRef(fs) - fs.decRef() + root.DecRef() + fs.DecRef() return err } vfs.mountMu.Lock() @@ -158,8 +158,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() - root.decRef(fs) - fs.decRef() + root.DecRef() + fs.DecRef() return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -179,7 +179,7 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti break } // This can't fail since we're holding vfs.mountMu. - nextmnt.root.incRef(nextmnt.fs) + nextmnt.root.IncRef() vd.dentry.mu.Unlock() vd.DecRef() vd = VirtualDentry{ @@ -229,7 +229,7 @@ type umountRecursiveOptions struct { // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef // respectively, and returns updated slices. (This is necessary because // filesystem locks possibly taken by DentryImpl.DecRef() may precede -// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.) +// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.) // // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). // @@ -322,13 +322,15 @@ func (mnt *Mount) tryIncMountedRef() bool { } } -func (mnt *Mount) incRef() { +// IncRef increments mnt's reference count. +func (mnt *Mount) IncRef() { // In general, negative values for mnt.refs are valid because the MSB is // the eager-unmount bit. atomic.AddInt64(&mnt.refs, 1) } -func (mnt *Mount) decRef() { +// DecRef decrements mnt's reference count. +func (mnt *Mount) DecRef() { refs := atomic.AddInt64(&mnt.refs, -1) if refs&^math.MinInt64 == 0 { // mask out MSB var vd VirtualDentry @@ -339,8 +341,8 @@ func (mnt *Mount) decRef() { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.decRef(mnt.fs) - mnt.fs.decRef() + mnt.root.DecRef() + mnt.fs.DecRef() if vd.Ok() { vd.DecRef() } @@ -368,7 +370,7 @@ func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { vd.DecRef() } for _, mnt := range mountsToDecRef { - mnt.decRef() + mnt.DecRef() } } else if refs < 0 { panic("MountNamespace.DecRef() called without holding a reference") @@ -413,7 +415,7 @@ retryFirst: // Raced with umount. continue } - mnt.decRef() + mnt.DecRef() mnt = next d = next.root } @@ -447,15 +449,15 @@ retryFirst: // Raced with umount. goto retryFirst } - if !point.tryIncRef(parent.fs) { + if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. - parent.decRef() + parent.DecRef() goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.decRef(parent.fs) - parent.decRef() + point.DecRef() + parent.DecRef() goto retryFirst } mnt = parent @@ -480,19 +482,19 @@ retryFirst: // Raced with umount. goto retryNotFirst } - if !point.tryIncRef(parent.fs) { + if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.decRef() + parent.DecRef() goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.decRef(parent.fs) - parent.decRef() + point.DecRef() + parent.DecRef() goto retryNotFirst } - d.decRef(mnt.fs) - mnt.decRef() + d.DecRef() + mnt.DecRef() mnt = parent d = point } diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 61bce6426..621f5a6f8 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -149,20 +149,20 @@ func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { func (rp *ResolvingPath) decRefStartAndMount() { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.decRef(rp.mount.fs) + rp.start.DecRef() } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.decRef() + rp.mount.DecRef() } } func (rp *ResolvingPath) releaseErrorState() { if rp.nextStart != nil { - rp.nextStart.decRef(rp.nextMount.fs) + rp.nextStart.DecRef() rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.decRef() + rp.nextMount.DecRef() rp.nextMount = nil } } diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go index 49952b2cc..436151afa 100644 --- a/pkg/sentry/vfs/syscalls.go +++ b/pkg/sentry/vfs/syscalls.go @@ -63,7 +63,7 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede mount: rp.mount, dentry: d, } - rp.mount.incRef() + rp.mount.IncRef() vfs.putResolvingPath(rp) return vd, nil } diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index 70b192ece..593144cb7 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -33,10 +33,10 @@ type FDTestFilesystem struct { vfsfs Filesystem } -// NewFilesystem implements FilesystemType.NewFilesystem. -func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) { +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) { var fs FDTestFilesystem - fs.vfsfs.Init(&fs) + fs.vfsfs.Init(vfsObj, &fs) return &fs.vfsfs, fs.NewDentry(), nil } @@ -126,14 +126,14 @@ func (fs *FDTestFilesystem) NewDentry() *Dentry { } // IncRef implements DentryImpl.IncRef. -func (d *fdTestDentry) IncRef(vfsfs *Filesystem) { +func (d *fdTestDentry) IncRef() { } // TryIncRef implements DentryImpl.TryIncRef. -func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool { +func (d *fdTestDentry) TryIncRef() bool { return true } // DecRef implements DentryImpl.DecRef. -func (d *fdTestDentry) DecRef(vfsfs *Filesystem) { +func (d *fdTestDentry) DecRef() { } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 88e865d86..f0cd3ffe5 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -116,15 +116,15 @@ func (vd VirtualDentry) Ok() bool { // IncRef increments the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) IncRef() { - vd.mount.incRef() - vd.dentry.incRef(vd.mount.fs) + vd.mount.IncRef() + vd.dentry.IncRef() } // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) DecRef() { - vd.dentry.decRef(vd.mount.fs) - vd.mount.decRef() + vd.dentry.DecRef() + vd.mount.DecRef() } // Mount returns the Mount associated with vd. It does not take a reference on -- cgit v1.2.3 From 3e832bec1b48b95951c3b83eb5a7b70f29b1f10f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 3 Dec 2019 13:31:07 -0800 Subject: Point TODOs to gvisor.dev PiperOrigin-RevId: 283610781 --- pkg/sentry/fsimpl/proc/filesystems.go | 2 +- pkg/sentry/fsimpl/proc/mounts.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go index c36c4aff5..0e016bca5 100644 --- a/pkg/sentry/fsimpl/proc/filesystems.go +++ b/pkg/sentry/fsimpl/proc/filesystems.go @@ -19,7 +19,7 @@ package proc // +stateify savable type filesystemsData struct{} -// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for +// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for // filesystemsData. We would need to retrive filesystem names from // vfs.VirtualFilesystem. Also needs vfs replacement for // fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go index e81b1e910..8683cf677 100644 --- a/pkg/sentry/fsimpl/proc/mounts.go +++ b/pkg/sentry/fsimpl/proc/mounts.go @@ -16,7 +16,7 @@ package proc import "gvisor.dev/gvisor/pkg/sentry/kernel" -// TODO(b/138862512): Implement mountInfoFile and mountsFile. +// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile. // mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. // -- cgit v1.2.3 From 46651a7d26559bdc69d460bdeb4de5968212d615 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 10 Dec 2019 18:16:47 -0800 Subject: Add most VFS methods for syscalls. PiperOrigin-RevId: 284892289 --- pkg/abi/linux/file.go | 10 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 6 +- pkg/sentry/fsimpl/ext/ext_test.go | 29 +- pkg/sentry/fsimpl/memfs/benchmark_test.go | 2 +- pkg/sentry/fsimpl/memfs/pipe_test.go | 6 +- pkg/sentry/vfs/BUILD | 1 - pkg/sentry/vfs/file_description.go | 93 ++++++ pkg/sentry/vfs/file_description_impl_util_test.go | 10 +- pkg/sentry/vfs/filesystem.go | 22 ++ pkg/sentry/vfs/mount.go | 69 +++- pkg/sentry/vfs/options.go | 12 + pkg/sentry/vfs/syscalls.go | 237 -------------- pkg/sentry/vfs/vfs.go | 378 ++++++++++++++++++++++ 13 files changed, 606 insertions(+), 269 deletions(-) delete mode 100644 pkg/sentry/vfs/syscalls.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index c9ee098f4..0f014d27f 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -144,9 +144,13 @@ const ( ModeCharacterDevice = S_IFCHR ModeNamedPipe = S_IFIFO - ModeSetUID = 04000 - ModeSetGID = 02000 - ModeSticky = 01000 + S_ISUID = 04000 + S_ISGID = 02000 + S_ISVTX = 01000 + + ModeSetUID = S_ISUID + ModeSetGID = S_ISGID + ModeSticky = S_ISVTX ModeUserAll = 0700 ModeUserRead = 0400 diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 94cd74095..177ce2cb9 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -81,7 +81,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf ctx := contexttest.Context(b) creds := auth.CredentialsFromContext(ctx) - if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil { + if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: int(f.Fd()), + }, + }); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } return func() { diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 307e4d68c..e9f756732 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -147,55 +147,54 @@ func TestSeek(t *testing.T) { t.Fatalf("vfsfs.OpenAt failed: %v", err) } - if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil { t.Errorf("expected seek position 0, got %d and error %v", n, err) } - stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{}) + stat, err := fd.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err) } // We should be able to seek beyond the end of file. size := int64(stat.Size) - if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { + if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } - if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { + if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err) } // Make sure negative offsets work with SEEK_CUR. - if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } // Make sure SEEK_END works with regular files. - switch fd.Impl().(type) { - case *regularFileFD: + if _, ok := fd.Impl().(*regularFileFD); ok { // Seek back to 0. - if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { + if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", 0, n, err) } // Seek forward beyond EOF. - if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { + if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil { t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err) } // EINVAL should be returned if the resulting offset is negative. - if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { + if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL { t.Errorf("expected error EINVAL but got %v", err) } } @@ -456,7 +455,7 @@ func TestRead(t *testing.T) { want := make([]byte, 1) for { n, err := f.Read(want) - fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) + fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}) if diff := cmp.Diff(got, want); diff != "" { t.Errorf("file data mismatch (-want +got):\n%s", diff) @@ -464,7 +463,7 @@ func TestRead(t *testing.T) { // Make sure there is no more file data left after getting EOF. if n == 0 || err == io.EOF { - if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { + if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 { t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image) } @@ -574,7 +573,7 @@ func TestIterDirents(t *testing.T) { } cb := &iterDirentsCb{} - if err = fd.Impl().IterDirents(ctx, cb); err != nil { + if err = fd.IterDirents(ctx, cb); err != nil { t.Fatalf("dir fd.IterDirents() failed: %v", err) } diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index ea6417ce7..4a7a94a52 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { } defer mountPoint.DecRef() // Create and mount the submount. - if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil { + if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) } filePathBuilder.WriteString(mountPointName) diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go index a3a870571..5bf527c80 100644 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ b/pkg/sentry/fsimpl/memfs/pipe_test.go @@ -194,7 +194,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { readData := make([]byte, 1) dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{}) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) if err != syserror.ErrWouldBlock { t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) } @@ -207,7 +207,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { writeData := []byte(msg) src := usermem.BytesIOSequence(writeData) - bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{}) + bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) if err != nil { t.Fatalf("error writing to pipe %q: %v", fileName, err) } @@ -220,7 +220,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { readData := make([]byte, len(msg)) dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{}) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) if err != nil { t.Fatalf("error reading from pipe %q: %v", fileName, err) } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 74a325309..59237c3b9 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -19,7 +19,6 @@ go_library( "options.go", "permissions.go", "resolving_path.go", - "syscalls.go", "testutil.go", "vfs.go", ], diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 34007eb57..4473dfce8 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -241,3 +241,96 @@ type IterDirentsCallback interface { // called. Handle(dirent Dirent) bool } + +// OnClose is called when a file descriptor representing the FileDescription is +// closed. Returning a non-nil error should not prevent the file descriptor +// from being closed. +func (fd *FileDescription) OnClose(ctx context.Context) error { + return fd.impl.OnClose(ctx) +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { + flags, err := fd.impl.StatusFlags(ctx) + flags |= linux.O_LARGEFILE + return flags, err +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + return fd.impl.SetStatusFlags(ctx, flags) +} + +// Stat returns metadata for the file represented by fd. +func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + return fd.impl.Stat(ctx, opts) +} + +// SetStat updates metadata for the file represented by fd. +func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + return fd.impl.SetStat(ctx, opts) +} + +// StatFS returns metadata for the filesystem containing the file represented +// by fd. +func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.impl.StatFS(ctx) +} + +// PRead reads from the file represented by fd into dst, starting at the given +// offset, and returns the number of bytes read. PRead is permitted to return +// partial reads with a nil error. +func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return fd.impl.PRead(ctx, dst, offset, opts) +} + +// Read is similar to PRead, but does not specify an offset. +func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return fd.impl.Read(ctx, dst, opts) +} + +// PWrite writes src to the file represented by fd, starting at the given +// offset, and returns the number of bytes written. PWrite is permitted to +// return partial writes with a nil error. +func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return fd.impl.PWrite(ctx, src, offset, opts) +} + +// Write is similar to PWrite, but does not specify an offset. +func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return fd.impl.Write(ctx, src, opts) +} + +// IterDirents invokes cb on each entry in the directory represented by fd. If +// IterDirents has been called since the last call to Seek, it continues +// iteration from the end of the last call. +func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return fd.impl.IterDirents(ctx, cb) +} + +// Seek changes fd's offset (assuming one exists) and returns its new value. +func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.impl.Seek(ctx, offset, whence) +} + +// Sync has the semantics of fsync(2). +func (fd *FileDescription) Sync(ctx context.Context) error { + return fd.impl.Sync(ctx) +} + +// ConfigureMMap mutates opts to implement mmap(2) for the file represented by +// fd. +func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.impl.ConfigureMMap(ctx, opts) +} + +// Ioctl implements the ioctl(2) syscall. +func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return fd.impl.Ioctl(ctx, uio, args) +} + +// SyncFS instructs the filesystem containing fd to execute the semantics of +// syncfs(2). +func (fd *FileDescription) SyncFS(ctx context.Context) error { + return fd.vd.mount.fs.impl.Sync(ctx) +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index a5561dcbe..ac7799296 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) { // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) ioseq := usermem.BytesIOSequence(buf) - n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err := fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) { } // A second read without seeking is still at EOF. - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 0 || err != io.EOF { t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) } // Seeking to the beginning of the file causes it to be regenerated. - n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET) + n, err = fd.Seek(ctx, 0, linux.SEEK_SET) if n != 0 || err != nil { t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) } - n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{}) + n, err = fd.Read(ctx, ioseq, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) } @@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) { } // PRead at the beginning of the file also causes it to be regenerated. - n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{}) + n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{}) if n != 1 || (err != nil && err != io.EOF) { t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 76ff8cf51..dfbd2372a 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -47,6 +47,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) { fs.refs = 1 fs.vfs = vfsObj fs.impl = impl + vfsObj.filesystemsMu.Lock() + vfsObj.filesystems[fs] = struct{}{} + vfsObj.filesystemsMu.Unlock() } // VirtualFilesystem returns the containing VirtualFilesystem. @@ -66,9 +69,28 @@ func (fs *Filesystem) IncRef() { } } +// TryIncRef increments fs' reference count and returns true. If fs' reference +// count is zero, TryIncRef does nothing and returns false. +// +// TryIncRef does not require that a reference is held on fs. +func (fs *Filesystem) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&fs.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) { + return true + } + } +} + // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef() { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.vfs.filesystemsMu.Lock() + delete(fs.vfs.filesystems, fs) + fs.vfs.filesystemsMu.Unlock() fs.impl.Release() } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1c3b2e987..ec23ab0dd 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -18,6 +18,7 @@ import ( "math" "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -133,13 +134,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return mntns, nil } -// NewMount creates and mounts a Filesystem configured by the given arguments. -func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error { +// MountAt creates and mounts a Filesystem configured by the given arguments. +func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { fsType := vfs.getFilesystemType(fsTypeName) if fsType == nil { return syserror.ENODEV } - fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return err } @@ -207,6 +208,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti return nil } +// UmountAt removes the Mount at the given path. +func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { + if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { + return syserror.EINVAL + } + + // MNT_FORCE is currently unimplemented except for the permission check. + if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { + return syserror.EPERM + } + + vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) + if err != nil { + return err + } + defer vd.DecRef() + if vd.dentry != vd.mount.root { + return syserror.EINVAL + } + vfs.mountMu.Lock() + if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } + + // TODO(jamieliu): Linux special-cases umount of the caller's root, which + // we don't implement yet (we'll just fail it since the caller holds a + // reference on it). + + vfs.mounts.seq.BeginWrite() + if opts.Flags&linux.MNT_DETACH == 0 { + if len(vd.mount.children) != 0 { + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + // We are holding a reference on vd.mount. + expectedRefs := int64(1) + if !vd.mount.umounted { + expectedRefs = 2 + } + if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + return syserror.EBUSY + } + } + vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ + eager: opts.Flags&linux.MNT_DETACH == 0, + disconnectHierarchy: true, + }, nil, nil) + vfs.mounts.seq.EndWrite() + vfs.mountMu.Unlock() + for _, vd := range vdsToDecRef { + vd.DecRef() + } + for _, mnt := range mountsToDecRef { + mnt.DecRef() + } + return nil +} + type umountRecursiveOptions struct { // If eager is true, ensure that future calls to Mount.tryIncMountedRef() // on umounted mounts fail. diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 3aa73d911..3ecbc8fc1 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -46,6 +46,12 @@ type MknodOptions struct { DevMinor uint32 } +// MountOptions contains options to VirtualFilesystem.MountAt(). +type MountOptions struct { + // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). + GetFilesystemOptions GetFilesystemOptions +} + // OpenOptions contains options to VirtualFilesystem.OpenAt() and // FilesystemImpl.OpenAt(). type OpenOptions struct { @@ -114,6 +120,12 @@ type StatOptions struct { Sync uint32 } +// UmountOptions contains options to VirtualFilesystem.UmountAt(). +type UmountOptions struct { + // Flags contains flags as specified for umount2(2). + Flags uint32 +} + // WriteOptions contains options to FileDescription.PWrite(), // FileDescriptionImpl.PWrite(), FileDescription.Write(), and // FileDescriptionImpl.Write(). diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go deleted file mode 100644 index 436151afa..000000000 --- a/pkg/sentry/vfs/syscalls.go +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/syserror" -) - -// PathOperation specifies the path operated on by a VFS method. -// -// PathOperation is passed to VFS methods by pointer to reduce memory copying: -// it's somewhat large and should never escape. (Options structs are passed by -// pointer to VFS and FileDescription methods for the same reason.) -type PathOperation struct { - // Root is the VFS root. References on Root are borrowed from the provider - // of the PathOperation. - // - // Invariants: Root.Ok(). - Root VirtualDentry - - // Start is the starting point for the path traversal. References on Start - // are borrowed from the provider of the PathOperation (i.e. the caller of - // the VFS method to which the PathOperation was passed). - // - // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. - Start VirtualDentry - - // Path is the pathname traversed by this operation. - Pathname string - - // If FollowFinalSymlink is true, and the Dentry traversed by the final - // path component represents a symbolic link, the symbolic link should be - // followed. - FollowFinalSymlink bool -} - -// GetDentryAt returns a VirtualDentry representing the given path, at which a -// file must exist. A reference is taken on the returned VirtualDentry. -func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return VirtualDentry{}, err - } - for { - d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) - if err == nil { - vd := VirtualDentry{ - mount: rp.mount, - dentry: d, - } - rp.mount.IncRef() - vfs.putResolvingPath(rp) - return vd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return VirtualDentry{}, err - } - } -} - -// MkdirAt creates a directory at the given path. -func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { - // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is - // also honored." - mkdir(2) - opts.Mode &= 01777 - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } - for { - err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// MknodAt creates a file of the given mode at the given path. It returns an -// error from the syserror package. -func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil - } - for { - if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil { - vfs.putResolvingPath(rp) - return nil - } - // Handle mount traversals. - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return err - } - } -} - -// OpenAt returns a FileDescription providing access to the file at the given -// path. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { - // Remove: - // - // - O_LARGEFILE, which we always report in FileDescription status flags - // since only 64-bit architectures are supported at this time. - // - // - O_CLOEXEC, which affects file descriptors and therefore must be - // handled outside of VFS. - // - // - Unknown flags. - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE - // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. - if opts.Flags&linux.O_SYNC != 0 { - opts.Flags |= linux.O_DSYNC - } - // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified - // with O_DIRECTORY and a writable access mode (to ensure that it fails on - // filesystem implementations that do not support it). - if opts.Flags&linux.O_TMPFILE != 0 { - if opts.Flags&linux.O_DIRECTORY == 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_CREAT != 0 { - return nil, syserror.EINVAL - } - if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { - return nil, syserror.EINVAL - } - } - // O_PATH causes most other flags to be ignored. - if opts.Flags&linux.O_PATH != 0 { - opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH - } - // "On Linux, the following bits are also honored in mode: [S_ISUID, - // S_ISGID, S_ISVTX]" - open(2) - opts.Mode &= 07777 - - if opts.Flags&linux.O_NOFOLLOW != 0 { - pop.FollowFinalSymlink = false - } - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } - if opts.Flags&linux.O_DIRECTORY != 0 { - rp.mustBeDir = true - rp.mustBeDirOrig = true - } - for { - fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return fd, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - -// StatAt returns metadata for the file at the given path. -func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statx{}, err - } - for { - stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) - if err == nil { - vfs.putResolvingPath(rp) - return stat, nil - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return linux.Statx{}, err - } - } -} - -// StatusFlags returns file description status flags. -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags. -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - -// TODO: -// -// - VFS.SyncAllFilesystems() for sync(2) -// -// - Something for syncfs(2) -// -// - VFS.LinkAt() -// -// - VFS.ReadlinkAt() -// -// - VFS.RenameAt() -// -// - VFS.RmdirAt() -// -// - VFS.SetStatAt() -// -// - VFS.StatFSAt() -// -// - VFS.SymlinkAt() -// -// - VFS.UmountAt() -// -// - VFS.UnlinkAt() -// -// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index f0cd3ffe5..7262b0d0a 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -20,6 +20,7 @@ // VirtualFilesystem.mountMu // Dentry.mu // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry +// VirtualFilesystem.filesystemsMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding @@ -28,6 +29,11 @@ package vfs import ( "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. @@ -67,6 +73,11 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // filesystems contains all Filesystems. filesystems is protected by + // filesystemsMu. + filesystemsMu sync.Mutex + filesystems map[*Filesystem]struct{} + // fsTypes contains all FilesystemTypes that are usable in the // VirtualFilesystem. fsTypes is protected by fsTypesMu. fsTypesMu sync.RWMutex @@ -77,12 +88,379 @@ type VirtualFilesystem struct { func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), + filesystems: make(map[*Filesystem]struct{}), fsTypes: make(map[string]FilesystemType), } vfs.mounts.Init() return vfs } +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Pathname string + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return VirtualDentry{}, err + } + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.IncRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// LinkAt creates a hard link at newpop representing the existing file at +// oldpop. +func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + rp, err := vfs.getResolvingPath(creds, newpop) + if err != nil { + oldVD.DecRef() + return err + } + for { + err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) + if err == nil { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 0777 | linux.S_ISVTX + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// MknodAt creates a file of the given mode at the given path. It returns an +// error from the syserror package. +func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil + } + for { + if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil { + vfs.putResolvingPath(rp) + return nil + } + // Handle mount traversals. + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// ReadlinkAt returns the target of the symbolic link at the given path. +func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return "", err + } + for { + target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return target, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// RenameAt renames the file at oldpop to newpop. +func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { + oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) + if err != nil { + return err + } + rp, err := vfs.getResolvingPath(creds, newpop) + if err != nil { + oldVD.DecRef() + return err + } + for { + err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts) + if err == nil { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + oldVD.DecRef() + vfs.putResolvingPath(rp) + return err + } + } +} + +// RmdirAt removes the directory at the given path. +func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.RmdirAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SetStatAt changes metadata for the file at the given path. +func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statx{}, err + } + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatFSAt returns metadata for the filesystem containing the file at the +// given path. +func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statfs{}, err + } + for { + statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return statfs, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statfs{}, err + } + } +} + +// SymlinkAt creates a symbolic link at the given path with the given target. +func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// UnlinkAt deletes the non-directory file at the given path. +func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.UnlinkAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// SyncAllFilesystems has the semantics of Linux's sync(2). +func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { + fss := make(map[*Filesystem]struct{}) + vfs.filesystemsMu.Lock() + for fs := range vfs.filesystems { + if !fs.TryIncRef() { + continue + } + fss[fs] = struct{}{} + } + vfs.filesystemsMu.Unlock() + var retErr error + for fs := range fss { + if err := fs.impl.Sync(ctx); err != nil && retErr == nil { + retErr = err + } + fs.DecRef() + } + return retErr +} + // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). -- cgit v1.2.3 From 481dbfa5ab24ec2c0752b9e748d3617285603c4e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 11 Dec 2019 13:40:57 -0800 Subject: Add vfs.Pathname{WithDeleted,ForGetcwd}. The former is needed for vfs.FileDescription to implement memmap.MappingIdentity, and the latter is needed to implement getcwd(2). PiperOrigin-RevId: 285051855 --- pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/filesystem.go | 8 ++ pkg/sentry/fsimpl/memfs/BUILD | 1 + pkg/sentry/fsimpl/memfs/filesystem.go | 8 ++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/filesystem.go | 54 +++++++++++- pkg/sentry/vfs/filesystem_impl_util.go | 26 ++++++ pkg/sentry/vfs/pathname.go | 153 +++++++++++++++++++++++++++++++++ pkg/sentry/vfs/testutil.go | 9 ++ 9 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 pkg/sentry/vfs/pathname.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 7ccff8b0d..880b7bcd3 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/context", diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 2d15e8aaf..e7aa3b41b 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -20,6 +20,7 @@ import ( "sync" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -441,3 +442,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index bc5c0b591..0cc751eb8 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -32,6 +32,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/fspath", "//pkg/sentry/arch", "//pkg/sentry/context", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 08a9cb8ef..1f2a5122a 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -582,3 +583,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error inode.decLinksLocked() return nil } + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 59237c3b9..e3e554b88 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -17,6 +17,7 @@ go_library( "mount.go", "mount_unsafe.go", "options.go", + "pathname.go", "permissions.go", "resolving_path.go", "testutil.go", diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index dfbd2372a..8011eba3f 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" ) @@ -185,5 +186,56 @@ type FilesystemImpl interface { // UnlinkAt removes the non-directory file at rp. UnlinkAt(ctx context.Context, rp *ResolvingPath) error - // TODO: d_path(); extended attributes; inotify_add_watch(); bind() + // PrependPath prepends a path from vd to vd.Mount().Root() to b. + // + // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered + // before vd.Mount().Root(), PrependPath should stop prepending path + // components and return a PrependPathAtVFSRootError. + // + // If traversal of vd.Dentry()'s ancestors encounters an independent + // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a + // descendant of vd.Mount().Root()), PrependPath should stop prepending + // path components and return a PrependPathAtNonMountRootError. + // + // Filesystems for which Dentries do not have meaningful paths may prepend + // an arbitrary descriptive string to b and then return a + // PrependPathSyntheticError. + // + // Most implementations can acquire the appropriate locks to ensure that + // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of + // its ancestors, then call GenericPrependPath. + // + // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. + PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error + + // TODO: extended attributes; inotify_add_watch(); bind() +} + +// PrependPathAtVFSRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter the contextual VFS root. +type PrependPathAtVFSRootError struct{} + +// Error implements error.Error. +func (PrependPathAtVFSRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached VFS root" +} + +// PrependPathAtNonMountRootError is returned by implementations of +// FilesystemImpl.PrependPath() when they encounter an independent ancestor +// Dentry that is not the Mount root. +type PrependPathAtNonMountRootError struct{} + +// Error implements error.Error. +func (PrependPathAtNonMountRootError) Error() string { + return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root" +} + +// PrependPathSyntheticError is returned by implementations of +// FilesystemImpl.PrependPath() for which prepended names do not represent real +// paths. +type PrependPathSyntheticError struct{} + +// Error implements error.Error. +func (PrependPathSyntheticError) Error() string { + return "vfs.FilesystemImpl.PrependPath() prepended synthetic name" } diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 465e610e0..7315a588e 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,6 +16,8 @@ package vfs import ( "strings" + + "gvisor.dev/gvisor/pkg/fspath" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string { } return m } + +// GenericPrependPath may be used by implementations of +// FilesystemImpl.PrependPath() for which a single statically-determined lock +// or set of locks is sufficient to ensure its preconditions (as opposed to +// e.g. per-Dentry locks). +// +// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for +// vd.Dentry() and all of its ancestors. +func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error { + mnt, d := vd.mount, vd.dentry + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + return PrependPathAtVFSRootError{} + } + if d == mnt.root { + return nil + } + if d.parent == nil { + return PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go new file mode 100644 index 000000000..8e155654f --- /dev/null +++ b/pkg/sentry/vfs/pathname.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +var fspathBuilderPool = sync.Pool{ + New: func() interface{} { + return &fspath.Builder{} + }, +} + +func getFSPathBuilder() *fspath.Builder { + return fspathBuilderPool.Get().(*fspath.Builder) +} + +func putFSPathBuilder(b *fspath.Builder) { + // No methods can be called on b after b.String(), so reset it to its zero + // value (as returned by fspathBuilderPool.New) instead. + *b = fspath.Builder{} + fspathBuilderPool.Put(b) +} + +// PathnameWithDeleted returns an absolute pathname to vd, consistent with +// Linux's d_path(). In particular, if vd.Dentry() has been disowned, +// PathnameWithDeleted appends " (deleted)" to the returned pathname. +func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + + origD := vd.dentry +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + // GenericPrependPath() will have returned + // PrependPathAtVFSRootError in this case since it checks + // against vfsroot before mnt.root, but other implementations + // of FilesystemImpl.PrependPath() may return nil instead. + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + // continue loop + case PrependPathSyntheticError: + // Skip prepending "/" and appending " (deleted)". + return b.String(), nil + case PrependPathAtVFSRootError, PrependPathAtNonMountRootError: + break loop + default: + return "", err + } + } + b.PrependByte('/') + if origD.IsDisowned() { + b.AppendString(" (deleted)") + } + return b.String(), nil +} + +// PathnameForGetcwd returns an absolute pathname to vd, consistent with +// Linux's sys_getcwd(). +func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { + if vd.dentry.IsDisowned() { + return "", syserror.ENOENT + } + + b := getFSPathBuilder() + defer putFSPathBuilder(b) + haveRef := false + defer func() { + if haveRef { + vd.DecRef() + } + }() + unreachable := false +loop: + for { + err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) + switch err.(type) { + case nil: + if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { + break loop + } + nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + if !nextVD.Ok() { + unreachable = true + break loop + } + if haveRef { + vd.DecRef() + } + vd = nextVD + haveRef = true + case PrependPathAtVFSRootError: + break loop + case PrependPathAtNonMountRootError, PrependPathSyntheticError: + unreachable = true + break loop + default: + return "", err + } + } + b.PrependByte('/') + if unreachable { + b.PrependString("(unreachable)") + } + return b.String(), nil +} + +// As of this writing, we do not have equivalents to: +// +// - d_absolute_path(), which returns EINVAL if (effectively) any call to +// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError. +// +// - dentry_path(), which does not walk up mounts (and only returns the path +// relative to Filesystem root), but also appends "//deleted" for disowned +// Dentries. +// +// These should be added as necessary. diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index 593144cb7..7a1d9e383 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -15,7 +15,10 @@ package vfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -114,6 +117,12 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err return syserror.EPERM } +// PrependPath implements FilesystemImpl.PrependPath. +func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { + b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry))) + return PrependPathSyntheticError{} +} + type fdTestDentry struct { vfsd Dentry } -- cgit v1.2.3 From 007707a0726602bc99fc0dccaf2994ad967b9d96 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Thu, 12 Dec 2019 11:19:18 -0800 Subject: Implement kernfs. PiperOrigin-RevId: 285231002 --- pkg/abi/linux/file.go | 23 + pkg/abi/linux/fs.go | 7 + pkg/sentry/fsimpl/kernfs/BUILD | 60 +++ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 131 +++++ pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 207 ++++++++ pkg/sentry/fsimpl/kernfs/filesystem.go | 691 +++++++++++++++++++++++++ pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 492 ++++++++++++++++++ pkg/sentry/fsimpl/kernfs/kernfs.go | 405 +++++++++++++++ pkg/sentry/fsimpl/kernfs/kernfs_test.go | 423 +++++++++++++++ 9 files changed, 2439 insertions(+) create mode 100644 pkg/sentry/fsimpl/kernfs/BUILD create mode 100644 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go create mode 100644 pkg/sentry/fsimpl/kernfs/fd_impl_util.go create mode 100644 pkg/sentry/fsimpl/kernfs/filesystem.go create mode 100644 pkg/sentry/fsimpl/kernfs/inode_impl_util.go create mode 100644 pkg/sentry/fsimpl/kernfs/kernfs.go create mode 100644 pkg/sentry/fsimpl/kernfs/kernfs_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 0f014d27f..16791d03e 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -286,6 +286,29 @@ func (m FileMode) String() string { return strings.Join(s, "|") } +// DirentType maps file types to dirent types appropriate for (struct +// dirent)::d_type. +func (m FileMode) DirentType() uint8 { + switch m.FileType() { + case ModeSocket: + return DT_SOCK + case ModeSymlink: + return DT_LNK + case ModeRegular: + return DT_REG + case ModeBlockDevice: + return DT_BLK + case ModeDirectory: + return DT_DIR + case ModeCharacterDevice: + return DT_CHR + case ModeNamedPipe: + return DT_FIFO + default: + return DT_UNKNOWN + } +} + var modeExtraBits = abi.FlagSet{ { Flag: ModeSetUID, diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index b416e3472..2c652baa2 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -92,3 +92,10 @@ const ( SYNC_FILE_RANGE_WRITE = 2 SYNC_FILE_RANGE_WAIT_AFTER = 4 ) + +// Flag argument to renameat2(2), from include/uapi/linux/fs.h. +const ( + RENAME_NOREPLACE = (1 << 0) // Don't overwrite target. + RENAME_EXCHANGE = (1 << 1) // Exchange src and dst. + RENAME_WHITEOUT = (1 << 2) // Whiteout src. +) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD new file mode 100644 index 000000000..52596c090 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -0,0 +1,60 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "slot_list", + out = "slot_list.go", + package = "kernfs", + prefix = "slot", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*slot", + "Linker": "*slot", + }, +) + +go_library( + name = "kernfs", + srcs = [ + "dynamic_bytes_file.go", + "fd_impl_util.go", + "filesystem.go", + "inode_impl_util.go", + "kernfs.go", + "slot_list.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/log", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "kernfs_test", + size = "small", + srcs = ["kernfs_test.go"], + deps = [ + ":kernfs", + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go new file mode 100644 index 000000000..30c06baf0 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -0,0 +1,131 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DynamicBytesFile implements kernfs.Inode and represents a read-only +// file whose contents are backed by a vfs.DynamicBytesSource. +// +// Must be initialized with Init before first use. +type DynamicBytesFile struct { + InodeAttrs + InodeNoopRefCount + InodeNotDirectory + InodeNotSymlink + + data vfs.DynamicBytesSource +} + +// Init intializes a dynamic bytes file. +func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) { + f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444) + f.data = data +} + +// Open implements Inode.Open. +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &DynamicBytesFD{} + fd.Init(rp.Mount(), vfsd, f.data, flags) + return &fd.vfsfd, nil +} + +// SetStat implements Inode.SetStat. +func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a +// DynamicBytesFile. +// +// Must be initialized with Init before first use. +type DynamicBytesFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DynamicBytesFileDescriptionImpl + + vfsfd vfs.FileDescription + inode Inode + flags uint32 +} + +// Init initializes a DynamicBytesFD. +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { + m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + fd.flags = flags + fd.inode = d.Impl().(*Dentry).inode + fd.SetDataSource(data) + fd.vfsfd.Init(fd, m, d) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DynamicBytesFD) Release() {} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { + // DynamicBytesFiles are immutable. + return syserror.EPERM +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go new file mode 100644 index 000000000..d6c18937a --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -0,0 +1,207 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory +// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not +// compatible with dynamic directories. +// +// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling +// IterDirents callback. The IterDirents callback therefore cannot hash or +// unhash children, or recursively call IterDirents on the same underlying +// inode. +// +// Must be initialize with Init before first use. +type GenericDirectoryFD struct { + vfs.FileDescriptionDefaultImpl + vfs.DirectoryFileDescriptionDefaultImpl + + vfsfd vfs.FileDescription + children *OrderedChildren + flags uint32 + off int64 +} + +// Init initializes a GenericDirectoryFD. +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { + m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. + fd.children = children + fd.flags = flags + fd.vfsfd.Init(fd, m, d) +} + +// VFSFileDescription returns a pointer to the vfs.FileDescription representing +// this object. +func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription { + return &fd.vfsfd +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts) +} + +// Read implmenets vfs.FileDescriptionImpl.Read. +func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts) +} + +// PRead implmenets vfs.FileDescriptionImpl.PRead. +func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) +} + +// Release implements vfs.FileDecriptionImpl.Release. +func (fd *GenericDirectoryFD) Release() {} + +func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem() +} + +func (fd *GenericDirectoryFD) inode() Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// o.mu when calling cb. +func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + vfsFS := fd.filesystem() + fs := vfsFS.Impl().(*Filesystem) + vfsd := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Handle ".". + if fd.off == 0 { + stat := fd.inode().Stat(vfsFS) + dirent := vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: stat.Ino, + NextOff: 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle "..". + if fd.off == 1 { + parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + stat := parentInode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: "..", + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: 2, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + // Handle static children. + fd.children.mu.RLock() + defer fd.children.mu.RUnlock() + // fd.off accounts for "." and "..", but fd.children do not track + // these. + childIdx := fd.off - 2 + for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { + inode := it.Dentry.Impl().(*Dentry).inode + stat := inode.Stat(vfsFS) + dirent := vfs.Dirent{ + Name: it.Name, + Type: linux.FileMode(stat.Mode).DirentType(), + Ino: stat.Ino, + NextOff: fd.off + 1, + } + if !cb.Handle(dirent) { + return nil + } + fd.off++ + } + + return nil +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fs := fd.filesystem().Impl().(*Filesystem) + fs.mu.Lock() + defer fs.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := fd.filesystem() + inode := fd.inode() + return inode.Stat(fs), nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + fs := fd.filesystem() + inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode + return inode.SetStat(fs, opts) +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go new file mode 100644 index 000000000..db486b6c1 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -0,0 +1,691 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file implements vfs.FilesystemImpl for kernfs. + +package kernfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepExistingLocked resolves rp.Component() in parent directory vfsd. +// +// stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postcondition: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + // Directory searchable? + if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + d.dirMu.Lock() + nextVFSD, err := rp.ResolveComponent(vfsd) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + if nextVFSD != nil { + // Cached dentry exists, revalidate. + next := nextVFSD.Impl().(*Dentry) + if !next.inode.Valid(ctx) { + d.dirMu.Lock() + rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD) + d.dirMu.Unlock() + fs.deferDecRef(nextVFSD) // Reference from Lookup. + nextVFSD = nil + } + } + if nextVFSD == nil { + // Dentry isn't cached; it either doesn't exist or failed + // revalidation. Attempt to resolve it via Lookup. + name := rp.Component() + var err error + nextVFSD, err = d.inode.Lookup(ctx, name) + // Reference on nextVFSD dropped by a corresponding Valid. + if err != nil { + return nil, err + } + d.InsertChild(name, nextVFSD) + } + next := nextVFSD.Impl().(*Dentry) + + // Resolve any symlink at current path component. + if rp.ShouldFollowSymlink() && d.isSymlink() { + // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". + target, err := next.inode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink + + } + rp.Advance() + return nextVFSD, nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Done() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if rp.MustBeDir() && !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) { + vfsd := rp.Start() + for !rp.Final() { + var err error + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + if err != nil { + return nil, nil, err + } + } + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, d.inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked for at least reading. parentInode +// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return "", err + } + if childVFSD != nil { + return "", syserror.EEXIST + } + if parentVFSD.IsDisowned() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { + parentVFSD := vfsd.Parent() + if parentVFSD == nil { + return syserror.EBUSY + } + if parentVFSD.IsDisowned() { + return syserror.ENOENT + } + if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + return nil +} + +// checkRenameLocked checks that a rename operation may be performed on the +// target dentry across the given set of parent directories. The target dentry +// may be nil. +// +// Precondition: isDir(dstInode) == true. +func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error { + srcDir := src.Parent() + if srcDir == nil { + return syserror.EBUSY + } + if srcDir.IsDisowned() { + return syserror.ENOENT + } + if dstDir.IsDisowned() { + return syserror.ENOENT + } + // Check for creation permissions on dst dir. + if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + + return nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + + if opts.CheckSearchable { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + d := vd.Dentry().Impl().(*Dentry) + if d.isDir() { + return syserror.EPERM + } + + child, err := parentInode.NewLink(ctx, pc, d.inode) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewDir(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + new, err := parentInode.NewNode(ctx, pc, opts) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + return nil +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by kernfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + ats := vfs.AccessTypesForOpenFlags(opts.Flags) + + // Do not create new file. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return nil, err + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } + + // May create new file. + mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return inode.Open(rp, vfsd, opts.Flags) + } +afterTrailingSymlink: + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return nil, err + } + if childVFSD == nil { + // Already checked for searchability above; now check for writability. + if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + child, err := parentInode.NewFile(ctx, pc, opts) + if err != nil { + return nil, err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + } + // Open existing file or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + childDentry := childVFSD.Impl().(*Dentry) + childInode := childDentry.inode + if rp.ShouldFollowSymlink() { + if childDentry.isSymlink() { + target, err := childInode.Readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + } + if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + return childInode.Open(rp, childVFSD, opts.Flags) +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + d, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + if !d.Impl().(*Dentry).isSymlink() { + return "", syserror.EINVAL + } + return inode.Readlink(ctx) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 + exchange := opts.Flags&linux.RENAME_EXCHANGE != 0 + whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0 + if exchange && (noReplace || whiteout) { + // Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE. + return syserror.EINVAL + } + if exchange || whiteout { + // Exchange and Whiteout flags are not supported on kernfs. + return syserror.EINVAL + } + + fs.mu.Lock() + defer fs.mu.Lock() + + mnt := rp.Mount() + if mnt != vd.Mount() { + return syserror.EXDEV + } + + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + + srcVFSD := vd.Dentry() + srcDirVFSD := srcVFSD.Parent() + + // Can we remove the src dentry? + if err := checkDeleteLocked(rp, srcVFSD); err != nil { + return err + } + + // Can we create the dst dentry? + var dstVFSD *vfs.Dentry + pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode) + switch err { + case nil: + // Ok, continue with rename as replacement. + case syserror.EEXIST: + if noReplace { + // Won't overwrite existing node since RENAME_NOREPLACE was requested. + return syserror.EEXIST + } + dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) + if err != nil { + panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) + } + default: + return err + } + + mntns := vfs.MountNamespaceFromContext(ctx) + virtfs := rp.VirtualFilesystem() + + srcDirDentry := srcDirVFSD.Impl().(*Dentry) + dstDirDentry := dstDirVFSD.Impl().(*Dentry) + + // We can't deadlock here due to lock ordering because we're protected from + // concurrent renames by fs.mu held for writing. + srcDirDentry.dirMu.Lock() + defer srcDirDentry.dirMu.Unlock() + dstDirDentry.dirMu.Lock() + defer dstDirDentry.dirMu.Unlock() + + if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { + return err + } + srcDirInode := srcDirDentry.inode + replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + if err != nil { + virtfs.AbortRenameDentry(srcVFSD, dstVFSD) + return err + } + virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if !vfsd.Impl().(*Dentry).isDir() { + return syserror.ENOTDIR + } + if inode.HasChildren() { + return syserror.ENOTEMPTY + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + return inode.SetStat(fs.VFSFilesystem(), opts) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statx{}, err + } + return inode.Stat(fs.VFSFilesystem()), nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child, err := parentInode.NewSymlink(ctx, pc, target) + if err != nil { + return err + } + parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, _, err := fs.walkExistingLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(rp, vfsd); err != nil { + return err + } + if vfsd.Impl().(*Dentry).isDir() { + return syserror.EISDIR + } + virtfs := rp.VirtualFilesystem() + parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry.dirMu.Lock() + defer parentDentry.dirMu.Unlock() + if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { + virtfs.AbortDeleteDentry(vfsd) + return err + } + virtfs.CommitDeleteDentry(vfsd) + return nil +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go new file mode 100644 index 000000000..7b45b702a --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -0,0 +1,492 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// InodeNoopRefCount partially implements the Inode interface, specifically the +// inodeRefs sub interface. InodeNoopRefCount implements a simple reference +// count for inodes, performing no extra actions when references are obtained or +// released. This is suitable for simple file inodes that don't reference any +// resources. +type InodeNoopRefCount struct { +} + +// IncRef implements Inode.IncRef. +func (n *InodeNoopRefCount) IncRef() { +} + +// DecRef implements Inode.DecRef. +func (n *InodeNoopRefCount) DecRef() { +} + +// TryIncRef implements Inode.TryIncRef. +func (n *InodeNoopRefCount) TryIncRef() bool { + return true +} + +// Destroy implements Inode.Destroy. +func (n *InodeNoopRefCount) Destroy() { +} + +// InodeDirectoryNoNewChildren partially implements the Inode interface. +// InodeDirectoryNoNewChildren represents a directory inode which does not +// support creation of new children. +type InodeDirectoryNoNewChildren struct{} + +// NewFile implements Inode.NewFile. +func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewDir implements Inode.NewDir. +func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewLink implements Inode.NewLink. +func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// NewNode implements Inode.NewNode. +func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +// InodeNotDirectory partially implements the Inode interface, specifically the +// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not +// represent directories can embed this to provide no-op implementations for +// directory-related functions. +type InodeNotDirectory struct { +} + +// HasChildren implements Inode.HasChildren. +func (*InodeNotDirectory) HasChildren() bool { + return false +} + +// NewFile implements Inode.NewFile. +func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { + panic("NewFile called on non-directory inode") +} + +// NewDir implements Inode.NewDir. +func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { + panic("NewDir called on non-directory inode") +} + +// NewLink implements Inode.NewLinkink. +func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { + panic("NewLink called on non-directory inode") +} + +// NewSymlink implements Inode.NewSymlink. +func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + panic("NewSymlink called on non-directory inode") +} + +// NewNode implements Inode.NewNode. +func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + panic("NewNode called on non-directory inode") +} + +// Unlink implements Inode.Unlink. +func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { + panic("Unlink called on non-directory inode") +} + +// RmDir implements Inode.RmDir. +func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { + panic("RmDir called on non-directory inode") +} + +// Rename implements Inode.Rename. +func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { + panic("Rename called on non-directory inode") +} + +// Lookup implements Inode.Lookup. +func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + panic("Lookup called on non-directory inode") +} + +// Valid implements Inode.Valid. +func (*InodeNotDirectory) Valid(context.Context) bool { + return true +} + +// InodeNoDynamicLookup partially implements the Inode interface, specifically +// the inodeDynamicLookup sub interface. Directory inodes that do not support +// dymanic entries (i.e. entries that are not "hashed" into the +// vfs.Dentry.children) can embed this to provide no-op implementations for +// functions related to dynamic entries. +type InodeNoDynamicLookup struct{} + +// Lookup implements Inode.Lookup. +func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + return nil, syserror.ENOENT +} + +// Valid implements Inode.Valid. +func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { + return true +} + +// InodeNotSymlink partially implements the Inode interface, specifically the +// inodeSymlink sub interface. All inodes that are not symlinks may embed this +// to return the appropriate errors from symlink-related functions. +type InodeNotSymlink struct{} + +// Readlink implements Inode.Readlink. +func (*InodeNotSymlink) Readlink(context.Context) (string, error) { + return "", syserror.EINVAL +} + +// InodeAttrs partially implements the Inode interface, specifically the +// inodeMetadata sub interface. InodeAttrs provides functionality related to +// inode attributes. +// +// Must be initialized by Init prior to first use. +type InodeAttrs struct { + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 +} + +// Init initializes this InodeAttrs. +func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { + if mode.FileType() == 0 { + panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) + } + + nlink := uint32(1) + if mode.FileType() == linux.ModeDirectory { + nlink = 2 + } + atomic.StoreUint64(&a.ino, ino) + atomic.StoreUint32(&a.mode, uint32(mode)) + atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) + atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) + atomic.StoreUint32(&a.nlink, nlink) +} + +// Mode implements Inode.Mode. +func (a *InodeAttrs) Mode() linux.FileMode { + return linux.FileMode(atomic.LoadUint32(&a.mode)) +} + +// Stat partially implements Inode.Stat. Note that this function doesn't provide +// all the stat fields, and the embedder should consider extending the result +// with filesystem-specific fields. +func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { + var stat linux.Statx + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.Ino = atomic.LoadUint64(&a.ino) + stat.Mode = uint16(a.Mode()) + stat.UID = atomic.LoadUint32(&a.uid) + stat.GID = atomic.LoadUint32(&a.gid) + stat.Nlink = atomic.LoadUint32(&a.nlink) + + // TODO: Implement other stat fields like timestamps. + + return stat +} + +// SetStat implements Inode.SetStat. +func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + for { + old := atomic.LoadUint32(&a.mode) + new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT)) + if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped { + break + } + } + } + + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&a.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&a.gid, stat.GID) + } + + // Note that not all fields are modifiable. For example, the file type and + // inode numbers are immutable after node creation. + + // TODO: Implement other stat fields like timestamps. + + return nil +} + +// CheckPermissions implements Inode.CheckPermissions. +func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := a.Mode() + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + auth.KUID(atomic.LoadUint32(&a.uid)), + auth.KGID(atomic.LoadUint32(&a.gid)), + ) +} + +// IncLinks implements Inode.IncLinks. +func (a *InodeAttrs) IncLinks(n uint32) { + if atomic.AddUint32(&a.nlink, n) <= n { + panic("InodeLink.IncLinks called with no existing links") + } +} + +// DecLinks implements Inode.DecLinks. +func (a *InodeAttrs) DecLinks() { + if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) { + // Negative overflow + panic("Inode.DecLinks called at 0 links") + } +} + +type slot struct { + Name string + Dentry *vfs.Dentry + slotEntry +} + +// OrderedChildrenOptions contains initialization options for OrderedChildren. +type OrderedChildrenOptions struct { + // Writable indicates whether vfs.FilesystemImpl methods implemented by + // OrderedChildren may modify the tracked children. This applies to + // operations related to rename, unlink and rmdir. If an OrderedChildren is + // not writable, these operations all fail with EPERM. + Writable bool +} + +// OrderedChildren partially implements the Inode interface. OrderedChildren can +// be embedded in directory inodes to keep track of the children in the +// directory, and can then be used to implement a generic directory FD -- see +// GenericDirectoryFD. OrderedChildren is not compatible with dynamic +// directories. +// +// Must be initialize with Init before first use. +type OrderedChildren struct { + refs.AtomicRefCount + + // Can children be modified by user syscalls? It set to false, interface + // methods that would modify the children return EPERM. Immutable. + writable bool + + mu sync.RWMutex + order slotList + set map[string]*slot +} + +// Init initializes an OrderedChildren. +func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { + o.writable = opts.Writable + o.set = make(map[string]*slot) +} + +// DecRef implements Inode.DecRef. +func (o *OrderedChildren) DecRef() { + o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +} + +// Destroy cleans up resources referenced by this OrderedChildren. +func (o *OrderedChildren) Destroy() { + o.mu.Lock() + defer o.mu.Unlock() + o.order.Reset() + o.set = nil +} + +// Populate inserts children into this OrderedChildren, and d's dentry +// cache. Populate returns the number of directories inserted, which the caller +// may use to update the link count for the parent directory. +// +// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory +// inode. children must not contain any conflicting entries already in o. +func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { + var links uint32 + for name, child := range children { + if child.isDir() { + links++ + } + if err := o.Insert(name, child.VFSDentry()); err != nil { + panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) + } + d.InsertChild(name, child.VFSDentry()) + } + return links +} + +// HasChildren implements Inode.HasChildren. +func (o *OrderedChildren) HasChildren() bool { + o.mu.RLock() + defer o.mu.RUnlock() + return len(o.set) > 0 +} + +// Insert inserts child into o. This ignores the writability of o, as this is +// not part of the vfs.FilesystemImpl interface, and is a lower-level operation. +func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error { + o.mu.Lock() + defer o.mu.Unlock() + if _, ok := o.set[name]; ok { + return syserror.EEXIST + } + s := &slot{ + Name: name, + Dentry: child, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) removeLocked(name string) { + if s, ok := o.set[name]; ok { + delete(o.set, name) + o.order.Remove(s) + } +} + +// Precondition: caller must hold o.mu for writing. +func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry { + if s, ok := o.set[name]; ok { + // Existing slot with given name, simply replace the dentry. + var old *vfs.Dentry + old, s.Dentry = s.Dentry, new + return old + } + + // No existing slot with given name, create and hash new slot. + s := &slot{ + Name: name, + Dentry: new, + } + o.order.PushBack(s) + o.set[name] = s + return nil +} + +// Precondition: caller must hold o.mu for reading or writing. +func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error { + s, ok := o.set[name] + if !ok { + return syserror.ENOENT + } + if s.Dentry != child { + panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child)) + } + return nil +} + +// Unlink implements Inode.Unlink. +func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error { + if !o.writable { + return syserror.EPERM + } + o.mu.Lock() + defer o.mu.Unlock() + if err := o.checkExistingLocked(name, child); err != nil { + return err + } + o.removeLocked(name) + return nil +} + +// Rmdir implements Inode.Rmdir. +func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error { + // We're not responsible for checking that child is a directory, that it's + // empty, or updating any link counts; so this is the same as unlink. + return o.Unlink(ctx, name, child) +} + +type renameAcrossDifferentImplementationsError struct{} + +func (renameAcrossDifferentImplementationsError) Error() string { + return "rename across inodes with different implementations" +} + +// Rename implements Inode.Rename. +// +// Precondition: Rename may only be called across two directory inodes with +// identical implementations of Rename. Practically, this means filesystems that +// implement Rename by embedding OrderedChildren for any directory +// implementation must use OrderedChildren for all directory implementations +// that will support Rename. +// +// Postcondition: reference on any replaced dentry transferred to caller. +func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) { + dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren) + if !ok { + return nil, renameAcrossDifferentImplementationsError{} + } + if !o.writable || !dst.writable { + return nil, syserror.EPERM + } + // Note: There's a potential deadlock below if concurrent calls to Rename + // refer to the same src and dst directories in reverse. We avoid any + // ordering issues because the caller is required to serialize concurrent + // calls to Rename in accordance with the interface declaration. + o.mu.Lock() + defer o.mu.Unlock() + if dst != o { + dst.mu.Lock() + defer dst.mu.Unlock() + } + if err := o.checkExistingLocked(oldname, child); err != nil { + return nil, err + } + replaced := dst.replaceChildLocked(newname, child) + return replaced, nil +} + +// nthLocked returns an iterator to the nth child tracked by this object. The +// iterator is valid until the caller releases o.mu. Returns nil if the +// requested index falls out of bounds. +// +// Preconditon: Caller must hold o.mu for reading. +func (o *OrderedChildren) nthLocked(i int64) *slot { + for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { + if i == 0 { + return it + } + i-- + } + return nil +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go new file mode 100644 index 000000000..bb01c3d01 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -0,0 +1,405 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package kernfs provides the tools to implement inode-based filesystems. +// Kernfs has two main features: +// +// 1. The Inode interface, which maps VFS2's path-based filesystem operations to +// specific filesystem nodes. Kernfs uses the Inode interface to provide a +// blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as +// the synchronization mechanism for all filesystem operations by holding a +// filesystem-wide lock across all operations. +// +// 2. Various utility types which provide generic implementations for various +// parts of the Inode and vfs.FileDescription interfaces. Client filesystems +// based on kernfs can embed the appropriate set of these to avoid having to +// reimplement common filesystem operations. See inode_impl_util.go and +// fd_impl_util.go. +// +// Reference Model: +// +// Kernfs dentries represents named pointers to inodes. Dentries and inode have +// independent lifetimes and reference counts. A child dentry unconditionally +// holds a reference on its parent directory's dentry. A dentry also holds a +// reference on the inode it points to. Multiple dentries can point to the same +// inode (for example, in the case of hardlinks). File descriptors hold a +// reference to the dentry they're opened on. +// +// Dentries are guaranteed to exist while holding Filesystem.mu for +// reading. Dropping dentries require holding Filesystem.mu for writing. To +// queue dentries for destruction from a read critical section, see +// Filesystem.deferDecRef. +// +// Lock ordering: +// +// kernfs.Filesystem.mu +// kernfs.Dentry.dirMu +// vfs.VirtualFilesystem.mountMu +// vfs.Dentry.mu +// kernfs.Filesystem.droppedDentriesMu +// (inode implementation locks, if any) +package kernfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory +// filesystem. Concrete implementations are expected to embed this in their own +// Filesystem type. +type Filesystem struct { + vfsfs vfs.Filesystem + + droppedDentriesMu sync.Mutex + + // droppedDentries is a list of dentries waiting to be DecRef()ed. This is + // used to defer dentry destruction until mu can be acquired for + // writing. Protected by droppedDentriesMu. + droppedDentries []*vfs.Dentry + + // mu synchronizes the lifetime of Dentries on this filesystem. Holding it + // for reading guarantees continued existence of any resolved dentries, but + // the dentry tree may be modified. + // + // Kernfs dentries can only be DecRef()ed while holding mu for writing. For + // example: + // + // fs.mu.Lock() + // defer fs.mu.Unlock() + // ... + // dentry1.DecRef() + // defer dentry2.DecRef() // Ok, will run before Unlock. + // + // If discarding dentries in a read context, use Filesystem.deferDecRef. For + // example: + // + // fs.mu.RLock() + // fs.mu.processDeferredDecRefs() + // defer fs.mu.RUnlock() + // ... + // fs.deferDecRef(dentry) + mu sync.RWMutex + + // nextInoMinusOne is used to to allocate inode numbers on this + // filesystem. Must be accessed by atomic operations. + nextInoMinusOne uint64 +} + +// deferDecRef defers dropping a dentry ref until the next call to +// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. +// +// Precondition: d must not already be pending destruction. +func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { + fs.droppedDentriesMu.Lock() + fs.droppedDentries = append(fs.droppedDentries, d) + fs.droppedDentriesMu.Unlock() +} + +// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the +// droppedDentries list. See comment on Filesystem.mu. +func (fs *Filesystem) processDeferredDecRefs() { + fs.mu.Lock() + fs.processDeferredDecRefsLocked() + fs.mu.Unlock() +} + +// Precondition: fs.mu must be held for writing. +func (fs *Filesystem) processDeferredDecRefsLocked() { + fs.droppedDentriesMu.Lock() + for _, d := range fs.droppedDentries { + d.DecRef() + } + fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. + fs.droppedDentriesMu.Unlock() +} + +// Init initializes a kernfs filesystem. This should be called from during +// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding +// kernfs. +func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) { + fs.vfsfs.Init(vfsObj, fs) +} + +// VFSFilesystem returns the generic vfs filesystem object. +func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { + return &fs.vfsfs +} + +// NextIno allocates a new inode number on this filesystem. +func (fs *Filesystem) NextIno() uint64 { + return atomic.AddUint64(&fs.nextInoMinusOne, 1) +} + +// These consts are used in the Dentry.flags field. +const ( + // Dentry points to a directory inode. + dflagsIsDir = 1 << iota + + // Dentry points to a symlink inode. + dflagsIsSymlink +) + +// Dentry implements vfs.DentryImpl. +// +// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a +// named reference to an inode. A dentry generally lives as long as it's part of +// a mounted filesystem tree. Kernfs doesn't cache dentries once all references +// to them are removed. Dentries hold a single reference to the inode they point +// to, and child dentries hold a reference on their parent. +// +// Must be initialized by Init prior to first use. +type Dentry struct { + refs.AtomicRefCount + + vfsd vfs.Dentry + inode Inode + + refs uint64 + + // flags caches useful information about the dentry from the inode. See the + // dflags* consts above. Must be accessed by atomic ops. + flags uint32 + + // dirMu protects vfsd.children for directory dentries. + dirMu sync.Mutex +} + +// Init initializes this dentry. +// +// Precondition: Caller must hold a reference on inode. +// +// Postcondition: Caller's reference on inode is transferred to the dentry. +func (d *Dentry) Init(inode Inode) { + d.vfsd.Init(d) + d.inode = inode + ftype := inode.Mode().FileType() + if ftype == linux.ModeDirectory { + d.flags |= dflagsIsDir + } + if ftype == linux.ModeSymlink { + d.flags |= dflagsIsSymlink + } +} + +// VFSDentry returns the generic vfs dentry for this kernfs dentry. +func (d *Dentry) VFSDentry() *vfs.Dentry { + return &d.vfsd +} + +// isDir checks whether the dentry points to a directory inode. +func (d *Dentry) isDir() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0 +} + +// isSymlink checks whether the dentry points to a symlink inode. +func (d *Dentry) isSymlink() bool { + return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0 +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef() { + d.AtomicRefCount.DecRefWithDestructor(d.destroy) +} + +// Precondition: Dentry must be removed from VFS' dentry cache. +func (d *Dentry) destroy() { + d.inode.DecRef() // IncRef from Init. + d.inode = nil + if parent := d.vfsd.Parent(); parent != nil { + parent.DecRef() // IncRef from Dentry.InsertChild. + } +} + +// InsertChild inserts child into the vfs dentry cache with the given name under +// this dentry. This does not update the directory inode, so calling this on +// it's own isn't sufficient to insert a child into a directory. InsertChild +// updates the link count on d if required. +// +// Precondition: d must represent a directory inode. +func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { + if !d.isDir() { + panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) + } + vfsDentry := d.VFSDentry() + vfsDentry.IncRef() // DecRef in child's Dentry.destroy. + d.dirMu.Lock() + vfsDentry.InsertChild(child, name) + d.dirMu.Unlock() +} + +// The Inode interface maps filesystem-level operations that operate on paths to +// equivalent operations on specific filesystem nodes. +// +// The interface methods are groups into logical categories as sub interfaces +// below. Generally, an implementation for each sub interface can be provided by +// embedding an appropriate type from inode_impl_utils.go. The sub interfaces +// are purely organizational. Methods declared directly in the main interface +// have no generic implementations, and should be explicitly provided by the +// client filesystem. +// +// Generally, implementations are not responsible for tasks that are common to +// all filesystems. These include: +// +// - Checking that dentries passed to methods are of the appropriate file type. +// - Checking permissions. +// - Updating link and reference counts. +// +// Specific responsibilities of implementations are documented below. +type Inode interface { + // Methods related to reference counting. A generic implementation is + // provided by InodeNoopRefCount. These methods are generally called by the + // equivalent Dentry methods. + inodeRefs + + // Methods related to node metadata. A generic implementation is provided by + // InodeAttrs. + inodeMetadata + + // Method for inodes that represent symlink. InodeNotSymlink provides a + // blanket implementation for all non-symlink inodes. + inodeSymlink + + // Method for inodes that represent directories. InodeNotDirectory provides + // a blanket implementation for all non-directory inodes. + inodeDirectory + + // Method for inodes that represent dynamic directories and their + // children. InodeNoDynamicLookup provides a blanket implementation for all + // non-dynamic-directory inodes. + inodeDynamicLookup + + // Open creates a file description for the filesystem object represented by + // this inode. The returned file description should hold a reference on the + // inode for its lifetime. + // + // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) +} + +type inodeRefs interface { + IncRef() + DecRef() + TryIncRef() bool + // Destroy is called when the inode reaches zero references. Destroy release + // all resources (references) on objects referenced by the inode, including + // any child dentries. + Destroy() +} + +type inodeMetadata interface { + // CheckPermissions checks that creds may access this inode for the + // requested access type, per the the rules of + // fs/namei.c:generic_permission(). + CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error + + // Mode returns the (struct stat)::st_mode value for this inode. This is + // separated from Stat for performance. + Mode() linux.FileMode + + // Stat returns the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.StatAt. + Stat(fs *vfs.Filesystem) linux.Statx + + // SetStat updates the metadata for this inode. This corresponds to + // vfs.FilesystemImpl.SetStatAt. + SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error +} + +// Precondition: All methods in this interface may only be called on directory +// inodes. +type inodeDirectory interface { + // The New{File,Dir,Node,Symlink} methods below should return a new inode + // hashed into this inode. + // + // These inode constructors are inode-level operations rather than + // filesystem-level operations to allow client filesystems to mix different + // implementations based on the new node's location in the + // filesystem. + + // HasChildren returns true if the directory inode has any children. + HasChildren() bool + + // NewFile creates a new regular file inode. + NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) + + // NewDir creates a new directory inode. + NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) + + // NewLink creates a new hardlink to a specified inode in this + // directory. Implementations should create a new kernfs Dentry pointing to + // target, and update target's link count. + NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) + + // NewSymlink creates a new symbolic link inode. + NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) + + // NewNode creates a new filesystem node for a mknod syscall. + NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) + + // Unlink removes a child dentry from this directory inode. + Unlink(ctx context.Context, name string, child *vfs.Dentry) error + + // RmDir removes an empty child directory from this directory + // inode. Implementations must update the parent directory's link count, + // if required. Implementations are not responsible for checking that child + // is a directory, checking for an empty directory. + RmDir(ctx context.Context, name string, child *vfs.Dentry) error + + // Rename is called on the source directory containing an inode being + // renamed. child should point to the resolved child in the source + // directory. If Rename replaces a dentry in the destination directory, it + // should return the replaced dentry or nil otherwise. + // + // Precondition: Caller must serialize concurrent calls to Rename. + Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) +} + +type inodeDynamicLookup interface { + // Lookup should return an appropriate dentry if name should resolve to a + // child of this dynamic directory inode. This gives the directory an + // opportunity on every lookup to resolve additional entries that aren't + // hashed into the directory. This is only called when the inode is a + // directory. If the inode is not a directory, or if the directory only + // contains a static set of children, the implementer can unconditionally + // return an appropriate error (ENOTDIR and ENOENT respectively). + // + // The child returned by Lookup will be hashed into the VFS dentry tree. Its + // lifetime can be controlled by the filesystem implementation with an + // appropriate implementation of Valid. + // + // Lookup returns the child with an extra reference and the caller owns this + // reference. + Lookup(ctx context.Context, name string) (*vfs.Dentry, error) + + // Valid should return true if this inode is still valid, or needs to + // be resolved again by a call to Lookup. + Valid(ctx context.Context) bool +} + +type inodeSymlink interface { + // Readlink resolves the target of a symbolic link. If an inode is not a + // symlink, the implementation should return EINVAL. + Readlink(ctx context.Context) (string, error) +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go new file mode 100644 index 000000000..f78bb7b04 --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -0,0 +1,423 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs_test + +import ( + "bytes" + "fmt" + "io" + "runtime" + "sync" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const defaultMode linux.FileMode = 01777 +const staticFileContent = "This is sample content for a static test file." + +// RootDentryFn is a generator function for creating the root dentry of a test +// filesystem. See newTestSystem. +type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry + +// TestSystem represents the context for a single test. +type TestSystem struct { + t *testing.T + ctx context.Context + creds *auth.Credentials + vfs *vfs.VirtualFilesystem + mns *vfs.MountNamespace + root vfs.VirtualDentry +} + +// newTestSystem sets up a minimal environment for running a test, including an +// instance of a test filesystem. Tests can control the contents of the +// filesystem by providing an appropriate rootFn, which should return a +// pre-populated root dentry. +func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + v := vfs.New() + v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}) + mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create testfs root mount: %v", err) + } + + s := &TestSystem{ + t: t, + ctx: ctx, + creds: creds, + vfs: v, + mns: mns, + root: mns.Root(), + } + runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() }) + return s +} + +// PathOpAtRoot constructs a vfs.PathOperation for a path from the +// root of the test filesystem. +// +// Precondition: path should be relative path. +func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation { + return vfs.PathOperation{ + Root: s.root, + Start: s.root, + Pathname: path, + } +} + +// GetDentryOrDie attempts to resolve a dentry referred to by the +// provided path operation. If unsuccessful, the test fails. +func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) + } + return vd +} + +func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) { + buf := make([]byte, usermem.PageSize) + bufIOSeq := usermem.BytesIOSequence(buf) + opts := vfs.ReadOptions{} + + var content bytes.Buffer + for { + n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts) + if n == 0 || err != nil { + if err == io.EOF { + err = nil + } + return content.String(), err + } + content.Write(buf[:n]) + } +} + +type fsType struct { + rootFn RootDentryFn +} + +type filesystem struct { + kernfs.Filesystem +} + +type file struct { + kernfs.DynamicBytesFile + content string +} + +func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { + f := &file{} + f.content = content + f.DynamicBytesFile.Init(creds, fs.NextIno(), f) + + d := &kernfs.Dentry{} + d.Init(f) + return d +} + +func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%s", f.content) + return nil +} + +type attrs struct { + kernfs.InodeAttrs +} + +func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error { + return syserror.EPERM +} + +type readonlyDir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + kernfs.InodeDirectoryNoNewChildren + + kernfs.OrderedChildren + dentry kernfs.Dentry +} + +func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &readonlyDir{} + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +type dir struct { + attrs + kernfs.InodeNotSymlink + kernfs.InodeNoDynamicLookup + + fs *filesystem + dentry kernfs.Dentry + kernfs.OrderedChildren +} + +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &dir{} + dir.fs = fs + dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) + dir.dentry.Init(dir) + + dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents)) + + return &dir.dentry +} + +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + dir := d.fs.newDir(creds, opts.Mode, nil) + dirVFSD := dir.VFSDentry() + if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { + dir.DecRef() + return nil, err + } + d.IncLinks(1) + return dirVFSD, nil +} + +func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) { + creds := auth.CredentialsFromContext(ctx) + f := d.fs.newFile(creds, "") + fVFSD := f.VFSDentry() + if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { + f.DecRef() + return nil, err + } + return fVFSD, nil +} + +func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { + return nil, syserror.EPERM +} + +func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj) + root := fst.rootFn(creds, fs) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// -------------------- Remainder of the file are test cases -------------------- + +func TestBasic(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() +} + +func TestMkdirGetDentry(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/a new directory") + if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { + t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) + } + sys.GetDentryOrDie(pop).DecRef() +} + +func TestReadStaticFile(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("file1") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + content, err := sys.ReadToEnd(fd) + if err != nil { + sys.t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(staticFileContent, content); diff != "" { + sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } +} + +func TestCreateNewFileInStaticDir(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir1": fs.newDir(creds, 0755, nil), + }) + }) + + pop := sys.PathOpAtRoot("dir1/newfile") + opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) + } + + // Close the file. The file should persist. + fd.DecRef() + + fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + fd.DecRef() +} + +// direntCollector provides an implementation for vfs.IterDirentsCallback for +// testing. It simply iterates to the end of a given directory FD and collects +// all dirents emitted by the callback. +type direntCollector struct { + mu sync.Mutex + dirents map[string]vfs.Dirent +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (d *direntCollector) Handle(dirent vfs.Dirent) bool { + d.mu.Lock() + if d.dirents == nil { + d.dirents = make(map[string]vfs.Dirent) + } + d.dirents[dirent.Name] = dirent + d.mu.Unlock() + return true +} + +// count returns the number of dirents currently in the collector. +func (d *direntCollector) count() int { + d.mu.Lock() + defer d.mu.Unlock() + return len(d.dirents) +} + +// contains checks whether the collector has a dirent with the given name and +// type. +func (d *direntCollector) contains(name string, typ uint8) error { + d.mu.Lock() + defer d.mu.Unlock() + dirent, ok := d.dirents[name] + if !ok { + return fmt.Errorf("No dirent named %q found", name) + } + if dirent.Type != typ { + return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent) + } + return nil +} + +func TestDirFDReadWrite(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, nil) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + // Read/Write should fail for directory FDs. + if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err) + } + if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err) + } +} + +func TestDirFDIterDirents(t *testing.T) { + sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { + return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{ + // Fill root with nodes backed by various inode implementations. + "dir1": fs.newReadonlyDir(creds, 0755, nil), + "dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{ + "dir3": fs.newDir(creds, 0755, nil), + }), + "file1": fs.newFile(creds, staticFileContent), + }) + }) + + pop := sys.PathOpAtRoot("/") + fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + if err != nil { + sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + collector := &direntCollector{} + if err := fd.IterDirents(sys.ctx, collector); err != nil { + sys.t.Fatalf("IterDirent failed: %v", err) + } + + // Root directory should contain ".", ".." and 3 children: + if collector.count() != 5 { + sys.t.Fatalf("IterDirent returned too many dirents") + } + for _, dirName := range []string{".", "..", "dir1", "dir2"} { + if err := collector.contains(dirName, linux.DT_DIR); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + } + if err := collector.contains("file1", linux.DT_REG); err != nil { + sys.t.Fatalf("IterDirent had unexpected results: %v", err) + } + +} -- cgit v1.2.3 From 334a513f11f0ecc260abcce549b1f1a74edc2c51 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 18 Dec 2019 12:59:32 -0800 Subject: Add Mems_allowed to /proc/PID/status PiperOrigin-RevId: 286248378 --- pkg/sentry/fs/proc/task.go | 4 ++++ pkg/sentry/fsimpl/proc/task.go | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 0e46c5fb7..9bf4b4527 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -604,6 +604,10 @@ func (s *statusData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ( fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(&buf, "Mems_allowed:\t1\n") + fmt.Fprintf(&buf, "Mems_allowed_list:\t0\n") return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0 } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index c46e05c3a..0d87be52b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -229,6 +229,10 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") return nil } -- cgit v1.2.3 From 744401297a8c93ce5992ba99aa84f3dcdc19ae9e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 18 Dec 2019 15:47:24 -0800 Subject: Add VFS2 plumbing for extended attributes. PiperOrigin-RevId: 286281274 --- pkg/sentry/fsimpl/ext/filesystem.go | 36 ++++++++++++ pkg/sentry/fsimpl/kernfs/filesystem.go | 52 +++++++++++++++++ pkg/sentry/fsimpl/memfs/filesystem.go | 48 +++++++++++++++ pkg/sentry/vfs/file_description.go | 49 +++++++++++++++- pkg/sentry/vfs/file_description_impl_util.go | 25 ++++++++ pkg/sentry/vfs/filesystem.go | 16 ++++- pkg/sentry/vfs/options.go | 14 +++++ pkg/sentry/vfs/testutil.go | 20 +++++++ pkg/sentry/vfs/vfs.go | 87 ++++++++++++++++++++++++++++ 9 files changed, 345 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index e7aa3b41b..d7e87979a 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -443,6 +443,42 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return "", err + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + _, _, err := fs.walk(rp, false) + if err != nil { + return err + } + return syserror.ENOTSUP +} + // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index db486b6c1..3cbbe4b20 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -683,6 +683,58 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + // kernfs currently does not support extended attributes. + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return "", err + } + // kernfs currently does not support extended attributes. + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return err + } + // kernfs currently does not support extended attributes. + return syserror.ENOTSUP +} + // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 1f2a5122a..22f1e811f 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -584,6 +584,54 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + // TODO(b/127675828): support extended attributes + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return "", err + } + // TODO(b/127675828): support extended attributes + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, _, err := walkExistingLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + // PrependPath implements vfs.FilesystemImpl.PrependPath. func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 6575afd16..c5a9adca3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) @@ -212,7 +213,21 @@ type FileDescriptionImpl interface { // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) - // TODO: extended attributes; file locking + // Listxattr returns all extended attribute names for the file. + Listxattr(ctx context.Context) ([]string, error) + + // Getxattr returns the value associated with the given extended attribute + // for the file. + Getxattr(ctx context.Context, name string) (string, error) + + // Setxattr changes the value associated with the given extended attribute + // for the file. + Setxattr(ctx context.Context, opts SetxattrOptions) error + + // Removexattr removes the given extended attribute from the file. + Removexattr(ctx context.Context, name string) error + + // TODO: file locking } // Dirent holds the information contained in struct linux_dirent64. @@ -329,6 +344,38 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. return fd.impl.Ioctl(ctx, uio, args) } +// Listxattr returns all extended attribute names for the file represented by +// fd. +func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { + names, err := fd.impl.Listxattr(ctx) + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by default + // don't exist. + return nil, nil + } + return names, err +} + +// Getxattr returns the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { + return fd.impl.Getxattr(ctx, name) +} + +// Setxattr changes the value associated with the given extended attribute for +// the file represented by fd. +func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return fd.impl.Setxattr(ctx, opts) +} + +// Removexattr removes the given extended attribute from the file represented +// by fd. +func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + return fd.impl.Removexattr(ctx, name) +} + // SyncFS instructs the filesystem containing fd to execute the semantics of // syncfs(2). func (fd *FileDescription) SyncFS(ctx context.Context) error { diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index aae023254..3df49991c 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg return 0, syserror.ENOTTY } +// Listxattr implements FileDescriptionImpl.Listxattr analogously to +// inode_operations::listxattr == NULL in Linux. +func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) { + // This isn't exactly accurate; see FileDescription.Listxattr. + return nil, syserror.ENOTSUP +} + +// Getxattr implements FileDescriptionImpl.Getxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) { + return "", syserror.ENOTSUP +} + +// Setxattr implements FileDescriptionImpl.Setxattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error { + return syserror.ENOTSUP +} + +// Removexattr implements FileDescriptionImpl.Removexattr analogously to +// inode::i_opflags & IOP_XATTR == 0 in Linux. +func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 8011eba3f..b766614e7 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -186,6 +186,20 @@ type FilesystemImpl interface { // UnlinkAt removes the non-directory file at rp. UnlinkAt(ctx context.Context, rp *ResolvingPath) error + // ListxattrAt returns all extended attribute names for the file at rp. + ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) + + // GetxattrAt returns the value associated with the given extended + // attribute for the file at rp. + GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) + + // SetxattrAt changes the value associated with the given extended + // attribute for the file at rp. + SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error + + // RemovexattrAt removes the given extended attribute from the file at rp. + RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + // PrependPath prepends a path from vd to vd.Mount().Root() to b. // // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered @@ -208,7 +222,7 @@ type FilesystemImpl interface { // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - // TODO: extended attributes; inotify_add_watch(); bind() + // TODO: inotify_add_watch(); bind() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 3ecbc8fc1..97ee4a446 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -101,6 +101,20 @@ type SetStatOptions struct { Stat linux.Statx } +// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), +// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and +// FileDescriptionImpl.Setxattr(). +type SetxattrOptions struct { + // Name is the name of the extended attribute being mutated. + Name string + + // Value is the extended attribute's new value. + Value string + + // Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2). + Flags uint32 +} + // StatOptions contains options to VirtualFilesystem.StatAt(), // FilesystemImpl.StatAt(), FileDescription.Stat(), and // FileDescriptionImpl.Stat(). diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index 7a1d9e383..d94117bce 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -117,6 +117,26 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err return syserror.EPERM } +// ListxattrAt implements FilesystemImpl.ListxattrAt. +func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) { + return nil, syserror.EPERM +} + +// GetxattrAt implements FilesystemImpl.GetxattrAt. +func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) { + return "", syserror.EPERM +} + +// SetxattrAt implements FilesystemImpl.SetxattrAt. +func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error { + return syserror.EPERM +} + +// RemovexattrAt implements FilesystemImpl.RemovexattrAt. +func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error { + return syserror.EPERM +} + // PrependPath implements FilesystemImpl.PrependPath. func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry))) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 7262b0d0a..e60898d7c 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -440,6 +440,93 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti } } +// ListxattrAt returns all extended attribute names for the file at the given +// path. +func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + for { + names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return names, nil + } + if err == syserror.ENOTSUP { + // Linux doesn't actually return ENOTSUP in this case; instead, + // fs/xattr.c:vfs_listxattr() falls back to allowing the security + // subsystem to return security extended attributes, which by + // default don't exist. + vfs.putResolvingPath(rp) + return nil, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// GetxattrAt returns the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return "", err + } + for { + val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return val, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return "", err + } + } +} + +// SetxattrAt changes the value associated with the given extended attribute +// for the file at the given path. +func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// RemovexattrAt removes the given extended attribute from the file at rp. +func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + // SyncAllFilesystems has the semantics of Linux's sync(2). func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { fss := make(map[*Filesystem]struct{}) -- cgit v1.2.3 From 3eb489ed6c67b069bc135ab92cb031ce80b40d8f Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 20 Dec 2019 11:52:24 -0800 Subject: Move VFS2 file description status flags to vfs.FileDescription. PiperOrigin-RevId: 286616668 --- pkg/sentry/fsimpl/ext/file_description.go | 19 --- pkg/sentry/fsimpl/ext/inode.go | 9 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 16 +-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 16 +-- pkg/sentry/fsimpl/memfs/filesystem.go | 11 +- pkg/sentry/fsimpl/memfs/memfs.go | 14 --- pkg/sentry/fsimpl/memfs/named_pipe.go | 2 +- pkg/sentry/vfs/file_description.go | 141 +++++++++++++++------- pkg/sentry/vfs/file_description_impl_util_test.go | 2 +- 9 files changed, 107 insertions(+), 123 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 5eca2b83f..841274daf 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,13 +26,6 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - // flags is the same as vfs.OpenOptions.Flags which are passed to - // vfs.FilesystemImpl.OpenAt. - // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2), - // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set. - // Only close(2), fstat(2), fstatfs(2) should work. - flags uint32 } func (fd *fileDescription) filesystem() *filesystem { @@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 24249525c..b2cc826c7 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,10 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -169,10 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { @@ -180,10 +178,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.ELOOP } var fd symlinkFD - fd.flags = flags mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 30c06baf0..51102ce48 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -65,17 +65,15 @@ type DynamicBytesFD struct { vfsfd vfs.FileDescription inode Inode - flags uint32 } // Init initializes a DynamicBytesFD. func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - fd.flags = flags fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) - fd.vfsfd.Init(fd, m, d) + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -117,15 +115,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } - -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index d6c18937a..bd402330f 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -39,7 +39,6 @@ type GenericDirectoryFD struct { vfsfd vfs.FileDescription children *OrderedChildren - flags uint32 off int64 } @@ -48,8 +47,7 @@ func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *Ordere m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.children = children - fd.flags = flags - fd.vfsfd.Init(fd, m, d) + fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } // VFSFileDescription returns a pointer to the vfs.FileDescription representing @@ -180,18 +178,6 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int return offset, nil } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 22f1e811f..af4389459 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -282,9 +282,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Filter out flags that are not supported by memfs. O_DIRECTORY and // O_NOFOLLOW have no effect here (they're handled by VFS by setting - // appropriate bits in rp), but are returned by - // FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by - // pipes. + // appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK + // is supported only by pipes. opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK if opts.Flags&linux.O_CREAT == 0 { @@ -384,7 +383,6 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr switch impl := i.impl.(type) { case *regularFile: var fd regularFileFD - fd.flags = flags fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) if fd.writable { @@ -395,7 +393,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data = impl.data[:0] @@ -411,8 +409,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr var fd directoryFD mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) - fd.flags = flags + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index 4cb2a4e0f..9d509f6e4 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -261,8 +261,6 @@ func (i *inode) direntType() uint8 { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - - flags uint32 // status flags; immutable } func (fd *fileDescription) filesystem() *filesystem { @@ -273,18 +271,6 @@ func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } -// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. -func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { - return fd.flags, nil -} - -// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. -func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - // None of the flags settable by fcntl(F_SETFL) are supported, so this is a - // no-op. - return nil -} - // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go index 91cb4b1fc..d5060850e 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -57,6 +57,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v mnt := rp.Mount() mnt.IncRef() vfsd.IncRef() - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index c5a9adca3..df03886c3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" @@ -39,49 +40,43 @@ type FileDescription struct { // operations. refs int64 + // statusFlags contains status flags, "initialized by open(2) and possibly + // modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic + // memory operations. + statusFlags uint32 + // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry + opts FileDescriptionOptions + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl } +// FileDescriptionOptions contains options to FileDescription.Init(). +type FileDescriptionOptions struct { + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is + // usually only the case if O_DIRECT would actually have an effect. + AllowDirectIO bool +} + // Init must be called before first use of fd. It takes ownership of references -// on mnt and d held by the caller. -func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { +// on mnt and d held by the caller. statusFlags is the initial file description +// status flags, which is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { fd.refs = 1 + fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ mount: mnt, dentry: d, } + fd.opts = *opts fd.impl = impl } -// Impl returns the FileDescriptionImpl associated with fd. -func (fd *FileDescription) Impl() FileDescriptionImpl { - return fd.impl -} - -// Mount returns the mount on which fd was opened. It does not take a reference -// on the returned Mount. -func (fd *FileDescription) Mount() *Mount { - return fd.vd.mount -} - -// Dentry returns the dentry at which fd was opened. It does not take a -// reference on the returned Dentry. -func (fd *FileDescription) Dentry() *Dentry { - return fd.vd.dentry -} - -// VirtualDentry returns the location at which fd was opened. It does not take -// a reference on the returned VirtualDentry. -func (fd *FileDescription) VirtualDentry() VirtualDentry { - return fd.vd -} - // IncRef increments fd's reference count. func (fd *FileDescription) IncRef() { atomic.AddInt64(&fd.refs, 1) @@ -113,6 +108,82 @@ func (fd *FileDescription) DecRef() { } } +// Mount returns the mount on which fd was opened. It does not take a reference +// on the returned Mount. +func (fd *FileDescription) Mount() *Mount { + return fd.vd.mount +} + +// Dentry returns the dentry at which fd was opened. It does not take a +// reference on the returned Dentry. +func (fd *FileDescription) Dentry() *Dentry { + return fd.vd.dentry +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// StatusFlags returns file description status flags, as for fcntl(F_GETFL). +func (fd *FileDescription) StatusFlags() uint32 { + return atomic.LoadUint32(&fd.statusFlags) +} + +// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). +func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { + // Compare Linux's fs/fcntl.c:setfl(). + oldFlags := fd.StatusFlags() + // Linux documents this check as "O_APPEND cannot be cleared if the file is + // marked as append-only and the file is open for write", which would make + // sense. However, the check as actually implemented seems to be "O_APPEND + // cannot be changed if the file is marked as append-only". + if (flags^oldFlags)&linux.O_APPEND != 0 { + stat, err := fd.impl.Stat(ctx, StatOptions{ + // There is no mask bit for stx_attributes. + Mask: 0, + // Linux just reads inode::i_flags directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { + return syserror.EPERM + } + } + if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { + stat, err := fd.impl.Stat(ctx, StatOptions{ + Mask: linux.STATX_UID, + // Linux's inode_owner_or_capable() just reads inode::i_uid + // directly. + Sync: linux.AT_STATX_DONT_SYNC, + }) + if err != nil { + return err + } + if stat.Mask&linux.STATX_UID == 0 { + return syserror.EPERM + } + if !CanActAsOwner(creds, auth.KUID(stat.UID)) { + return syserror.EPERM + } + } + if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { + return syserror.EINVAL + } + // TODO(jamieliu): FileDescriptionImpl.SetOAsync()? + const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK + atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) + return nil +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + // FileDescriptionImpl contains implementation details for an FileDescription. // Implementations of FileDescriptionImpl should contain their associated // FileDescription by value as their first field. @@ -132,14 +203,6 @@ type FileDescriptionImpl interface { // prevent the file descriptor from being closed. OnClose(ctx context.Context) error - // StatusFlags returns file description status flags, as for - // fcntl(F_GETFL). - StatusFlags(ctx context.Context) (uint32, error) - - // SetStatusFlags sets file description status flags, as for - // fcntl(F_SETFL). - SetStatusFlags(ctx context.Context, flags uint32) error - // Stat returns metadata for the file represented by the FileDescription. Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) @@ -264,18 +327,6 @@ func (fd *FileDescription) OnClose(ctx context.Context) error { return fd.impl.OnClose(ctx) } -// StatusFlags returns file description status flags, as for fcntl(F_GETFL). -func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { - flags, err := fd.impl.StatusFlags(ctx) - flags |= linux.O_LARGEFILE - return flags, err -} - -// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). -func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { - return fd.impl.SetStatusFlags(ctx, flags) -} - // Stat returns metadata for the file represented by fd. func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { return fd.impl.Stat(ctx, opts) diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index ac7799296..678be07fe 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -48,7 +48,7 @@ type genCountFD struct { func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription { var fd genCountFD - fd.vfsfd.Init(&fd, mnt, vfsd) + fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd) return &fd.vfsfd } -- cgit v1.2.3 From f45df7505b0e7baf48a37f7c625f05051d144738 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 23 Dec 2019 13:17:29 -0800 Subject: Clean up vfs.FilesystemImpl methods that operate on parent directories. - Make FilesystemImpl methods that operate on parent directories require !rp.Done() (i.e. there is at least one path component to resolve) as precondition and postcondition (in cases where they do not finish path resolution due to mount boundary / absolute symlink), and require that they do not need to follow the last path component (the file being created / deleted) as a symlink. Check for these in VFS. - Add FilesystemImpl.GetParentDentryAt(), which is required to obtain the old parent directory for VFS.RenameAt(). (Passing the Dentry to be renamed instead has the wrong semantics if the file named by the old path is a mount point since the Dentry will be on the wrong Mount.) - Update memfs to implement these methods correctly (?), including RenameAt. - Change fspath.Parse() to allow empty paths (to simplify implementation of AT_EMPTY_PATH). - Change vfs.PathOperation to take a fspath.Path instead of a raw pathname; non-test callers will need to fspath.Parse() pathnames themselves anyway in order to detect absolute paths and select PathOperation.Start accordingly. PiperOrigin-RevId: 286934941 --- pkg/fspath/BUILD | 2 - pkg/fspath/fspath.go | 24 +- pkg/fspath/fspath_test.go | 25 +- pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/benchmark/BUILD | 1 + pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 11 +- pkg/sentry/fsimpl/ext/ext_test.go | 9 +- pkg/sentry/fsimpl/ext/filesystem.go | 12 +- pkg/sentry/fsimpl/kernfs/BUILD | 1 + pkg/sentry/fsimpl/kernfs/filesystem.go | 138 +++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 7 +- pkg/sentry/fsimpl/memfs/BUILD | 2 + pkg/sentry/fsimpl/memfs/benchmark_test.go | 27 +- pkg/sentry/fsimpl/memfs/filesystem.go | 667 ++++++++++++---------- pkg/sentry/fsimpl/memfs/memfs.go | 29 +- pkg/sentry/fsimpl/memfs/pipe_test.go | 18 +- pkg/sentry/vfs/dentry.go | 29 +- pkg/sentry/vfs/file_description.go | 19 + pkg/sentry/vfs/filesystem.go | 251 +++++++- pkg/sentry/vfs/options.go | 3 + pkg/sentry/vfs/resolving_path.go | 46 +- pkg/sentry/vfs/testutil.go | 7 +- pkg/sentry/vfs/vfs.go | 259 ++++++--- pkg/syserror/syserror.go | 1 + 24 files changed, 1051 insertions(+), 538 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index 0c5f50397..ca540363c 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -14,7 +14,6 @@ go_library( "fspath.go", ], importpath = "gvisor.dev/gvisor/pkg/fspath", - deps = ["//pkg/syserror"], ) go_test( @@ -25,5 +24,4 @@ go_test( "fspath_test.go", ], embed = [":fspath"], - deps = ["//pkg/syserror"], ) diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go index f68752560..9fb3fee24 100644 --- a/pkg/fspath/fspath.go +++ b/pkg/fspath/fspath.go @@ -18,19 +18,17 @@ package fspath import ( "strings" - - "gvisor.dev/gvisor/pkg/syserror" ) const pathSep = '/' -// Parse parses a pathname as described by path_resolution(7). -func Parse(pathname string) (Path, error) { +// Parse parses a pathname as described by path_resolution(7), except that +// empty pathnames will be parsed successfully to a Path for which +// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is +// necessary to support AT_EMPTY_PATH.) +func Parse(pathname string) Path { if len(pathname) == 0 { - // "... POSIX decrees that an empty pathname must not be resolved - // successfully. Linux returns ENOENT in this case." - - // path_resolution(7) - return Path{}, syserror.ENOENT + return Path{} } // Skip leading path separators. i := 0 @@ -41,7 +39,7 @@ func Parse(pathname string) (Path, error) { return Path{ Absolute: true, Dir: true, - }, nil + } } } // Skip trailing path separators. This is required by Iterator.Next. This @@ -64,7 +62,7 @@ func Parse(pathname string) (Path, error) { }, Absolute: i != 0, Dir: j != len(pathname)-1, - }, nil + } } // Path contains the information contained in a pathname string. @@ -111,6 +109,12 @@ func (p Path) String() string { return b.String() } +// HasComponents returns true if p contains a non-zero number of path +// components. +func (p Path) HasComponents() bool { + return p.Begin.Ok() +} + // An Iterator represents either a path component in a Path or a terminal // iterator indicating that the end of the path has been reached. // diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go index 215b35622..d5e9a549a 100644 --- a/pkg/fspath/fspath_test.go +++ b/pkg/fspath/fspath_test.go @@ -18,15 +18,10 @@ import ( "reflect" "strings" "testing" - - "gvisor.dev/gvisor/pkg/syserror" ) func TestParseIteratorPartialPathnames(t *testing.T) { - path, err := Parse("/foo//bar///baz////") - if err != nil { - t.Fatalf("Parse failed: %v", err) - } + path := Parse("/foo//bar///baz////") // Parse strips leading slashes, and records their presence as // Path.Absolute. if !path.Absolute { @@ -70,6 +65,12 @@ func TestParse(t *testing.T) { dir bool } tests := []testCase{ + { + pathname: "", + relpath: []string{}, + abs: false, + dir: false, + }, { pathname: "/", relpath: []string{}, @@ -113,10 +114,7 @@ func TestParse(t *testing.T) { for _, test := range tests { t.Run(test.pathname, func(t *testing.T) { - p, err := Parse(test.pathname) - if err != nil { - t.Fatalf("failed to parse pathname %q: %v", test.pathname, err) - } + p := Parse(test.pathname) t.Logf("pathname %q => path %q", test.pathname, p) if p.Absolute != test.abs { t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs) @@ -134,10 +132,3 @@ func TestParse(t *testing.T) { }) } } - -func TestParseEmptyPathname(t *testing.T) { - p, err := Parse("") - if err != syserror.ENOENT { - t.Errorf("parsing empty pathname: got (%v, %v), wanted (, ENOENT)", p, err) - } -} diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 880b7bcd3..bc90330bc 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -74,6 +74,7 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index bfc46dfa6..4fc8296ef 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -7,6 +7,7 @@ go_test( size = "small", srcs = ["benchmark_test.go"], deps = [ + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/fsimpl/ext", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 177ce2cb9..2f46d2d13 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -24,6 +24,7 @@ import ( "strings" "testing" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" @@ -121,7 +122,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) { stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ Root: *root, Start: *root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { @@ -150,9 +151,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) mountPointName := "/1/" pop := vfs.PathOperation{ - Root: *root, - Start: *root, - Pathname: mountPointName, + Root: *root, + Start: *root, + Path: fspath.Parse(mountPointName), } // Save the mount point for later use. @@ -181,7 +182,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{ Root: *root, Start: *root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index e9f756732..5d6c999bd 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -25,6 +25,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" @@ -140,7 +141,7 @@ func TestSeek(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.OpenOptions{}, ) if err != nil { @@ -359,7 +360,7 @@ func TestStatAt(t *testing.T) { got, err := vfsfs.StatAt(ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.StatOptions{}, ) if err != nil { @@ -429,7 +430,7 @@ func TestRead(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)}, &vfs.OpenOptions{}, ) if err != nil { @@ -565,7 +566,7 @@ func TestIterDirents(t *testing.T) { fd, err := vfsfs.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path}, + &vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)}, &vfs.OpenOptions{}, ) if err != nil { diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index d7e87979a..616fc002a 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -275,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op return vfsd, nil } +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + vfsd, inode, err := fs.walk(rp, true) + if err != nil { + return nil, err + } + inode.incRef() + return vfsd, nil +} + // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { vfsd, inode, err := fs.walk(rp, false) @@ -378,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v } // RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { if rp.Done() { return syserror.ENOENT } diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 52596c090..59f7f39e2 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -49,6 +49,7 @@ go_test( deps = [ ":kernfs", "//pkg/abi/linux", + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3cbbe4b20..a6f9fced5 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -44,39 +44,37 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP return nil, err } afterSymlink: + name := rp.Component() + // Revalidation must be skipped if name is "." or ".."; d or its parent + // respectively can't be expected to transition from invalidated back to + // valid, so detecting invalidation and retrying would loop forever. This + // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast() + // calls d_revalidate(), but walk_component() => handle_dots() does not. + if name == "." { + rp.Advance() + return vfsd, nil + } + if name == ".." { + nextVFSD, err := rp.ResolveParent(vfsd) + if err != nil { + return nil, err + } + rp.Advance() + return nextVFSD, nil + } d.dirMu.Lock() - nextVFSD, err := rp.ResolveComponent(vfsd) - d.dirMu.Unlock() + nextVFSD, err := rp.ResolveChild(vfsd, name) if err != nil { + d.dirMu.Unlock() return nil, err } - if nextVFSD != nil { - // Cached dentry exists, revalidate. - next := nextVFSD.Impl().(*Dentry) - if !next.inode.Valid(ctx) { - d.dirMu.Lock() - rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD) - d.dirMu.Unlock() - fs.deferDecRef(nextVFSD) // Reference from Lookup. - nextVFSD = nil - } - } - if nextVFSD == nil { - // Dentry isn't cached; it either doesn't exist or failed - // revalidation. Attempt to resolve it via Lookup. - name := rp.Component() - var err error - nextVFSD, err = d.inode.Lookup(ctx, name) - // Reference on nextVFSD dropped by a corresponding Valid. - if err != nil { - return nil, err - } - d.InsertChild(name, nextVFSD) + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD) + d.dirMu.Unlock() + if err != nil { + return nil, err } - next := nextVFSD.Impl().(*Dentry) - // Resolve any symlink at current path component. - if rp.ShouldFollowSymlink() && d.isSymlink() { + if rp.ShouldFollowSymlink() && next.isSymlink() { // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". target, err := next.inode.Readlink(ctx) if err != nil { @@ -89,7 +87,44 @@ afterSymlink: } rp.Advance() - return nextVFSD, nil + return &next.vfsd, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. +// +// Preconditions: Filesystem.mu must be locked for at least reading. +// parent.dirMu must be locked. parent.isDir(). name is not "." or "..". +// +// Postconditions: Caller must call fs.processDeferredDecRefs*. +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) { + if childVFSD != nil { + // Cached dentry exists, revalidate. + child := childVFSD.Impl().(*Dentry) + if !child.inode.Valid(ctx) { + vfsObj.ForceDeleteDentry(childVFSD) + fs.deferDecRef(childVFSD) // Reference from Lookup. + childVFSD = nil + } + } + if childVFSD == nil { + // Dentry isn't cached; it either doesn't exist or failed + // revalidation. Attempt to resolve it via Lookup. + // + // FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry, + // not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries + // in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl + // casts accordingly. + var err error + childVFSD, err = parent.inode.Lookup(ctx, name) + if err != nil { + return nil, err + } + // Reference on childVFSD dropped by a corresponding Valid. + parent.InsertChild(name, childVFSD) + } + return childVFSD.Impl().(*Dentry), nil } // walkExistingLocked resolves rp to an existing file. @@ -242,6 +277,19 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op return vfsd, nil } +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() + vfsd, _, err := fs.walkParentDirLocked(ctx, rp) + if err != nil { + return nil, err + } + vfsd.IncRef() // Ownership transferred to caller. + return vfsd, nil +} + // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { @@ -459,40 +507,42 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } // RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { - noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 - exchange := opts.Flags&linux.RENAME_EXCHANGE != 0 - whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0 - if exchange && (noReplace || whiteout) { - // Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE. - return syserror.EINVAL - } - if exchange || whiteout { - // Exchange and Whiteout flags are not supported on kernfs. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Only RENAME_NOREPLACE is supported. + if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return syserror.EINVAL } + noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() defer fs.mu.Lock() + // Resolve the destination directory first to verify that it's on this + // Mount. + dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + fs.processDeferredDecRefsLocked() + if err != nil { + return err + } mnt := rp.Mount() - if mnt != vd.Mount() { + if mnt != oldParentVD.Mount() { return syserror.EXDEV } - if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() - dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) + srcDirVFSD := oldParentVD.Dentry() + srcDir := srcDirVFSD.Impl().(*Dentry) + srcDir.dirMu.Lock() + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName)) + srcDir.dirMu.Unlock() fs.processDeferredDecRefsLocked() if err != nil { return err } - - srcVFSD := vd.Dentry() - srcDirVFSD := srcVFSD.Parent() + srcVFSD := &src.vfsd // Can we remove the src dentry? if err := checkDeleteLocked(rp, srcVFSD); err != nil { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index f78bb7b04..73b6e43b5 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -24,6 +24,7 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -82,9 +83,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { // Precondition: path should be relative path. func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation { return vfs.PathOperation{ - Root: s.root, - Start: s.root, - Pathname: path, + Root: s.root, + Start: s.root, + Path: fspath.Parse(path), } } diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD index 0cc751eb8..5689bed3b 100644 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -50,6 +50,7 @@ go_test( deps = [ ":memfs", "//pkg/abi/linux", + "//pkg/fspath", "//pkg/refs", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", @@ -68,6 +69,7 @@ go_test( embed = [":memfs"], deps = [ "//pkg/abi/linux", + "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index 4a7a94a52..6e987af88 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -21,6 +21,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" @@ -193,9 +194,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ - Root: root, - Start: vd, - Pathname: name, + Root: root, + Start: vd, + Path: fspath.Parse(name), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -216,7 +217,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: vd, - Pathname: filename, + Path: fspath.Parse(filename), FollowFinalSymlink: true, }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, @@ -237,7 +238,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { @@ -378,9 +379,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { root := mntns.Root() defer root.DecRef() pop := vfs.PathOperation{ - Root: root, - Start: root, - Pathname: mountPointName, + Root: root, + Start: root, + Path: fspath.Parse(mountPointName), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -408,9 +409,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) pop := vfs.PathOperation{ - Root: root, - Start: vd, - Pathname: name, + Root: root, + Start: vd, + Path: fspath.Parse(name), } if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ Mode: 0755, @@ -438,7 +439,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: vd, - Pathname: filename, + Path: fspath.Parse(filename), FollowFinalSymlink: true, }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, @@ -458,7 +459,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: filePath, + Path: fspath.Parse(filePath), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index af4389459..4a83f310c 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -25,323 +25,283 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// stepLocked resolves rp.Component() in parent directory vfsd. +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. // // stepLocked is loosely analogous to fs/namei.c:walk_component(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). inode == -// vfsd.Impl().(*dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) { - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, nil, err + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err } afterSymlink: - nextVFSD, err := rp.ResolveComponent(vfsd) + nextVFSD, err := rp.ResolveComponent(&d.vfsd) if err != nil { - return nil, nil, err + return nil, err } if nextVFSD == nil { // Since the Dentry tree is the sole source of truth for memfs, if it's // not in the Dentry tree, it doesn't exist. - return nil, nil, syserror.ENOENT + return nil, syserror.ENOENT } - nextInode := nextVFSD.Impl().(*dentry).inode - if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + next := nextVFSD.Impl().(*dentry) + if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // TODO: symlink traversals update access time if err := rp.HandleSymlink(symlink.target); err != nil { - return nil, nil, err + return nil, err } goto afterSymlink // don't check the current directory again } rp.Advance() - return nextVFSD, nextInode, nil + return next, nil } -// walkExistingLocked resolves rp to an existing file. +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. // -// walkExistingLocked is loosely analogous to Linux's -// fs/namei.c:path_lookupat(). +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). // -// Preconditions: filesystem.mu must be locked. -func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Done() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + for !rp.Final() { + next, err := stepLocked(rp, d) if err != nil { - return nil, nil, err + return nil, err } + d = next } - if rp.MustBeDir() && !inode.isDir() { - return nil, nil, syserror.ENOTDIR + if !d.inode.isDir() { + return nil, syserror.ENOTDIR } - return vfsd, inode, nil + return d, nil } -// walkParentDirLocked resolves all but the last path component of rp to an -// existing directory. It does not check that the returned directory is -// searchable by the provider of rp. +// resolveLocked resolves rp to an existing file. // -// walkParentDirLocked is loosely analogous to Linux's -// fs/namei.c:path_parentat(). +// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) { - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) +// Preconditions: filesystem.mu must be locked. +func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + next, err := stepLocked(rp, d) if err != nil { - return nil, nil, err + return nil, err } + d = next } - if !inode.isDir() { - return nil, nil, syserror.ENOTDIR + if rp.MustBeDir() && !d.inode.isDir() { + return nil, syserror.ENOTDIR } - return vfsd, inode, nil + return d, nil } -// checkCreateLocked checks that a file named rp.Component() may be created in -// directory parentVFSD, then returns rp.Component(). +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. // -// Preconditions: filesystem.mu must be locked. parentInode == -// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true. -func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) { - if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { - return "", err - } - pc := rp.Component() - if pc == "." || pc == ".." { - return "", syserror.EEXIST - } - childVFSD, err := rp.ResolveChild(parentVFSD, pc) +// doCreateAt is loosely analogous to a conjunction of Linux's +// fs/namei.c:filename_create() and done_path_create(). +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { - return "", err + return err } - if childVFSD != nil { - return "", syserror.EEXIST + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err } - if parentVFSD.IsDisowned() { - return "", syserror.ENOENT + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST } - return pc, nil -} - -// checkDeleteLocked checks that the file represented by vfsd may be deleted. -func checkDeleteLocked(vfsd *vfs.Dentry) error { - parentVFSD := vfsd.Parent() - if parentVFSD == nil { - return syserror.EBUSY + // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the child exists we want to return EEXIST immediately instead + // of attempting symlink/mount traversal. + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST } - if parentVFSD.IsDisowned() { + if !dir && rp.MustBeDir() { return syserror.ENOENT } - return nil + // In memfs, the only way to cause a dentry to be disowned is by removing + // it from the filesystem, so this check is equivalent to checking if + // parent has been removed. + if parent.vfsd.IsDisowned() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + return create(parent, name) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - vfsd, inode, err := walkExistingLocked(rp) + d, err := resolveLocked(rp) if err != nil { return nil, err } if opts.CheckSearchable { - if !inode.isDir() { + if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { return nil, err } } - inode.incRef() - return vfsd, nil + d.IncRef() + return &d.vfsd, nil } -// LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { - return err - } - if rp.Mount() != vd.Mount() { - return syserror.EXDEV - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - d := vd.Dentry().Impl().(*dentry) - if d.inode.isDir() { - return syserror.EPERM + return nil, err } - d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + d := vd.Dentry().Impl().(*dentry) + if d.inode.isDir() { + return syserror.EPERM + } + if d.inode.nlink == 0 { + return syserror.ENOENT + } + if d.inode.nlink == maxLinks { + return syserror.EMLINK + } + d.inode.incLinksLocked() + child := fs.newDentry(d.inode) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - parentInode.incLinksLocked() // from child's ".." - return nil + return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { + if parent.inode.nlink == maxLinks { + return syserror.EMLINK + } + parent.inode.incLinksLocked() // from child's ".." + child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - - switch opts.Mode.FileType() { - case 0: - // "Zero file type is equivalent to type S_IFREG." - mknod(2) - fallthrough - case linux.ModeRegular: - // TODO(b/138862511): Implement. - return syserror.EINVAL - - case linux.ModeNamedPipe: - child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil - - case linux.ModeSocket: - // TODO(b/138862511): Implement. - return syserror.EINVAL - - case linux.ModeCharacterDevice: - fallthrough - case linux.ModeBlockDevice: - // TODO(b/72101894): We don't support creating block or character - // devices at the moment. - // - // When we start supporting block and character devices, we'll - // need to check for CAP_MKNOD here. - return syserror.EPERM - - default: - // "EINVAL - mode requested creation of something other than a - // regular file, device special file, FIFO or socket." - mknod(2) - return syserror.EINVAL - } + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + switch opts.Mode.FileType() { + case 0, linux.S_IFREG: + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFIFO: + child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK: + // Not yet supported. + return syserror.EPERM + default: + return syserror.EINVAL + } + }) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - // Filter out flags that are not supported by memfs. O_DIRECTORY and - // O_NOFOLLOW have no effect here (they're handled by VFS by setting - // appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK - // is supported only by pipes. - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK + if opts.Flags&linux.O_TMPFILE != 0 { + // Not yet supported. + return nil, syserror.EOPNOTSUPP + } + // Handle O_CREAT and !O_CREAT separately, since in the latter case we + // don't need fs.mu for writing. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.mu.RUnlock() - vfsd, inode, err := walkExistingLocked(rp) + d, err := resolveLocked(rp) if err != nil { return nil, err } - return inode.open(ctx, rp, vfsd, opts.Flags, false) + return d.open(ctx, rp, opts.Flags, false /* afterCreate */) } mustCreate := opts.Flags&linux.O_EXCL != 0 - vfsd := rp.Start() - inode := vfsd.Impl().(*dentry).inode + start := rp.Start().Impl().(*dentry) fs.mu.Lock() defer fs.mu.Unlock() if rp.Done() { + // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, syserror.EISDIR } if mustCreate { return nil, syserror.EEXIST } - return inode.open(ctx, rp, vfsd, opts.Flags, false) + return start.open(ctx, rp, opts.Flags, false /* afterCreate */) } afterTrailingSymlink: - // Walk to the parent directory of the last path component. - for !rp.Final() { - var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode) - if err != nil { - return nil, err - } - } - if !inode.isDir() { - return nil, syserror.ENOTDIR + parent, err := walkParentDirLocked(rp, start) + if err != nil { + return nil, err } // Check for search permission in the parent directory. - if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, syserror.EISDIR } - pc := rp.Component() - if pc == "." || pc == ".." { + name := rp.Component() + if name == "." || name == ".." { return nil, syserror.EISDIR } // Determine whether or not we need to create a file. - childVFSD, err := rp.ResolveChild(vfsd, pc) - if err != nil { - return nil, err - } - if childVFSD == nil { + child, err := stepLocked(rp, parent) + if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -349,38 +309,35 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - childInode := fs.newRegularFile(rp.Credentials(), opts.Mode) - child := fs.newDentry(childInode) - vfsd.InsertChild(&child.vfsd, pc) - inode.impl.(*directory).childList.PushBack(child) - return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true) + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return child.open(ctx, rp, opts.Flags, true) } - // Open existing file or follow symlink. - if mustCreate { - return nil, syserror.EEXIST + if err != nil { + return nil, err } - childInode := childVFSD.Impl().(*dentry).inode - if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time - if err := rp.HandleSymlink(symlink.target); err != nil { - return nil, err - } - // rp.Final() may no longer be true since we now need to resolve the - // symlink target. + // Do we need to resolve a trailing symlink? + if !rp.Done() { + start = parent goto afterTrailingSymlink } - return childInode.open(ctx, rp, childVFSD, opts.Flags, false) + // Open existing file. + if mustCreate { + return nil, syserror.EEXIST + } + return child.open(ctx, rp, opts.Flags, false) } -func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(flags) if !afterCreate { - if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { return nil, err } } mnt := rp.Mount() - switch impl := i.impl.(type) { + switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD fd.readable = vfs.MayReadFileWithOpenFlags(flags) @@ -392,8 +349,8 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr // mnt.EndWrite() is called by regularFileFD.Release(). } mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + d.IncRef() + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data = impl.data[:0] @@ -408,28 +365,28 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } var fd directoryFD mnt.IncRef() - vfsd.IncRef() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + d.IncRef() + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, vfsd, flags) + return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags) default: - panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() - _, inode, err := walkExistingLocked(rp) - fs.mu.RUnlock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) if err != nil { return "", err } - symlink, ok := inode.impl.(*symlink) + symlink, ok := d.inode.impl.(*symlink) if !ok { return "", syserror.EINVAL } @@ -437,63 +394,172 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st } // RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { - if rp.Done() { - return syserror.ENOENT +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // TODO(b/145974740): Support renameat2 flags. + return syserror.EINVAL } + + // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) + newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - _, err = checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { return err } - if err := rp.Mount().CheckBeginWrite(); err != nil { + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { return err } - defer rp.Mount().EndWrite() - // TODO: actually implement RenameAt - return syserror.EPERM + // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the existing child is a symlink or mount point then we want + // to rename over it rather than follow it. + renamedVFSD := oldParent.vfsd.Child(oldName) + if renamedVFSD == nil { + return syserror.ENOENT + } + renamed := renamedVFSD.Impl().(*dentry) + if renamed.inode.isDir() { + if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + // Writability is needed to change renamed's "..". + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + if replacedVFSD != nil { + replaced = replacedVFSD.Impl().(*dentry) + if replaced.inode.isDir() { + if !renamed.inode.isDir() { + return syserror.EISDIR + } + if replaced.vfsd.HasChildren() { + return syserror.ENOTEMPTY + } + } else { + if rp.MustBeDir() { + return syserror.ENOTDIR + } + if renamed.inode.isDir() { + return syserror.ENOTDIR + } + } + } else { + if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + return syserror.EMLINK + } + } + if newParent.vfsd.IsDisowned() { + return syserror.ENOENT + } + + // Linux places this check before some of those above; we do it here for + // simplicity, under the assumption that applications are not intentionally + // doing noop renames expecting them to succeed where non-noop renames + // would fail. + if renamedVFSD == replacedVFSD { + return nil + } + vfsObj := rp.VirtualFilesystem() + oldParentDir := oldParent.inode.impl.(*directory) + newParentDir := newParent.inode.impl.(*directory) + if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + return err + } + if replaced != nil { + newParentDir.childList.Remove(replaced) + if replaced.inode.isDir() { + newParent.inode.decLinksLocked() // from replaced's ".." + } + replaced.inode.decLinksLocked() + } + oldParentDir.childList.Remove(renamed) + newParentDir.childList.PushBack(renamed) + if renamed.inode.isDir() { + oldParent.inode.decLinksLocked() + newParent.inode.incLinksLocked() + } + // TODO: update timestamps and parent directory sizes + vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) + return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - vfsd, inode, err := walkExistingLocked(rp) + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := rp.Mount().CheckBeginWrite(); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { return err } - defer rp.Mount().EndWrite() - if err := checkDeleteLocked(vfsd); err != nil { - return err + name := rp.Component() + if name == "." { + return syserror.EINVAL } - if !inode.isDir() { + if name == ".." { + return syserror.ENOTEMPTY + } + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if !child.inode.isDir() { return syserror.ENOTDIR } - if vfsd.HasChildren() { + if childVFSD.HasChildren() { return syserror.ENOTEMPTY } - if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { return err } - // Remove from parent directory's childList. - vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) - inode.decRef() + parent.inode.impl.(*directory).childList.Remove(child) + parent.inode.decLinksLocked() // from child's ".." + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) return nil } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - _, _, err := walkExistingLocked(rp) - fs.mu.RUnlock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) if err != nil { return err } @@ -507,21 +573,21 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() - _, inode, err := walkExistingLocked(rp) - fs.mu.RUnlock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) if err != nil { return linux.Statx{}, err } var stat linux.Statx - inode.statTo(&stat) + d.inode.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() - _, _, err := walkExistingLocked(rp) - fs.mu.RUnlock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) if err != nil { return linux.Statfs{}, err } @@ -531,53 +597,52 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - if rp.Done() { - return syserror.EEXIST - } - fs.mu.Lock() - defer fs.mu.Unlock() - parentVFSD, parentInode, err := walkParentDirLocked(rp) - if err != nil { - return err - } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) - if err != nil { - return err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return err - } - defer rp.Mount().EndWrite() - child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parentVFSD.InsertChild(&child.vfsd, pc) - parentInode.impl.(*directory).childList.PushBack(child) - return nil + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - vfsd, inode, err := walkExistingLocked(rp) + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := rp.Mount().CheckBeginWrite(); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { return err } - defer rp.Mount().EndWrite() - if err := checkDeleteLocked(vfsd); err != nil { - return err + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR } - if inode.isDir() { + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if child.inode.isDir() { return syserror.EISDIR } - if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + if !rp.MustBeDir() { + return syserror.ENOTDIR + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { return err } - // Remove from parent directory's childList. - vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry)) - inode.decLinksLocked() + parent.inode.impl.(*directory).childList.Remove(child) + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) return nil } @@ -585,7 +650,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, _, err := walkExistingLocked(rp) + _, err := resolveLocked(rp) if err != nil { return nil, err } @@ -597,7 +662,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, _, err := walkExistingLocked(rp) + _, err := resolveLocked(rp) if err != nil { return "", err } @@ -609,7 +674,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, _, err := walkExistingLocked(rp) + _, err := resolveLocked(rp) if err != nil { return err } @@ -621,7 +686,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, _, err := walkExistingLocked(rp) + _, err := resolveLocked(rp) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go index 9d509f6e4..8d0167c93 100644 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -29,6 +29,7 @@ package memfs import ( "fmt" + "math" "sync" "sync/atomic" @@ -64,12 +65,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt func (fs *filesystem) Release() { } -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // All filesystem state is in-memory. - return nil -} - // dentry implements vfs.DentryImpl. type dentry struct { vfsd vfs.Dentry @@ -137,6 +132,8 @@ type inode struct { impl interface{} // immutable } +const maxLinks = math.MaxUint32 + func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { i.refs = 1 i.mode = uint32(mode) @@ -147,20 +144,28 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.impl = impl } -// Preconditions: filesystem.mu must be locked for writing. +// incLinksLocked increments i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// i.nlink < maxLinks. func (i *inode) incLinksLocked() { - if atomic.AddUint32(&i.nlink, 1) <= 1 { + if i.nlink == 0 { panic("memfs.inode.incLinksLocked() called with no existing links") } + if i.nlink == maxLinks { + panic("memfs.inode.incLinksLocked() called with maximum link count") + } + atomic.AddUint32(&i.nlink, 1) } -// Preconditions: filesystem.mu must be locked for writing. +// decLinksLocked decrements i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { - if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { - i.decRef() - } else if nlink == ^uint32(0) { // negative overflow + if i.nlink == 0 { panic("memfs.inode.decLinksLocked() called with no existing links") } + atomic.AddUint32(&i.nlink, ^uint32(0)) } func (i *inode) incRef() { diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go index 5bf527c80..be917aeee 100644 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ b/pkg/sentry/fsimpl/memfs/pipe_test.go @@ -19,6 +19,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } rfdchan := make(chan *vfs.FileDescription) @@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} @@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} @@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) { pop := vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, } openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} @@ -160,10 +161,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create the pipe. root := mntns.Root() pop := vfs.PathOperation{ - Root: root, - Start: root, - Pathname: fileName, - FollowFinalSymlink: true, + Root: root, + Start: root, + Path: fspath.Parse(fileName), } mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { @@ -174,7 +174,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: root, Start: root, - Pathname: fileName, + Path: fspath.Parse(fileName), FollowFinalSymlink: true, }, &vfs.StatOptions{}) if err != nil { diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 6209eb053..1bc9c4a38 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -234,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) { child.name = name } +// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either +// d2's parent or an ancestor of d2's parent. +func (d *Dentry) IsAncestorOf(d2 *Dentry) bool { + for d2.parent != nil { + if d2.parent == d { + return true + } + d2 = d2.parent + } + return false +} + // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. @@ -283,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { } } -// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as -// appropriate for in-memory filesystems that don't need to ensure that some -// external state change succeeds before committing the deletion. -// -// DeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: d is a child Dentry. -func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { - if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { - return err - } - vfs.CommitDeleteDentry(d) - return nil -} - // ForceDeleteDentry causes d to become disowned. It should only be used in // cases where VFS has no ability to stop the deletion (e.g. d represents the // local state of a file on a remote filesystem on which the file has already @@ -326,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { // CommitRenameExchangeDentry depending on the rename's outcome. // // Preconditions: from is a child Dentry. If to is not nil, it must be a child -// Dentry from the same Filesystem. +// Dentry from the same Filesystem. from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { if checkInvariants { if from.parent == nil { diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index df03886c3..0b053201a 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -192,6 +192,8 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and // auth.KGID respectively). // +// All methods may return errors not specified. +// // FileDescriptionImpl is analogous to Linux's struct file_operations. type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero @@ -220,6 +222,10 @@ type FileDescriptionImpl interface { // PRead reads from the file into dst, starting at the given offset, and // returns the number of bytes read. PRead is permitted to return partial // reads with a nil error. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -229,6 +235,10 @@ type FileDescriptionImpl interface { // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions // with Regular File Operations" requires that all operations that may // mutate the FileDescription offset are serialized. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns @@ -238,6 +248,11 @@ type FileDescriptionImpl interface { // As in Linux (but not POSIX), if O_APPEND is in effect for the // FileDescription, PWrite should ignore the offset and append data to the // end of the file. + // + // Errors: + // + // - If opts.Flags specifies unsupported options, PWrite returns + // EOPNOTSUPP. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -247,6 +262,10 @@ type FileDescriptionImpl interface { // PWrite that uses a FileDescription offset, to make it possible for // remote filesystems to implement O_APPEND correctly (i.e. atomically with // respect to writers outside the scope of VFS). + // + // Errors: + // + // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index b766614e7..89bd58864 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -108,6 +108,24 @@ func (fs *Filesystem) DecRef() { // (responsible for actually implementing the operation) isn't known until path // resolution is complete. // +// Unless otherwise specified, FilesystemImpl methods are responsible for +// performing permission checks. In many cases, vfs package functions in +// permissions.go may be used to help perform these checks. +// +// When multiple specified error conditions apply to a given method call, the +// implementation may return any applicable errno unless otherwise specified, +// but returning the earliest error specified is preferable to maximize +// compatibility with Linux. +// +// All methods may return errors not specified, notably including: +// +// - ENOENT if a required path component does not exist. +// +// - ENOTDIR if an intermediate path component is not a directory. +// +// - Errors from vfs-package functions (ResolvingPath.Resolve*(), +// Mount.CheckBeginWrite(), permission-checking functions, etc.) +// // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID // and auth.KGID respectively). @@ -130,46 +148,223 @@ type FilesystemImpl interface { // GetDentryAt does not correspond directly to a Linux syscall; it is used // in the implementation of: // - // - Syscalls that need to resolve two paths: rename(), renameat(), - // renameat2(), link(), linkat(). + // - Syscalls that need to resolve two paths: link(), linkat(). // // - Syscalls that need to refer to a filesystem position outside the // context of a file description: chdir(), fchdir(), chroot(), mount(), // umount(). GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) + // GetParentDentryAt returns a Dentry representing the directory at the + // second-to-last path component in rp. (Note that, despite the name, this + // is not necessarily the parent directory of the file at rp, since the + // last path component in rp may be "." or "..".) A reference is taken on + // the returned Dentry. + // + // GetParentDentryAt does not correspond directly to a Linux syscall; it is + // used in the implementation of the rename() family of syscalls, which + // must resolve the parent directories of two paths. + // + // Preconditions: !rp.Done(). + // + // Postconditions: If GetParentDentryAt returns a nil error, then + // rp.Final(). If GetParentDentryAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). + GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) + // LinkAt creates a hard link at rp representing the same file as vd. It // does not take ownership of references on vd. // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(), and that vd does not represent a directory. + // Errors: + // + // - If the last path component in rp is "." or "..", LinkAt returns + // EEXIST. + // + // - If a file already exists at rp, LinkAt returns EEXIST. + // + // - If rp.MustBeDir(), LinkAt returns ENOENT. + // + // - If the directory in which the link would be created has been removed + // by RmdirAt or RenameAt, LinkAt returns ENOENT. + // + // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV. + // + // - If vd represents a directory, LinkAt returns EPERM. + // + // - If vd represents a file for which all existing links have been + // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns + // ENOENT. Equivalently, if vd represents a file with a link count of 0 not + // created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If LinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error // MkdirAt creates a directory at rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", MkdirAt returns + // EEXIST. + // + // - If a file already exists at rp, MkdirAt returns EEXIST. + // + // - If the directory in which the new directory would be created has been + // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If MkdirAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error // MknodAt creates a regular file, device special file, or named pipe at // rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", MknodAt returns + // EEXIST. + // + // - If a file already exists at rp, MknodAt returns EEXIST. + // + // - If rp.MustBeDir(), MknodAt returns ENOENT. + // + // - If the directory in which the file would be created has been removed + // by RmdirAt or RenameAt, MknodAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If MknodAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error // OpenAt returns an FileDescription providing access to the file at rp. A // reference is taken on the returned FileDescription. + // + // Errors: + // + // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by + // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported + // features are silently ignored, consistently with Linux's open*(2).) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) // ReadlinkAt returns the target of the symbolic link at rp. + // + // Errors: + // + // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL. ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) - // RenameAt renames the Dentry represented by vd to rp. It does not take - // ownership of references on vd. + // RenameAt renames the file named oldName in directory oldParentVD to rp. + // It does not take ownership of references on oldParentVD. + // + // Errors [1]: + // + // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL. + // + // - If the last path component in rp is "." or "..", and opts.Flags + // contains RENAME_NOREPLACE, RenameAt returns EEXIST. + // + // - If the last path component in rp is "." or "..", and opts.Flags does + // not contain RENAME_NOREPLACE, RenameAt returns EBUSY. + // + // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV. // - // The implementation is responsible for checking that vd.Mount() == - // rp.Mount(). - RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error + // - If the renamed file is not a directory, and opts.MustBeDir is true, + // RenameAt returns ENOTDIR. + // + // - If renaming would replace an existing file and opts.Flags contains + // RENAME_NOREPLACE, RenameAt returns EEXIST. + // + // - If there is no existing file at rp and opts.Flags contains + // RENAME_EXCHANGE, RenameAt returns ENOENT. + // + // - If there is an existing non-directory file at rp, and rp.MustBeDir() + // is true, RenameAt returns ENOTDIR. + // + // - If the renamed file is not a directory, opts.Flags does not contain + // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR. + // (This check is not subsumed by the check for directory replacement below + // since it applies even if there is no file to replace.) + // + // - If the renamed file is a directory, and the new parent directory of + // the renamed file is either the renamed directory or a descendant + // subdirectory of the renamed directory, RenameAt returns EINVAL. + // + // - If renaming would exchange the renamed file with an ancestor directory + // of the renamed file, RenameAt returns EINVAL. + // + // - If renaming would replace an ancestor directory of the renamed file, + // RenameAt returns ENOTEMPTY. (This check would be subsumed by the + // non-empty directory check below; however, this check takes place before + // the self-rename check.) + // + // - If the renamed file would replace or exchange with itself (i.e. the + // source and destination paths resolve to the same file), RenameAt returns + // nil, skipping the checks described below. + // + // - If the source or destination directory is not writable by the provider + // of rp.Credentials(), RenameAt returns EACCES. + // + // - If the renamed file is a directory, and renaming would replace a + // non-directory file, RenameAt returns ENOTDIR. + // + // - If the renamed file is not a directory, and renaming would replace a + // directory, RenameAt returns EISDIR. + // + // - If the new parent directory of the renamed file has been removed by + // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT. + // + // - If the renamed file is a directory, it is not writable by the + // provider of rp.Credentials(), and the source and destination parent + // directories are different, RenameAt returns EACCES. (This is nominally + // required to change the ".." entry in the renamed directory.) + // + // - If renaming would replace a non-empty directory, RenameAt returns + // ENOTEMPTY. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). oldName is not "." or "..". + // + // Postconditions: If RenameAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). + // + // [1] "The worst of all namespace operations - renaming directory. + // "Perverted" doesn't even start to describe it. Somebody in UCB had a + // heck of a trip..." - fs/namei.c:vfs_rename() + RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error // RmdirAt removes the directory at rp. + // + // Errors: + // + // - If the last path component in rp is ".", RmdirAt returns EINVAL. + // + // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY. + // + // - If no file exists at rp, RmdirAt returns ENOENT. + // + // - If the file at rp exists but is not a directory, RmdirAt returns + // ENOTDIR. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If RmdirAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). RmdirAt(ctx context.Context, rp *ResolvingPath) error // SetStatAt updates metadata for the file at the given path. + // + // Errors: + // + // - If opts specifies unsupported options, SetStatAt returns EINVAL. SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error // StatAt returns metadata for the file at rp. @@ -181,9 +376,45 @@ type FilesystemImpl interface { StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) // SymlinkAt creates a symbolic link at rp referring to the given target. + // + // Errors: + // + // - If the last path component in rp is "." or "..", SymlinkAt returns + // EEXIST. + // + // - If a file already exists at rp, SymlinkAt returns EEXIST. + // + // - If rp.MustBeDir(), SymlinkAt returns ENOENT. + // + // - If the directory in which the symbolic link would be created has been + // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If SymlinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error - // UnlinkAt removes the non-directory file at rp. + // UnlinkAt removes the file at rp. + // + // Errors: + // + // - If the last path component in rp is "." or "..", UnlinkAt returns + // EISDIR. + // + // - If no file exists at rp, UnlinkAt returns ENOENT. + // + // - If rp.MustBeDir(), and the file at rp exists and is not a directory, + // UnlinkAt returns ENOTDIR. + // + // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. + // + // Preconditions: !rp.Done(). For the final path component in rp, + // !rp.ShouldFollowSymlink(). + // + // Postconditions: If UnlinkAt returns an error returned by + // ResolvingPath.Resolve*(), then !rp.Done(). UnlinkAt(ctx context.Context, rp *ResolvingPath) error // ListxattrAt returns all extended attribute names for the file at rp. diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 97ee4a446..87d2b0d1c 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -83,6 +83,9 @@ type ReadOptions struct { type RenameOptions struct { // Flags contains flags as specified for renameat2(2). Flags uint32 + + // If MustBeDir is true, the renamed file must be a directory. + MustBeDir bool } // SetStatOptions contains options to VirtualFilesystem.SetStatAt(), diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index d580fd39e..f0641d314 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{ }, } -func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { - path, err := fspath.Parse(pop.Pathname) - if err != nil { - return nil, err - } +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath { rp := resolvingPathPool.Get().(*ResolvingPath) rp.vfs = vfs rp.root = pop.Root rp.mount = pop.Start.mount rp.start = pop.Start.dentry - rp.pit = path.Begin + rp.pit = pop.Path.Begin rp.flags = 0 if pop.FollowFinalSymlink { rp.flags |= rpflagsFollowFinalSymlink } - rp.mustBeDir = path.Dir - rp.mustBeDirOrig = path.Dir + rp.mustBeDir = pop.Path.Dir + rp.mustBeDirOrig = pop.Path.Dir rp.symlinks = 0 rp.curPart = 0 rp.numOrigParts = 1 rp.creds = creds - rp.parts[0] = path.Begin - rp.origParts[0] = path.Begin - return rp, nil + rp.parts[0] = pop.Path.Begin + rp.origParts[0] = pop.Path.Begin + return rp } func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { @@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool { // symlink target and returns nil. Otherwise it returns a non-nil error. // // Preconditions: !rp.Done(). +// +// Postconditions: If HandleSymlink returns a nil error, then !rp.Done(). func (rp *ResolvingPath) HandleSymlink(target string) error { if rp.symlinks >= linux.MaxSymlinkTraversals { return syserror.ELOOP } - targetPath, err := fspath.Parse(target) - if err != nil { - return err + if len(target) == 0 { + return syserror.ENOENT } rp.symlinks++ + targetPath := fspath.Parse(target) if targetPath.Absolute { rp.absSymlinkTarget = targetPath return resolveAbsSymlinkError{} } - if !targetPath.Begin.Ok() { - panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) - } // Consume the path component that represented the symlink. rp.Advance() // Prepend the symlink target to the relative path. + if checkInvariants { + if !targetPath.HasComponents() { + panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target)) + } + } rp.relpathPrepend(targetPath) return nil } +// Preconditions: path.HasComponents(). func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { if rp.pit.Ok() { rp.parts[rp.curPart] = rp.pit @@ -467,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool { } } +// canHandleError returns true if err is an error returned by rp.Resolve*() +// that rp.handleError() may attempt to handle. +func (rp *ResolvingPath) canHandleError(err error) bool { + switch err.(type) { + case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError: + return true + default: + return false + } +} + // MustBeDir returns true if the file traversed by rp must be a directory. func (rp *ResolvingPath) MustBeDir() bool { return rp.mustBeDir diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index d94117bce..ee5c8b9e2 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -57,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, return nil, syserror.EPERM } +// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. +func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { + return nil, syserror.EPERM +} + // LinkAt implements FilesystemImpl.LinkAt. func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { return syserror.EPERM @@ -83,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) ( } // RenameAt implements FilesystemImpl.RenameAt. -func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error { +func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index e60898d7c..3e4df8558 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -28,9 +28,11 @@ package vfs import ( + "fmt" "sync" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" @@ -111,11 +113,11 @@ type PathOperation struct { // are borrowed from the provider of the PathOperation (i.e. the caller of // the VFS method to which the PathOperation was passed). // - // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. Start VirtualDentry // Path is the pathname traversed by this operation. - Pathname string + Path fspath.Path // If FollowFinalSymlink is true, and the Dentry traversed by the final // path component represents a symbolic link, the symbolic link should be @@ -126,10 +128,7 @@ type PathOperation struct { // GetDentryAt returns a VirtualDentry representing the given path, at which a // file must exist. A reference is taken on the returned VirtualDentry. func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return VirtualDentry{}, err - } + rp := vfs.getResolvingPath(creds, pop) for { d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) if err == nil { @@ -148,6 +147,33 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede } } +// Preconditions: pop.Path.Begin.Ok(). +func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { + rp := vfs.getResolvingPath(creds, pop) + for { + parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) + if err == nil { + parentVD := VirtualDentry{ + mount: rp.mount, + dentry: parent, + } + rp.mount.IncRef() + name := rp.Component() + vfs.putResolvingPath(rp) + return parentVD, name, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, "", err + } + } +} + // LinkAt creates a hard link at newpop representing the existing file at // oldpop. func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { @@ -155,21 +181,36 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential if err != nil { return err } - rp, err := vfs.getResolvingPath(creds, newpop) - if err != nil { + + if !newpop.Path.Begin.Ok() { oldVD.DecRef() - return err + if newpop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT } + if newpop.FollowFinalSymlink { + oldVD.DecRef() + ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { - oldVD.DecRef() vfs.putResolvingPath(rp) + oldVD.DecRef() return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { - oldVD.DecRef() vfs.putResolvingPath(rp) + oldVD.DecRef() return err } } @@ -177,19 +218,32 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") + return syserror.EINVAL + } // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is // also honored." - mkdir(2) opts.Mode &= 0777 | linux.S_ISVTX - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } + + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(rp) return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { vfs.putResolvingPath(rp) return err @@ -200,16 +254,29 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia // MknodAt creates a file of the given mode at the given path. It returns an // error from the syserror package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") + return syserror.EINVAL } + + rp := vfs.getResolvingPath(creds, pop) for { - if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil { + err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) + if err != nil { vfs.putResolvingPath(rp) return nil } - // Handle mount traversals. + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { vfs.putResolvingPath(rp) return err @@ -259,10 +326,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential if opts.Flags&linux.O_NOFOLLOW != 0 { pop.FollowFinalSymlink = false } - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } + rp := vfs.getResolvingPath(creds, pop) if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true rp.mustBeDirOrig = true @@ -282,10 +346,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // ReadlinkAt returns the target of the symbolic link at the given path. func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return "", err - } + rp := vfs.getResolvingPath(creds, pop) for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { @@ -301,25 +362,59 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden // RenameAt renames the file at oldpop to newpop. func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { - oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) - if err != nil { - return err + if !oldpop.Path.Begin.Ok() { + if oldpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT } - rp, err := vfs.getResolvingPath(creds, newpop) + if oldpop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") + return syserror.EINVAL + } + + oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) if err != nil { - oldVD.DecRef() return err } + if oldName == "." || oldName == ".." { + oldParentVD.DecRef() + return syserror.EBUSY + } + + if !newpop.Path.Begin.Ok() { + oldParentVD.DecRef() + if newpop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if newpop.FollowFinalSymlink { + oldParentVD.DecRef() + ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, newpop) + renameOpts := *opts + if oldpop.Path.Dir { + renameOpts.MustBeDir = true + } for { - err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts) + err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { - oldVD.DecRef() vfs.putResolvingPath(rp) + oldParentVD.DecRef() return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { - oldVD.DecRef() vfs.putResolvingPath(rp) + oldParentVD.DecRef() return err } } @@ -327,16 +422,29 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") + return syserror.EINVAL + } + + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { vfs.putResolvingPath(rp) return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { vfs.putResolvingPath(rp) return err @@ -346,10 +454,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia // SetStatAt changes metadata for the file at the given path. func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { @@ -365,10 +470,7 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent // StatAt returns metadata for the file at the given path. func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statx{}, err - } + rp := vfs.getResolvingPath(creds, pop) for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { @@ -385,10 +487,7 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential // StatFSAt returns metadata for the filesystem containing the file at the // given path. func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return linux.Statfs{}, err - } + rp := vfs.getResolvingPath(creds, pop) for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { @@ -404,16 +503,29 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EEXIST + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") + return syserror.EINVAL } + + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { vfs.putResolvingPath(rp) return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { vfs.putResolvingPath(rp) return err @@ -423,16 +535,29 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return syserror.EBUSY + } + return syserror.ENOENT + } + if pop.FollowFinalSymlink { + ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") + return syserror.EINVAL } + + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { vfs.putResolvingPath(rp) return nil } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + } + } if !rp.handleError(err) { vfs.putResolvingPath(rp) return err @@ -443,10 +568,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // ListxattrAt returns all extended attribute names for the file at the given // path. func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return nil, err - } + rp := vfs.getResolvingPath(creds, pop) for { names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp) if err == nil { @@ -471,10 +593,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // GetxattrAt returns the value associated with the given extended attribute // for the file at the given path. func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return "", err - } + rp := vfs.getResolvingPath(creds, pop) for { val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name) if err == nil { @@ -491,10 +610,7 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden // SetxattrAt changes the value associated with the given extended attribute // for the file at the given path. func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) if err == nil { @@ -510,10 +626,7 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden // RemovexattrAt removes the given extended attribute from the file at rp. func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { - rp, err := vfs.getResolvingPath(creds, pop) - if err != nil { - return err - } + rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) if err == nil { diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 1987e89cc..2269f6237 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -45,6 +45,7 @@ var ( ELIBBAD = error(syscall.ELIBBAD) ELOOP = error(syscall.ELOOP) EMFILE = error(syscall.EMFILE) + EMLINK = error(syscall.EMLINK) EMSGSIZE = error(syscall.EMSGSIZE) ENAMETOOLONG = error(syscall.ENAMETOOLONG) ENOATTR = ENODATA -- cgit v1.2.3 From 574e988f2bc6060078a17f37a377441703c52a22 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 23 Dec 2019 17:31:20 -0800 Subject: Fix deadlock in kernfs.Filesystem.revalidateChildLocked It was calling Dentry.InsertChild with the dentry's mutex already locked. Updates #1035 PiperOrigin-RevId: 286962742 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a6f9fced5..79759e0fc 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -122,7 +122,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir return nil, err } // Reference on childVFSD dropped by a corresponding Valid. - parent.InsertChild(name, childVFSD) + parent.insertChildLocked(name, childVFSD) } return childVFSD.Impl().(*Dentry), nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index bb01c3d01..ac802218d 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -239,14 +239,22 @@ func (d *Dentry) destroy() { // // Precondition: d must represent a directory inode. func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { + d.dirMu.Lock() + d.insertChildLocked(name, child) + d.dirMu.Unlock() +} + +// insertChildLocked is equivalent to InsertChild, with additional +// preconditions. +// +// Precondition: d.dirMu must be locked. +func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } vfsDentry := d.VFSDentry() vfsDentry.IncRef() // DecRef in child's Dentry.destroy. - d.dirMu.Lock() vfsDentry.InsertChild(child, name) - d.dirMu.Unlock() } // The Inode interface maps filesystem-level operations that operate on paths to -- cgit v1.2.3 From 3c125eb21946e1f6bf8f22f4169baafb7f07bf60 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 26 Dec 2019 14:42:19 -0800 Subject: Initial procfs implementation in VFSv2 Updates #1195 PiperOrigin-RevId: 287227722 --- pkg/sentry/fsimpl/kernfs/BUILD | 1 + pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 20 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 5 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 20 ++ pkg/sentry/fsimpl/kernfs/kernfs.go | 9 + pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 45 +++ pkg/sentry/fsimpl/proc/BUILD | 33 +- pkg/sentry/fsimpl/proc/boot_test.go | 149 +++++++++ pkg/sentry/fsimpl/proc/filesystem.go | 69 +++++ pkg/sentry/fsimpl/proc/filesystems.go | 25 -- pkg/sentry/fsimpl/proc/loadavg.go | 8 +- pkg/sentry/fsimpl/proc/meminfo.go | 6 +- pkg/sentry/fsimpl/proc/proc.go | 16 - pkg/sentry/fsimpl/proc/stat.go | 6 +- pkg/sentry/fsimpl/proc/task.go | 341 ++++++++------------ pkg/sentry/fsimpl/proc/task_files.go | 272 ++++++++++++++++ pkg/sentry/fsimpl/proc/tasks.go | 162 ++++++++++ pkg/sentry/fsimpl/proc/tasks_files.go | 92 ++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 412 +++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/version.go | 6 +- pkg/sentry/kernel/kernel.go | 7 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/thread_group.go | 8 +- pkg/sentry/vfs/file_description_impl_util.go | 11 + 25 files changed, 1454 insertions(+), 273 deletions(-) create mode 100644 pkg/sentry/fsimpl/kernfs/symlink.go create mode 100644 pkg/sentry/fsimpl/proc/boot_test.go create mode 100644 pkg/sentry/fsimpl/proc/filesystem.go delete mode 100644 pkg/sentry/fsimpl/proc/filesystems.go delete mode 100644 pkg/sentry/fsimpl/proc/proc.go create mode 100644 pkg/sentry/fsimpl/proc/task_files.go create mode 100644 pkg/sentry/fsimpl/proc/tasks.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_files.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 59f7f39e2..39c03ee9d 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -25,6 +25,7 @@ go_library( "inode_impl_util.go", "kernfs.go", "slot_list.go", + "symlink.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 51102ce48..c5fe65722 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -15,6 +15,8 @@ package kernfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -26,7 +28,10 @@ import ( // DynamicBytesFile implements kernfs.Inode and represents a read-only // file whose contents are backed by a vfs.DynamicBytesSource. // -// Must be initialized with Init before first use. +// Must be instantiated with NewDynamicBytesFile or initialized with Init +// before first use. +// +// +stateify savable type DynamicBytesFile struct { InodeAttrs InodeNoopRefCount @@ -36,9 +41,14 @@ type DynamicBytesFile struct { data vfs.DynamicBytesSource } -// Init intializes a dynamic bytes file. -func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource) { - f.InodeAttrs.Init(creds, ino, linux.ModeRegular|0444) +var _ Inode = (*DynamicBytesFile)(nil) + +// Init initializes a dynamic bytes file. +func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) f.data = data } @@ -59,6 +69,8 @@ func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { // DynamicBytesFile. // // Must be initialized with Init before first use. +// +// +stateify savable type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bd402330f..77975583b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -154,7 +154,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent fd.off++ } - return nil + var err error + relOffset := fd.off - int64(len(fd.children.set)) - 2 + fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset) + return err } // Seek implements vfs.FileDecriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 7b45b702a..752e0f659 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -139,6 +139,11 @@ func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, panic("Lookup called on non-directory inode") } +// IterDirents implements Inode.IterDirents. +func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { + panic("IterDirents called on non-directory inode") +} + // Valid implements Inode.Valid. func (*InodeNotDirectory) Valid(context.Context) bool { return true @@ -156,6 +161,11 @@ func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dent return nil, syserror.ENOENT } +// IterDirents implements Inode.IterDirents. +func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + return offset, nil +} + // Valid implements Inode.Valid. func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { return true @@ -490,3 +500,13 @@ func (o *OrderedChildren) nthLocked(i int64) *slot { } return nil } + +// InodeSymlink partially implements Inode interface for symlinks. +type InodeSymlink struct { + InodeNotDirectory +} + +// Open implements Inode.Open. +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + return nil, syserror.ELOOP +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index ac802218d..d69b299ae 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -404,6 +404,15 @@ type inodeDynamicLookup interface { // Valid should return true if this inode is still valid, or needs to // be resolved again by a call to Lookup. Valid(ctx context.Context) bool + + // IterDirents is used to iterate over dynamically created entries. It invokes + // cb on each entry in the directory represented by the FileDescription. + // 'offset' is the offset for the entire IterDirents call, which may include + // results from the caller. 'relOffset' is the offset inside the entries + // returned by this IterDirents invocation. In other words, + // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, + // while 'relOffset' is the place where iteration should start from. + IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } type inodeSymlink interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 73b6e43b5..3db12caa0 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -133,7 +133,7 @@ type file struct { func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, fs.NextIno(), f) + f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) d := &kernfs.Dentry{} d.Init(f) diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go new file mode 100644 index 000000000..068063f4e --- /dev/null +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -0,0 +1,45 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package kernfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type staticSymlink struct { + InodeAttrs + InodeNoopRefCount + InodeSymlink + + target string +} + +var _ Inode = (*staticSymlink)(nil) + +// NewStaticSymlink creates a new symlink file pointing to 'target'. +func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry { + inode := &staticSymlink{target: target} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &Dentry{} + d.Init(inode) + return d +} + +func (s *staticSymlink) Readlink(_ context.Context) (string, error) { + return s.target, nil +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index ade6ac946..1f44b3217 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -6,15 +6,17 @@ package(licenses = ["notice"]) go_library( name = "proc", srcs = [ - "filesystems.go", + "filesystem.go", "loadavg.go", "meminfo.go", "mounts.go", "net.go", - "proc.go", "stat.go", "sys.go", "task.go", + "task_files.go", + "tasks.go", + "tasks_files.go", "version.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", @@ -24,8 +26,10 @@ go_library( "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", @@ -34,17 +38,40 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/syserror", ], ) go_test( name = "proc_test", size = "small", - srcs = ["net_test.go"], + srcs = [ + "boot_test.go", + "net_test.go", + "tasks_test.go", + ], embed = [":proc"], deps = [ "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/fspath", + "//pkg/memutil", + "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go new file mode 100644 index 000000000..84a93ee56 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/boot_test.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "flag" + "fmt" + "os" + "runtime" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/time" + + // Platforms are plugable. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +var ( + platformFlag = flag.String("platform", "ptrace", "specify which platform to use") +) + +// boot initializes a new bare bones kernel for test. +func boot() (*kernel.Kernel, error) { + platformCtr, err := platform.Lookup(*platformFlag) + if err != nil { + return nil, fmt.Errorf("platform not found: %v", err) + } + deviceFile, err := platformCtr.OpenDevice() + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + plat, err := platformCtr.New(deviceFile) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + + k := &kernel.Kernel{ + Platform: plat, + } + + mf, err := createMemoryFile() + if err != nil { + return nil, err + } + k.SetMemoryFile(mf) + + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(nil, k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + ApplicationCores: uint(runtime.GOMAXPROCS(-1)), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + ctx := k.SupervisorContext() + + // Create mount namespace without root as it's the minimum required to create + // the global thread group. + mntns, err := fs.NewMountNamespace(ctx, nil) + if err != nil { + return nil, err + } + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return nil, err + } + tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + k.TestOnly_SetGlobalInit(tg) + + return k, nil +} + +// createTask creates a new bare bones task for tests. +func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { + k := kernel.KernelFromContext(ctx) + config := &kernel.TaskConfig{ + Kernel: k, + ThreadGroup: tc, + TaskContext: &kernel.TaskContext{Name: name}, + Credentials: auth.CredentialsFromContext(ctx), + AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), + UTSNamespace: kernel.UTSNamespaceFromContext(ctx), + IPCNamespace: kernel.IPCNamespaceFromContext(ctx), + AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + } + return k.TaskSet().NewTask(config) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "test-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go new file mode 100644 index 000000000..d09182c77 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -0,0 +1,69 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for procfs. +package proc + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// procFSType is the factory class for procfs. +// +// +stateify savable +type procFSType struct{} + +var _ vfs.FilesystemType = (*procFSType)(nil) + +// GetFilesystem implements vfs.FilesystemType. +func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, nil, fmt.Errorf("procfs requires a kernel") + } + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return nil, nil, fmt.Errorf("procfs requires a PID namespace") + } + + procfs := &kernfs.Filesystem{} + procfs.VFSFilesystem().Init(vfsObj, procfs) + + _, dentry := newTasksInode(procfs, k, pidns) + return procfs.VFSFilesystem(), dentry.VFSDentry(), nil +} + +// dynamicInode is an overfitted interface for common Inodes with +// dynamicByteSource types used in procfs. +type dynamicInode interface { + kernfs.Inode + vfs.DynamicBytesSource + + Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) +} + +func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, ino, inode, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go deleted file mode 100644 index 0e016bca5..000000000 --- a/pkg/sentry/fsimpl/proc/filesystems.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems. -// -// +stateify savable -type filesystemsData struct{} - -// TODO(gvisor.dev/issue/1195): Implement vfs.DynamicBytesSource.Generate for -// filesystemsData. We would need to retrive filesystem names from -// vfs.VirtualFilesystem. Also needs vfs replacement for -// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev. diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go index 9135afef1..5351d86e8 100644 --- a/pkg/sentry/fsimpl/proc/loadavg.go +++ b/pkg/sentry/fsimpl/proc/loadavg.go @@ -19,15 +19,17 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" ) // loadavgData backs /proc/loadavg. // // +stateify savable -type loadavgData struct{} +type loadavgData struct { + kernfs.DynamicBytesFile +} -var _ vfs.DynamicBytesSource = (*loadavgData)(nil) +var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go index 9a827cd66..cbdd4f3fc 100644 --- a/pkg/sentry/fsimpl/proc/meminfo.go +++ b/pkg/sentry/fsimpl/proc/meminfo.go @@ -19,21 +19,23 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. // // +stateify savable type meminfoData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*meminfoData)(nil) +var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go deleted file mode 100644 index 31dec36de..000000000 --- a/pkg/sentry/fsimpl/proc/proc.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package proc implements a partial in-memory file system for procfs. -package proc diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go index 720db3828..50894a534 100644 --- a/pkg/sentry/fsimpl/proc/stat.go +++ b/pkg/sentry/fsimpl/proc/stat.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // cpuStats contains the breakdown of CPU time for /proc/stat. @@ -66,11 +66,13 @@ func (c cpuStats) String() string { // // +stateify savable type statData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*statData)(nil) +var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 0d87be52b..11a64c777 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,251 +15,176 @@ package proc import ( - "bytes" - "fmt" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) -// mapsCommon is embedded by mapsData and smapsData. -type mapsCommon struct { - t *kernel.Task -} - -// mm gets the kernel task's MemoryManager. No additional reference is taken on -// mm here. This is safe because MemoryManager.destroy is required to leave the -// MemoryManager in a state where it's still usable as a DynamicBytesSource. -func (md *mapsCommon) mm() *mm.MemoryManager { - var tmm *mm.MemoryManager - md.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - tmm = mm - } - }) - return tmm -} - -// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// taskInode represents the inode for /proc/PID/ directory. // // +stateify savable -type mapsData struct { - mapsCommon +type taskInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNoDynamicLookup + kernfs.InodeAttrs + kernfs.OrderedChildren + + task *kernel.Task } -var _ vfs.DynamicBytesSource = (*mapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := md.mm(); mm != nil { - mm.ReadMapsDataInto(ctx, buf) +var _ kernfs.Inode = (*taskInode)(nil) + +func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { + contents := map[string]*kernfs.Dentry{ + //"auxv": newAuxvec(t, msrc), + //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + //"comm": newComm(t, msrc), + //"environ": newExecArgInode(t, msrc, environExecArg), + //"exe": newExe(t, msrc), + //"fd": newFdDir(t, msrc), + //"fdinfo": newFdInfoDir(t, msrc), + //"gid_map": newGIDMap(t, msrc), + "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}), + //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + //"ns": newNamespaceDir(t, msrc), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}), + //"uid_map": newUIDMap(t, msrc), } - return nil -} - -// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. -// -// +stateify savable -type smapsData struct { - mapsCommon -} - -var _ vfs.DynamicBytesSource = (*smapsData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - if mm := sd.mm(); mm != nil { - mm.ReadSmapsDataInto(ctx, buf) + if isThreadGroup { + //contents["task"] = p.newSubtasks(t, msrc) } - return nil -} - -// +stateify savable -type taskStatData struct { - t *kernel.Task + //if len(p.cgroupControllers) > 0 { + // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) + //} - // If tgstats is true, accumulate fault stats (not implemented) and CPU - // time across all tasks in t's thread group. - tgstats bool + taskInode := &taskInode{task: task} + // Note: credentials are overridden by taskOwnedInode. + taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) - // pidns is the PID namespace associated with the proc filesystem that - // includes the file using this statData. - pidns *kernel.PIDNamespace -} - -var _ vfs.DynamicBytesSource = (*taskStatData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) - fmt.Fprintf(buf, "(%s) ", s.t.Name()) - fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) - } - fmt.Fprintf(buf, "%d ", ppid) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) - fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) - fmt.Fprintf(buf, "0 " /* flags */) - fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) - var cputime usage.CPUStats - if s.tgstats { - cputime = s.t.ThreadGroup().CPUStats() - } else { - cputime = s.t.CPUStats() - } - fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - cputime = s.t.ThreadGroup().JoinedChildCPUStats() - fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + inode := &taskOwnedInode{Inode: taskInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) - // itrealvalue. Since kernel 2.6.17, this field is no longer - // maintained, and is hard coded as 0. - fmt.Fprintf(buf, "0 ") + taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := taskInode.OrderedChildren.Populate(dentry, contents) + taskInode.IncLinks(links) - // Start time is relative to boot time, expressed in clock ticks. - fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + return dentry +} - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) - fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) +// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long +// as the task is still running. When it's dead, another tasks with the same +// PID could replace it. +func (i *taskInode) Valid(ctx context.Context) bool { + return i.task.ExitState() != kernel.TaskExitDead +} - // rsslim. - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) +// Open implements kernfs.Inode. +func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} - fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) - fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) - fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) - terminationSignal := linux.Signal(0) - if s.t == s.t.ThreadGroup().Leader() { - terminationSignal = s.t.ThreadGroup().TerminationSignal() +// SetStat implements kernfs.Inode. +func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + stat := opts.Stat + if stat.Mask&linux.STATX_MODE != 0 { + return syserror.EPERM } - fmt.Fprintf(buf, "%d ", terminationSignal) - fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) - fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) - fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) - fmt.Fprintf(buf, "0\n" /* exit_code */) - return nil } -// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. -// -// +stateify savable -type statmData struct { - t *kernel.Task +// taskOwnedInode implements kernfs.Inode and overrides inode owner with task +// effective user and group. +type taskOwnedInode struct { + kernfs.Inode + + // owner is the task that owns this inode. + owner *kernel.Task } -var _ vfs.DynamicBytesSource = (*statmData)(nil) +var _ kernfs.Inode = (*taskOwnedInode)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { - var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - } - }) +func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, inode, perm) - fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) - return nil + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d } -// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. -// -// +stateify savable -type statusData struct { - t *kernel.Task - pidns *kernel.PIDNamespace +// Stat implements kernfs.Inode. +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { + stat := i.Inode.Stat(fs) + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + stat.UID = uint32(uid) + stat.GID = uint32(gid) + return stat } -var _ vfs.DynamicBytesSource = (*statusData)(nil) +// CheckPermissions implements kernfs.Inode. +func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := i.Mode() + uid, gid := i.getOwner(mode) + return vfs.GenericCheckPermissions( + creds, + ats, + mode.FileType() == linux.ModeDirectory, + uint16(mode), + uid, + gid, + ) +} -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) - fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) - fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) - fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) - ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { - ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) +func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { + // By default, set the task owner as the file owner. + creds := i.owner.Credentials() + uid := creds.EffectiveKUID + gid := creds.EffectiveKGID + + // Linux doesn't apply dumpability adjustments to world readable/executable + // directories so that applications can stat /proc/PID to determine the + // effective UID of a process. See fs/proc/base.c:task_dump_owner. + if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 { + return uid, gid } - fmt.Fprintf(buf, "PPid:\t%d\n", ppid) - tpid := kernel.ThreadID(0) - if tracer := s.t.Tracer(); tracer != nil { - tpid = s.pidns.IDOfTask(tracer) + + // If the task is not dumpable, then root (in the namespace preferred) + // owns the file. + m := getMM(i.owner) + if m == nil { + return auth.RootKUID, auth.RootKGID } - fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) - var fds int - var vss, rss, data uint64 - s.t.WithMuLocked(func(t *kernel.Task) { - if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.Size() + if m.Dumpability() != mm.UserDumpable { + uid = auth.RootKUID + if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { + uid = kuid } - if mm := t.MemoryManager(); mm != nil { - vss = mm.VirtualMemorySize() - rss = mm.ResidentSetSize() - data = mm.VirtualDataSize() + gid = auth.RootKGID + if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { + gid = kgid } - }) - fmt.Fprintf(buf, "FDSize:\t%d\n", fds) - fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) - fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) - fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) - fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) - creds := s.t.Credentials() - fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) - fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) - fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) - fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) - fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) - // We unconditionally report a single NUMA node. See - // pkg/sentry/syscalls/linux/sys_mempolicy.go. - fmt.Fprintf(buf, "Mems_allowed:\t1\n") - fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") - return nil -} - -// ioUsage is the /proc//io and /proc//task//io data provider. -type ioUsage interface { - // IOUsage returns the io usage data. - IOUsage() *usage.IO -} - -// +stateify savable -type ioData struct { - ioUsage + } + return uid, gid } -var _ vfs.DynamicBytesSource = (*ioData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { - io := usage.IO{} - io.Accumulate(i.IOUsage()) - - fmt.Fprintf(buf, "char: %d\n", io.CharsRead) - fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) - fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) - fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) - fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) - fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) - fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) - return nil +func newIO(t *kernel.Task, isThreadGroup bool) *ioData { + if isThreadGroup { + return &ioData{ioUsage: t.ThreadGroup()} + } + return &ioData{ioUsage: t} } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go new file mode 100644 index 000000000..93f0e1aa8 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -0,0 +1,272 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" +) + +// mm gets the kernel task's MemoryManager. No additional reference is taken on +// mm here. This is safe because MemoryManager.destroy is required to leave the +// MemoryManager in a state where it's still usable as a DynamicBytesSource. +func getMM(task *kernel.Task) *mm.MemoryManager { + var tmm *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + tmm = mm + } + }) + return tmm +} + +// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. +// +// +stateify savable +type mapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadMapsDataInto(ctx, buf) + } + return nil +} + +// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps. +// +// +stateify savable +type smapsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*smapsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if mm := getMM(d.task); mm != nil { + mm.ReadSmapsDataInto(ctx, buf) + } + return nil +} + +// +stateify savable +type taskStatData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*taskStatData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "(%s) ", s.t.Name()) + fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "%d ", ppid) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(buf, "0 " /* flags */) + fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + + // itrealvalue. Since kernel 2.6.17, this field is no longer + // maintained, and is hard coded as 0. + fmt.Fprintf(buf, "0 ") + + // Start time is relative to boot time, expressed in clock ticks. + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) + + // rsslim. + fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + + fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(buf, "%d ", terminationSignal) + fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(buf, "0\n" /* exit_code */) + + return nil +} + +// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm. +// +// +stateify savable +type statmData struct { + kernfs.DynamicBytesFile + + t *kernel.Task +} + +var _ dynamicInode = (*statmData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + + fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize) + return nil +} + +// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status. +// +// +stateify savable +type statusData struct { + kernfs.DynamicBytesFile + + t *kernel.Task + pidns *kernel.PIDNamespace +} + +var _ dynamicInode = (*statusData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss, data uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + data = mm.VirtualDataSize() + } + }) + fmt.Fprintf(buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) + fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + // We unconditionally report a single NUMA node. See + // pkg/sentry/syscalls/linux/sys_mempolicy.go. + fmt.Fprintf(buf, "Mems_allowed:\t1\n") + fmt.Fprintf(buf, "Mems_allowed_list:\t0\n") + return nil +} + +// ioUsage is the /proc//io and /proc//task//io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. + IOUsage() *usage.IO +} + +// +stateify savable +type ioData struct { + kernfs.DynamicBytesFile + + ioUsage +} + +var _ dynamicInode = (*ioData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + fmt.Fprintf(buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go new file mode 100644 index 000000000..50b2a832f --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -0,0 +1,162 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const defaultPermission = 0444 + +// InoGenerator generates unique inode numbers for a given filesystem. +type InoGenerator interface { + NextIno() uint64 +} + +// tasksInode represents the inode for /proc/ directory. +// +// +stateify savable +type tasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + + inoGen InoGenerator + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*tasksInode)(nil) + +func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { + root := auth.NewRootCredentials(pidns.UserNamespace()) + contents := map[string]*kernfs.Dentry{ + //"cpuinfo": newCPUInfo(ctx, msrc), + //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), + "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), + "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), + "self": newSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), + "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), + "thread-self": newThreadSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), + //"uptime": newUptime(ctx, msrc), + //"version": newVersionData(root, inoGen.NextIno(), k), + "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), + } + + inode := &tasksInode{ + pidns: pidns, + inoGen: inoGen, + } + inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, contents) + inode.IncLinks(links) + + return inode, dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + // Try to lookup a corresponding task. + tid, err := strconv.ParseUint(name, 10, 64) + if err != nil { + return nil, syserror.ENOENT + } + + task := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + + taskDentry := newTaskInode(i.inoGen, task, i.pidns, true) + return taskDentry.VFSDentry(), nil +} + +// Valid implements kernfs.inodeDynamicLookup. +func (i *tasksInode) Valid(ctx context.Context) bool { + return true +} + +// IterDirents implements kernfs.inodeDynamicLookup. +// +// TODO(gvisor.dev/issue/1195): Use tgid N offset = TGID_OFFSET + N. +func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + var tids []int + + // Collect all tasks. Per linux we only include it in directory listings if + // it's the leader. But for whatever crazy reason, you can still walk to the + // given node. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + tids = append(tids, int(i.pidns.IDOfThreadGroup(tg))) + } + } + + if len(tids) == 0 { + return offset, nil + } + if relOffset >= int64(len(tids)) { + return offset, nil + } + + sort.Ints(tids) + for _, tid := range tids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + return offset, nil +} + +// Open implements kernfs.Inode. +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { + stat := i.InodeAttrs.Stat(vsfs) + + // Add dynamic children to link count. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } + } + + return stat +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go new file mode 100644 index 000000000..91f30a798 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -0,0 +1,92 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +type selfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*selfSymlink)(nil) + +func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &selfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", syserror.ENOENT + } + return strconv.FormatUint(uint64(tgid), 10), nil +} + +type threadSelfSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + pidns *kernel.PIDNamespace +} + +var _ kernfs.Inode = (*threadSelfSymlink)(nil) + +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { + inode := &threadSelfSymlink{pidns: pidns} + inode.Init(creds, ino, linux.ModeSymlink|perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { + t := kernel.TaskFromContext(ctx) + if t == nil { + // Who is reading this link? + return "", syserror.EINVAL + } + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + tid := s.pidns.IDOfTask(t) + if tid == 0 || tgid == 0 { + return "", syserror.ENOENT + } + return fmt.Sprintf("%d/task/%d", tgid, tid), nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go new file mode 100644 index 000000000..48201d75a --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -0,0 +1,412 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "path" + "strconv" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type testIterDirentsCallback struct { + dirents []vfs.Dirent +} + +func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool { + t.dirents = append(t.dirents, d) + return true +} + +func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { + if got := len(dirs); got < 2 { + return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs) + } + for i, want := range []string{".", ".."} { + if got := dirs[i].Name; got != want { + return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got) + } + if got := dirs[i].Type; got != linux.DT_DIR { + return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got) + } + } + return dirs[2:], nil +} + +func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "loadavg": vfs.Dirent{Type: linux.DT_REG}, + "meminfo": vfs.Dirent{Type: linux.DT_REG}, + "mounts": vfs.Dirent{Type: linux.DT_LNK}, + "self": vfs.Dirent{Type: linux.DT_LNK}, + "stat": vfs.Dirent{Type: linux.DT_REG}, + "thread-self": vfs.Dirent{Type: linux.DT_LNK}, + "version": vfs.Dirent{Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { + wants := map[string]vfs.Dirent{ + "io": vfs.Dirent{Type: linux.DT_REG}, + "maps": vfs.Dirent{Type: linux.DT_REG}, + "smaps": vfs.Dirent{Type: linux.DT_REG}, + "stat": vfs.Dirent{Type: linux.DT_REG}, + "statm": vfs.Dirent{Type: linux.DT_REG}, + "status": vfs.Dirent{Type: linux.DT_REG}, + } + return checkFiles(gots, wants) +} + +func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) { + // Go over all files, when there is a match, the file is removed from both + // 'gots' and 'wants'. wants is expected to reach 0, as all files must + // be present. Remaining files in 'gots', is returned to caller to decide + // whether this is valid or not. + for i := 0; i < len(gots); i++ { + got := gots[i] + want, ok := wants[got.Name] + if !ok { + continue + } + if want.Type != got.Type { + return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got) + } + + delete(wants, got.Name) + gots = append(gots[0:i], gots[i+1:]...) + i-- + } + if len(wants) != 0 { + return gots, fmt.Errorf("not all files were found, missing: %+v", wants) + } + return gots, nil +} + +func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) { + k, err := boot() + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) + } + return ctx, vfsObj, mntns.Root(), nil +} + +func TestTasksEmpty(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Error("found more files than expected: %+v", cb.dirents) + } +} + +func TestTasks(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTasksStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + lastPid := 0 + for _, d := range cb.dirents { + pid, err := strconv.Atoi(d.Name) + if err != nil { + t.Fatalf("Invalid process directory %q", d.Name) + } + if lastPid > pid { + t.Errorf("pids not in order: %v", cb.dirents) + } + found := false + for _, t := range tasks { + if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) { + found = true + } + } + if !found { + t.Errorf("Additional task ID %d listed: %v", pid, tasks) + } + } + + // Test lookup. + for _, path := range []string{"/1", "/2"} { + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) + } + buf := make([]byte, 1) + bufIOSeq := usermem.BytesIOSequence(buf) + if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + t.Errorf("wrong error reading directory: %v", err) + } + } + + if _, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")}, + &vfs.OpenOptions{}, + ); err != syserror.ENOENT { + t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + } +} + +func TestTask(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + _, err = createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func TestProcSelf(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(ctx, "name", tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + + fd, err := vfsObj.OpenAt( + task, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + cb.dirents, err = checkTaskStaticFiles(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } +} + +func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) { + t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + var err error + cb.dirents, err = checkDots(cb.dirents) + if err != nil { + t.Error(err.Error()) + } + for _, d := range cb.dirents { + childPath := path.Join(fd.MappedName(ctx), d.Name) + if d.Type == linux.DT_LNK { + link, err := vfsObj.ReadlinkAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + ) + if err != nil { + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + } else { + t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + } + continue + } + + t.Logf("Opening: /proc%s", childPath) + child, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + continue + } + stat, err := child.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Errorf("Stat(%v) failed: %v", childPath, err) + } + if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { + t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) + } + if d.Type == linux.DT_DIR { + // Found another dir, let's do it again! + iterateDir(ctx, t, vfsObj, root, child) + } + } +} + +// TestTree iterates all directories and stats every file. +func TestTree(t *testing.T) { + uberCtx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(uberCtx) + var tasks []*kernel.Task + for i := 0; i < 5; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc) + if err != nil { + t.Fatalf("CreateTask(): %v", err) + } + tasks = append(tasks, task) + } + + ctx := tasks[0] + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(uberCtx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + iterateDir(ctx, t, vfsObj, root, fd) +} diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go index e1643d4e0..367f2396b 100644 --- a/pkg/sentry/fsimpl/proc/version.go +++ b/pkg/sentry/fsimpl/proc/version.go @@ -19,19 +19,21 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" ) // versionData implements vfs.DynamicBytesSource for /proc/version. // // +stateify savable type versionData struct { + kernfs.DynamicBytesFile + // k is the owning Kernel. k *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*versionData)(nil) +var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bd3fb4c03..8653d2f63 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -762,7 +762,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, mounts.IncRef() } - tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) // Get the root directory from the MountNamespace. @@ -1191,6 +1191,11 @@ func (k *Kernel) GlobalInit() *ThreadGroup { return k.globalInit } +// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace. +func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) { + k.globalInit = tg +} + // ApplicationCores returns the number of CPUs visible to sandboxed // applications. func (k *Kernel) ApplicationCores() uint { diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 3eadfedb4..5f3589493 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -243,7 +243,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { if opts.NewSignalHandlers { sh = sh.Fork() } - tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) } cfg := &TaskConfig{ diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 72568d296..0cded73f6 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -256,20 +256,20 @@ type ThreadGroup struct { tty *TTY } -// newThreadGroup returns a new, empty thread group in PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. -func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ - pidns: ns, + pidns: pidns, }, signalHandlers: sh, terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, - mounts: mounts, + mounts: mntns, } tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 3df49991c..de782e577 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -199,6 +199,17 @@ type DynamicBytesSource interface { Generate(ctx context.Context, buf *bytes.Buffer) error } +// StaticData implements DynamicBytesSource over a static string. +type StaticData struct { + Data string +} + +// Generate implements DynamicBytesSource. +func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(s.Data) + return nil +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data -- cgit v1.2.3 From 796f53c0befc21570b185811e26b74e71950dfc3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 27 Dec 2019 00:12:14 -0800 Subject: Add VFS2 support for /proc/filesystems. Updates #1195 PiperOrigin-RevId: 287269106 --- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 4 +- pkg/sentry/fsimpl/ext/ext_test.go | 4 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 4 +- pkg/sentry/fsimpl/memfs/benchmark_test.go | 8 +++- pkg/sentry/fsimpl/memfs/pipe_test.go | 4 +- pkg/sentry/fsimpl/proc/tasks_test.go | 30 +++++++------ pkg/sentry/vfs/file_description_impl_util_test.go | 2 +- pkg/sentry/vfs/filesystem_type.go | 55 ++++++++++++++++++++--- pkg/sentry/vfs/mount.go | 15 ++++--- pkg/sentry/vfs/options.go | 4 ++ pkg/sentry/vfs/vfs.go | 12 ++--- 11 files changed, 103 insertions(+), 39 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 2f46d2d13..a56b03711 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -50,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}) + vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 5d6c999bd..6c14a1e2d 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -66,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}) + vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}) if err != nil { f.Close() diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 3db12caa0..4b6b95f5f 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -59,7 +59,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := vfs.New() - v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}) + v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go index 6e987af88..a27876a4e 100644 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -176,7 +176,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) @@ -365,7 +367,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go index be917aeee..807c1af7a 100644 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ b/pkg/sentry/fsimpl/memfs/pipe_test.go @@ -152,7 +152,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}) + vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 48201d75a..2560fcef9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -56,25 +56,25 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ - "loadavg": vfs.Dirent{Type: linux.DT_REG}, - "meminfo": vfs.Dirent{Type: linux.DT_REG}, - "mounts": vfs.Dirent{Type: linux.DT_LNK}, - "self": vfs.Dirent{Type: linux.DT_LNK}, - "stat": vfs.Dirent{Type: linux.DT_REG}, - "thread-self": vfs.Dirent{Type: linux.DT_LNK}, - "version": vfs.Dirent{Type: linux.DT_REG}, + "loadavg": {Type: linux.DT_REG}, + "meminfo": {Type: linux.DT_REG}, + "mounts": {Type: linux.DT_LNK}, + "self": {Type: linux.DT_LNK}, + "stat": {Type: linux.DT_REG}, + "thread-self": {Type: linux.DT_LNK}, + "version": {Type: linux.DT_REG}, } return checkFiles(gots, wants) } func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ - "io": vfs.Dirent{Type: linux.DT_REG}, - "maps": vfs.Dirent{Type: linux.DT_REG}, - "smaps": vfs.Dirent{Type: linux.DT_REG}, - "stat": vfs.Dirent{Type: linux.DT_REG}, - "statm": vfs.Dirent{Type: linux.DT_REG}, - "status": vfs.Dirent{Type: linux.DT_REG}, + "io": {Type: linux.DT_REG}, + "maps": {Type: linux.DT_REG}, + "smaps": {Type: linux.DT_REG}, + "stat": {Type: linux.DT_REG}, + "statm": {Type: linux.DT_REG}, + "status": {Type: linux.DT_REG}, } return checkFiles(gots, wants) } @@ -114,7 +114,9 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) creds := auth.CredentialsFromContext(ctx) vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}) + vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) if err != nil { return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 678be07fe..9ed58512f 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) { creds := auth.CredentialsFromContext(ctx) vfsObj := New() // vfs.New() - vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}) + vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{}) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{}) if err != nil { t.Fatalf("failed to create testfs root mount: %v", err) diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c335e206d..023301780 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -15,6 +15,7 @@ package vfs import ( + "bytes" "fmt" "gvisor.dev/gvisor/pkg/sentry/context" @@ -43,28 +44,70 @@ type GetFilesystemOptions struct { InternalData interface{} } +type registeredFilesystemType struct { + fsType FilesystemType + opts RegisterFilesystemTypeOptions +} + +// RegisterFilesystemTypeOptions contains options to +// VirtualFilesystem.RegisterFilesystem(). +type RegisterFilesystemTypeOptions struct { + // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt() + // for which MountOptions.InternalMount == false to use this filesystem + // type. + AllowUserMount bool + + // If AllowUserList is true, make this filesystem type visible in + // /proc/filesystems. + AllowUserList bool + + // If RequiresDevice is true, indicate that mounting this filesystem + // requires a block device as the mount source in /proc/filesystems. + RequiresDevice bool +} + // RegisterFilesystemType registers the given FilesystemType in vfs with the // given name. -func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error { vfs.fsTypesMu.Lock() defer vfs.fsTypesMu.Unlock() if existing, ok := vfs.fsTypes[name]; ok { - return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType) + } + vfs.fsTypes[name] = ®isteredFilesystemType{ + fsType: fsType, + opts: *opts, } - vfs.fsTypes[name] = fsType return nil } // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but // panics on failure. -func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { - if err := vfs.RegisterFilesystemType(name, fsType); err != nil { +func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) { + if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil { panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) } } -func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { +func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() return vfs.fsTypes[name] } + +// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to +// buf. +func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + for name, rft := range vfs.fsTypes { + if !rft.opts.AllowUserList { + continue + } + var nodev string + if !rft.opts.RequiresDevice { + nodev = "nodev" + } + fmt.Fprintf(buf, "%s\t%s\n", nodev, name) + } +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index ec23ab0dd..00177b371 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -112,11 +112,11 @@ type MountNamespace struct { // configured by the given arguments. A reference is taken on the returned // MountNamespace. func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { return nil, syserror.ENODEV } - fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts) + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) if err != nil { return nil, err } @@ -136,11 +136,14 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth // MountAt creates and mounts a Filesystem configured by the given arguments. func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error { - fsType := vfs.getFilesystemType(fsTypeName) - if fsType == nil { + rft := vfs.getFilesystemType(fsTypeName) + if rft == nil { return syserror.ENODEV } - fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) + if !opts.InternalMount && !rft.opts.AllowUserMount { + return syserror.ENODEV + } + fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) if err != nil { return err } diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 87d2b0d1c..b7774bf28 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -50,6 +50,10 @@ type MknodOptions struct { type MountOptions struct { // GetFilesystemOptions contains options to FilesystemType.GetFilesystem(). GetFilesystemOptions GetFilesystemOptions + + // If InternalMount is true, allow the use of filesystem types for which + // RegisterFilesystemTypeOptions.AllowUserMount == false. + InternalMount bool } // OpenOptions contains options to VirtualFilesystem.OpenAt() and diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 3e4df8558..a3bdb5805 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -75,23 +75,23 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // fsTypes contains all registered FilesystemTypes. fsTypes is protected by + // fsTypesMu. + fsTypesMu sync.RWMutex + fsTypes map[string]*registeredFilesystemType + // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. filesystemsMu sync.Mutex filesystems map[*Filesystem]struct{} - - // fsTypes contains all FilesystemTypes that are usable in the - // VirtualFilesystem. fsTypes is protected by fsTypesMu. - fsTypesMu sync.RWMutex - fsTypes map[string]FilesystemType } // New returns a new VirtualFilesystem with no mounts or FilesystemTypes. func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), + fsTypes: make(map[string]*registeredFilesystemType), filesystems: make(map[*Filesystem]struct{}), - fsTypes: make(map[string]FilesystemType), } vfs.mounts.Init() return vfs -- cgit v1.2.3 From 1f384ac42b9ee8b52000dc2bff79d975853519ed Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 30 Dec 2019 11:35:06 -0800 Subject: Add VFS2 support for device special files. - Add FileDescriptionOptions.UseDentryMetadata, which reduces the amount of boilerplate needed for device FDs and the like between filesystems. - Switch back to having FileDescription.Init() take references on the Mount and Dentry; otherwise managing refcounts around failed calls to OpenDeviceSpecialFile() / Device.Open() is tricky. PiperOrigin-RevId: 287575574 --- pkg/sentry/fsimpl/ext/inode.go | 6 -- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 - pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 - pkg/sentry/fsimpl/memfs/filesystem.go | 4 - pkg/sentry/fsimpl/memfs/named_pipe.go | 2 - pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/device.go | 100 ++++++++++++++++++++++++ pkg/sentry/vfs/file_description.go | 101 +++++++++++++++++++++++-- pkg/sentry/vfs/file_description_impl_util.go | 15 ++++ pkg/sentry/vfs/filesystem.go | 21 +++++ pkg/sentry/vfs/vfs.go | 6 ++ 11 files changed, 236 insertions(+), 24 deletions(-) create mode 100644 pkg/sentry/vfs/device.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index b2cc826c7..8608805bf 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,8 +157,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *directory: @@ -168,8 +166,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: @@ -178,8 +174,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.ELOOP } var fd symlinkFD - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c5fe65722..606ca692d 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -81,8 +81,6 @@ type DynamicBytesFD struct { // Init initializes a DynamicBytesFD. func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { - m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 77975583b..bcf069b5f 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -44,8 +44,6 @@ type GenericDirectoryFD struct { // Init initializes a GenericDirectoryFD. func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { - m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. - d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref. fd.children = children fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) } diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go index 4a83f310c..b063e09a3 100644 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -348,8 +348,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, } // mnt.EndWrite() is called by regularFileFD.Release(). } - mnt.IncRef() - d.IncRef() fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() @@ -364,8 +362,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - mnt.IncRef() - d.IncRef() fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil case *symlink: diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go index d5060850e..b5a204438 100644 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ b/pkg/sentry/fsimpl/memfs/named_pipe.go @@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v return nil, err } mnt := rp.Mount() - mnt.IncRef() - vfsd.IncRef() fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index e3e554b88..4c6aa04a1 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -9,6 +9,7 @@ go_library( "context.go", "debug.go", "dentry.go", + "device.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go new file mode 100644 index 000000000..cb672e36f --- /dev/null +++ b/pkg/sentry/vfs/device.go @@ -0,0 +1,100 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// DeviceKind indicates whether a device is a block or character device. +type DeviceKind uint32 + +const ( + // BlockDevice indicates a block device. + BlockDevice DeviceKind = iota + + // CharDevice indicates a character device. + CharDevice +) + +// String implements fmt.Stringer.String. +func (kind DeviceKind) String() string { + switch kind { + case BlockDevice: + return "block" + case CharDevice: + return "character" + default: + return fmt.Sprintf("invalid device kind %d", kind) + } +} + +type devTuple struct { + kind DeviceKind + major uint32 + minor uint32 +} + +// A Device backs device special files. +type Device interface { + // Open returns a FileDescription representing this device. + Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) +} + +type registeredDevice struct { + dev Device + opts RegisterDeviceOptions +} + +// RegisterDeviceOptions contains options to +// VirtualFilesystem.RegisterDevice(). +type RegisterDeviceOptions struct { + // GroupName is the name shown for this device registration in + // /proc/devices. If GroupName is empty, this registration will not be + // shown in /proc/devices. + GroupName string +} + +// RegisterDevice registers the given Device in vfs with the given major and +// minor device numbers. +func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error { + tup := devTuple{kind, major, minor} + vfs.devicesMu.Lock() + defer vfs.devicesMu.Unlock() + if existing, ok := vfs.devices[tup]; ok { + return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev) + } + vfs.devices[tup] = ®isteredDevice{ + dev: dev, + opts: *opts, + } + return nil +} + +// OpenDeviceSpecialFile returns a FileDescription representing the given +// device. +func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) { + tup := devTuple{kind, major, minor} + vfs.devicesMu.RLock() + defer vfs.devicesMu.RUnlock() + rd, ok := vfs.devices[tup] + if !ok { + return nil, syserror.ENXIO + } + return rd.dev.Open(ctx, mnt, d, *opts) +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 0b053201a..6afe280bc 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -61,11 +61,25 @@ type FileDescriptionOptions struct { // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is // usually only the case if O_DIRECT would actually have an effect. AllowDirectIO bool + + // If UseDentryMetadata is true, calls to FileDescription methods that + // interact with file and filesystem metadata (Stat, SetStat, StatFS, + // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling + // the corresponding FilesystemImpl methods instead of the corresponding + // FileDescriptionImpl methods. + // + // UseDentryMetadata is intended for file descriptions that are implemented + // outside of individual filesystems, such as pipes, sockets, and device + // special files. FileDescriptions for which UseDentryMetadata is true may + // embed DentryMetadataFileDescriptionImpl to obtain appropriate + // implementations of FileDescriptionImpl methods that should not be + // called. + UseDentryMetadata bool } -// Init must be called before first use of fd. It takes ownership of references -// on mnt and d held by the caller. statusFlags is the initial file description -// status flags, which is usually the full set of flags passed to open(2). +// Init must be called before first use of fd. It takes references on mnt and +// d. statusFlags is the initial file description status flags, which is +// usually the full set of flags passed to open(2). func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { fd.refs = 1 fd.statusFlags = statusFlags | linux.O_LARGEFILE @@ -73,6 +87,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn mount: mnt, dentry: d, } + fd.vd.IncRef() fd.opts = *opts fd.impl = impl } @@ -140,7 +155,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede // sense. However, the check as actually implemented seems to be "O_APPEND // cannot be changed if the file is marked as append-only". if (flags^oldFlags)&linux.O_APPEND != 0 { - stat, err := fd.impl.Stat(ctx, StatOptions{ + stat, err := fd.Stat(ctx, StatOptions{ // There is no mask bit for stx_attributes. Mask: 0, // Linux just reads inode::i_flags directly. @@ -154,7 +169,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { - stat, err := fd.impl.Stat(ctx, StatOptions{ + stat, err := fd.Stat(ctx, StatOptions{ Mask: linux.STATX_UID, // Linux's inode_owner_or_capable() just reads inode::i_uid // directly. @@ -348,17 +363,47 @@ func (fd *FileDescription) OnClose(ctx context.Context) error { // Stat returns metadata for the file represented by fd. func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return stat, err + } return fd.impl.Stat(ctx, opts) } // SetStat updates metadata for the file represented by fd. func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.SetStat(ctx, opts) } // StatFS returns metadata for the filesystem containing the file represented // by fd. func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return statfs, err + } return fd.impl.StatFS(ctx) } @@ -417,6 +462,16 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. // Listxattr returns all extended attribute names for the file represented by // fd. func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp) + vfsObj.putResolvingPath(rp) + return names, err + } names, err := fd.impl.Listxattr(ctx) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, @@ -431,18 +486,48 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { // Getxattr returns the value associated with the given extended attribute for // the file represented by fd. func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return val, err + } return fd.impl.Getxattr(ctx, name) } // Setxattr changes the value associated with the given extended attribute for // the file represented by fd. func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.Setxattr(ctx, opts) } // Removexattr removes the given extended attribute from the file represented // by fd. func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { + if fd.opts.UseDentryMetadata { + vfsObj := fd.vd.mount.vfs + rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ + Root: fd.vd, + Start: fd.vd, + }) + err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) + vfsObj.putResolvingPath(rp) + return err + } return fd.impl.Removexattr(ctx, name) } @@ -464,7 +549,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { // DeviceID implements memmap.MappingIdentity.DeviceID. func (fd *FileDescription) DeviceID() uint64 { - stat, err := fd.impl.Stat(context.Background(), StatOptions{ + stat, err := fd.Stat(context.Background(), StatOptions{ // There is no STATX_DEV; we assume that Stat will return it if it's // available regardless of mask. Mask: 0, @@ -480,7 +565,7 @@ func (fd *FileDescription) DeviceID() uint64 { // InodeID implements memmap.MappingIdentity.InodeID. func (fd *FileDescription) InodeID() uint64 { - stat, err := fd.impl.Stat(context.Background(), StatOptions{ + stat, err := fd.Stat(context.Background(), StatOptions{ Mask: linux.STATX_INO, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. Sync: linux.AT_STATX_DONT_SYNC, @@ -493,5 +578,5 @@ func (fd *FileDescription) InodeID() uint64 { // Msync implements memmap.MappingIdentity.Msync. func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { - return fd.impl.Sync(ctx) + return fd.Sync(ctx) } diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index de782e577..66eb57bc2 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -177,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme return 0, syserror.EISDIR } +// DentryMetadataFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is +// true to obtain implementations of Stat and SetStat that panic. +type DentryMetadataFileDescriptionImpl struct{} + +// Stat implements FileDescriptionImpl.Stat. +func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { + panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") +} + +// SetStat implements FileDescriptionImpl.SetStat. +func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { + panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") +} + // DynamicBytesFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl that represent read-only regular files whose contents // are backed by a bytes.Buffer that is regenerated when necessary, consistent diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 89bd58864..ea78f555b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -418,17 +418,38 @@ type FilesystemImpl interface { UnlinkAt(ctx context.Context, rp *ResolvingPath) error // ListxattrAt returns all extended attribute names for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // ListxattrAt returns nil. (See FileDescription.Listxattr for an + // explanation.) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) // GetxattrAt returns the value associated with the given extended // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, GetxattrAt + // returns ENOTSUP. GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) // SetxattrAt changes the value associated with the given extended // attribute for the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, SetxattrAt + // returns ENOTSUP. SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error // RemovexattrAt removes the given extended attribute from the file at rp. + // + // Errors: + // + // - If extended attributes are not supported by the filesystem, + // RemovexattrAt returns ENOTSUP. RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error // PrependPath prepends a path from vd to vd.Mount().Root() to b. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index a3bdb5805..ea2db7031 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -75,6 +75,11 @@ type VirtualFilesystem struct { // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} + // devices contains all registered Devices. devices is protected by + // devicesMu. + devicesMu sync.RWMutex + devices map[devTuple]*registeredDevice + // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. fsTypesMu sync.RWMutex @@ -90,6 +95,7 @@ type VirtualFilesystem struct { func New() *VirtualFilesystem { vfs := &VirtualFilesystem{ mountpoints: make(map[*Dentry]map[*Mount]struct{}), + devices: make(map[devTuple]*registeredDevice), fsTypes: make(map[string]*registeredFilesystemType), filesystems: make(map[*Filesystem]struct{}), } -- cgit v1.2.3 From 51f3ab85e024fcd74c49d273ce5202a207577d31 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 6 Jan 2020 12:51:35 -0800 Subject: Convert memfs into proto-tmpfs. - Renamed memfs to tmpfs. - Copied fileRangeSet bits from fs/fsutil/ to fsimpl/tmpfs/ - Changed tmpfs to be backed by filemem instead of byte slice. - regularFileReadWriter uses a sync.Pool, similar to gofer client. PiperOrigin-RevId: 288356380 --- pkg/sentry/fs/fsutil/BUILD | 2 +- pkg/sentry/fs/fsutil/file_range_set.go | 14 +- pkg/sentry/fsimpl/memfs/BUILD | 80 --- pkg/sentry/fsimpl/memfs/benchmark_test.go | 487 ------------------- pkg/sentry/fsimpl/memfs/directory.go | 187 ------- pkg/sentry/fsimpl/memfs/filesystem.go | 698 --------------------------- pkg/sentry/fsimpl/memfs/memfs.go | 293 ----------- pkg/sentry/fsimpl/memfs/named_pipe.go | 60 --- pkg/sentry/fsimpl/memfs/pipe_test.go | 235 --------- pkg/sentry/fsimpl/memfs/regular_file.go | 154 ------ pkg/sentry/fsimpl/memfs/symlink.go | 36 -- pkg/sentry/fsimpl/tmpfs/BUILD | 92 ++++ pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 487 +++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/directory.go | 187 +++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 698 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/named_pipe.go | 60 +++ pkg/sentry/fsimpl/tmpfs/pipe_test.go | 235 +++++++++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 357 ++++++++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 224 +++++++++ pkg/sentry/fsimpl/tmpfs/symlink.go | 36 ++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 299 ++++++++++++ 21 files changed, 2683 insertions(+), 2238 deletions(-) delete mode 100644 pkg/sentry/fsimpl/memfs/BUILD delete mode 100644 pkg/sentry/fsimpl/memfs/benchmark_test.go delete mode 100644 pkg/sentry/fsimpl/memfs/directory.go delete mode 100644 pkg/sentry/fsimpl/memfs/filesystem.go delete mode 100644 pkg/sentry/fsimpl/memfs/memfs.go delete mode 100644 pkg/sentry/fsimpl/memfs/named_pipe.go delete mode 100644 pkg/sentry/fsimpl/memfs/pipe_test.go delete mode 100644 pkg/sentry/fsimpl/memfs/regular_file.go delete mode 100644 pkg/sentry/fsimpl/memfs/symlink.go create mode 100644 pkg/sentry/fsimpl/tmpfs/BUILD create mode 100644 pkg/sentry/fsimpl/tmpfs/benchmark_test.go create mode 100644 pkg/sentry/fsimpl/tmpfs/directory.go create mode 100644 pkg/sentry/fsimpl/tmpfs/filesystem.go create mode 100644 pkg/sentry/fsimpl/tmpfs/named_pipe.go create mode 100644 pkg/sentry/fsimpl/tmpfs/pipe_test.go create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file.go create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file_test.go create mode 100644 pkg/sentry/fsimpl/tmpfs/symlink.go create mode 100644 pkg/sentry/fsimpl/tmpfs/tmpfs.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index b2e8d9c77..9ca695a95 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -53,7 +53,7 @@ go_template_instance( "Key": "uint64", "Range": "memmap.MappableRange", "Value": "uint64", - "Functions": "fileRangeSetFunctions", + "Functions": "FileRangeSetFunctions", }, ) diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 0a5466b0a..f52d712e3 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -34,25 +34,25 @@ import ( // // type FileRangeSet -// fileRangeSetFunctions implements segment.Functions for FileRangeSet. -type fileRangeSetFunctions struct{} +// FileRangeSetFunctions implements segment.Functions for FileRangeSet. +type FileRangeSetFunctions struct{} // MinKey implements segment.Functions.MinKey. -func (fileRangeSetFunctions) MinKey() uint64 { +func (FileRangeSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. -func (fileRangeSetFunctions) MaxKey() uint64 { +func (FileRangeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. -func (fileRangeSetFunctions) ClearValue(_ *uint64) { +func (FileRangeSetFunctions) ClearValue(_ *uint64) { } // Merge implements segment.Functions.Merge. -func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { +func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { if frstart1+mr1.Length() != frstart2 { return 0, false } @@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ } // Split implements segment.Functions.Split. -func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { +func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { return frstart, frstart + (split - mr.Start) } diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD deleted file mode 100644 index 5689bed3b..000000000 --- a/pkg/sentry/fsimpl/memfs/BUILD +++ /dev/null @@ -1,80 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") - -package(licenses = ["notice"]) - -load("//tools/go_generics:defs.bzl", "go_template_instance") - -go_template_instance( - name = "dentry_list", - out = "dentry_list.go", - package = "memfs", - prefix = "dentry", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*dentry", - "Linker": "*dentry", - }, -) - -go_library( - name = "memfs", - srcs = [ - "dentry_list.go", - "directory.go", - "filesystem.go", - "memfs.go", - "named_pipe.go", - "regular_file.go", - "symlink.go", - ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs", - deps = [ - "//pkg/abi/linux", - "//pkg/amutex", - "//pkg/fspath", - "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/pipe", - "//pkg/sentry/usermem", - "//pkg/sentry/vfs", - "//pkg/syserror", - ], -) - -go_test( - name = "benchmark_test", - size = "small", - srcs = ["benchmark_test.go"], - deps = [ - ":memfs", - "//pkg/abi/linux", - "//pkg/fspath", - "//pkg/refs", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/fs", - "//pkg/sentry/fs/tmpfs", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/vfs", - "//pkg/syserror", - ], -) - -go_test( - name = "memfs_test", - size = "small", - srcs = ["pipe_test.go"], - embed = [":memfs"], - deps = [ - "//pkg/abi/linux", - "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", - "//pkg/sentry/vfs", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go deleted file mode 100644 index a27876a4e..000000000 --- a/pkg/sentry/fsimpl/memfs/benchmark_test.go +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package benchmark_test - -import ( - "fmt" - "runtime" - "strings" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" - _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// Differences from stat_benchmark: -// -// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are -// not included. -// -// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp. -// Non-MountStat benchmarks use a tmpfs root mount and no submounts. -// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a -// subdirectory /tmp/ (assuming TEST_TMPDIR == "/tmp"). Thus -// stat_benchmark at depth 1 does a comparable amount of work to *MountStat -// benchmarks at depth 2, and non-MountStat benchmarks at depth 3. -var depths = []int{1, 2, 3, 8, 64, 100} - -const ( - mountPointName = "tmp" - filename = "gvisor_test_temp_0_1557494568" -) - -// This is copied from syscalls/linux/sys_file.go, with the dependency on -// kernel.Task stripped out. -func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { - var ( - d *fs.Dirent // The file. - rel *fs.Dirent // The relative directory for search (if required.) - err error - ) - - // Extract the working directory (maybe). - if len(path) > 0 && path[0] == '/' { - // Absolute path; rel can be nil. - } else if dirFD == linux.AT_FDCWD { - // Need to reference the working directory. - rel = wd - } else { - // Need to extract the given FD. - return syserror.EBADF - } - - // Lookup the node. - remainingTraversals := uint(linux.MaxSymlinkTraversals) - if resolve { - d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals) - } else { - d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals) - } - if err != nil { - return err - } - - err = fn(root, d) - d.DecRef() - return err -} - -func BenchmarkVFS1TmpfsStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx := contexttest.Context(b) - - // Create VFS. - tmpfsFS, ok := fs.FindFilesystem("tmpfs") - if !ok { - b.Fatalf("failed to find tmpfs filesystem type") - } - rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) - if err != nil { - b.Fatalf("failed to create tmpfs root mount: %v", err) - } - mntns, err := fs.NewMountNamespace(ctx, rootInode) - if err != nil { - b.Fatalf("failed to create mount namespace: %v", err) - } - defer mntns.DecRef() - - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - - // Create nested directories with given depth. - root := mntns.Root() - defer root.DecRef() - d := root - d.IncRef() - defer d.DecRef() - for i := depth; i > 0; i-- { - name := fmt.Sprintf("%d", i) - if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { - b.Fatalf("failed to create directory %q: %v", name, err) - } - next, err := d.Walk(ctx, root, name) - if err != nil { - b.Fatalf("failed to walk to directory %q: %v", name, err) - } - d.DecRef() - d = next - filePathBuilder.WriteString(name) - filePathBuilder.WriteByte('/') - } - - // Create the file that will be stat'd. - file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) - if err != nil { - b.Fatalf("failed to create file %q: %v", filename, err) - } - file.DecRef() - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - dirPath := false - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { - if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR - } - uattr, err := d.Inode.UnstableAttr(ctx) - if err != nil { - return err - } - // Sanity check. - if uattr.Perms.User.Execute { - b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) - } - return nil - }) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - } - // Don't include deferred cleanup in benchmark time. - b.StopTimer() - }) - } -} - -func BenchmarkVFS2MemfsStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) - if err != nil { - b.Fatalf("failed to create tmpfs root mount: %v", err) - } - defer mntns.DecRef(vfsObj) - - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - - // Create nested directories with given depth. - root := mntns.Root() - defer root.DecRef() - vd := root - vd.IncRef() - for i := depth; i > 0; i-- { - name := fmt.Sprintf("%d", i) - pop := vfs.PathOperation{ - Root: root, - Start: vd, - Path: fspath.Parse(name), - } - if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ - Mode: 0755, - }); err != nil { - b.Fatalf("failed to create directory %q: %v", name, err) - } - nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to directory %q: %v", name, err) - } - vd.DecRef() - vd = nextVD - filePathBuilder.WriteString(name) - filePathBuilder.WriteByte('/') - } - - // Create the file that will be stat'd. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: vd, - Path: fspath.Parse(filename), - FollowFinalSymlink: true, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: 0644, - }) - vd.DecRef() - vd = vfs.VirtualDentry{} - if err != nil { - b.Fatalf("failed to create file %q: %v", filename, err) - } - defer fd.DecRef() - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. - if stat.Mode&^linux.S_IFMT != 0644 { - b.Fatalf("got wrong permissions (%0o)", stat.Mode) - } - } - // Don't include deferred cleanup in benchmark time. - b.StopTimer() - }) - } -} - -func BenchmarkVFS1TmpfsMountStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx := contexttest.Context(b) - - // Create VFS. - tmpfsFS, ok := fs.FindFilesystem("tmpfs") - if !ok { - b.Fatalf("failed to find tmpfs filesystem type") - } - rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) - if err != nil { - b.Fatalf("failed to create tmpfs root mount: %v", err) - } - mntns, err := fs.NewMountNamespace(ctx, rootInode) - if err != nil { - b.Fatalf("failed to create mount namespace: %v", err) - } - defer mntns.DecRef() - - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - - // Create and mount the submount. - root := mntns.Root() - defer root.DecRef() - if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { - b.Fatalf("failed to create mount point: %v", err) - } - mountPoint, err := root.Walk(ctx, root, mountPointName) - if err != nil { - b.Fatalf("failed to walk to mount point: %v", err) - } - defer mountPoint.DecRef() - submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) - if err != nil { - b.Fatalf("failed to create tmpfs submount: %v", err) - } - if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil { - b.Fatalf("failed to mount tmpfs submount: %v", err) - } - filePathBuilder.WriteString(mountPointName) - filePathBuilder.WriteByte('/') - - // Create nested directories with given depth. - d, err := root.Walk(ctx, root, mountPointName) - if err != nil { - b.Fatalf("failed to walk to mount root: %v", err) - } - defer d.DecRef() - for i := depth; i > 0; i-- { - name := fmt.Sprintf("%d", i) - if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { - b.Fatalf("failed to create directory %q: %v", name, err) - } - next, err := d.Walk(ctx, root, name) - if err != nil { - b.Fatalf("failed to walk to directory %q: %v", name, err) - } - d.DecRef() - d = next - filePathBuilder.WriteString(name) - filePathBuilder.WriteByte('/') - } - - // Create the file that will be stat'd. - file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) - if err != nil { - b.Fatalf("failed to create file %q: %v", filename, err) - } - file.DecRef() - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - dirPath := false - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { - if dirPath && !fs.IsDir(d.Inode.StableAttr) { - return syserror.ENOTDIR - } - uattr, err := d.Inode.UnstableAttr(ctx) - if err != nil { - return err - } - // Sanity check. - if uattr.Perms.User.Execute { - b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) - } - return nil - }) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - } - // Don't include deferred cleanup in benchmark time. - b.StopTimer() - }) - } -} - -func BenchmarkVFS2MemfsMountStat(b *testing.B) { - for _, depth := range depths { - b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { - ctx := contexttest.Context(b) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) - if err != nil { - b.Fatalf("failed to create tmpfs root mount: %v", err) - } - defer mntns.DecRef(vfsObj) - - var filePathBuilder strings.Builder - filePathBuilder.WriteByte('/') - - // Create the mount point. - root := mntns.Root() - defer root.DecRef() - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(mountPointName), - } - if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ - Mode: 0755, - }); err != nil { - b.Fatalf("failed to create mount point: %v", err) - } - // Save the mount point for later use. - mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to mount point: %v", err) - } - defer mountPoint.DecRef() - // Create and mount the submount. - if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil { - b.Fatalf("failed to mount tmpfs submount: %v", err) - } - filePathBuilder.WriteString(mountPointName) - filePathBuilder.WriteByte('/') - - // Create nested directories with given depth. - vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to mount root: %v", err) - } - for i := depth; i > 0; i-- { - name := fmt.Sprintf("%d", i) - pop := vfs.PathOperation{ - Root: root, - Start: vd, - Path: fspath.Parse(name), - } - if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ - Mode: 0755, - }); err != nil { - b.Fatalf("failed to create directory %q: %v", name, err) - } - nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - b.Fatalf("failed to walk to directory %q: %v", name, err) - } - vd.DecRef() - vd = nextVD - filePathBuilder.WriteString(name) - filePathBuilder.WriteByte('/') - } - - // Verify that we didn't create any directories under the mount - // point (i.e. they were all created on the submount). - firstDirName := fmt.Sprintf("%d", depth) - if child := mountPoint.Dentry().Child(firstDirName); child != nil { - b.Fatalf("created directory %q under root mount, not submount", firstDirName) - } - - // Create the file that will be stat'd. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: vd, - Path: fspath.Parse(filename), - FollowFinalSymlink: true, - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: 0644, - }) - vd.DecRef() - if err != nil { - b.Fatalf("failed to create file %q: %v", filename, err) - } - fd.DecRef() - filePathBuilder.WriteString(filename) - filePath := filePathBuilder.String() - - runtime.GC() - b.ResetTimer() - for i := 0; i < b.N; i++ { - stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filePath), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - b.Fatalf("stat(%q) failed: %v", filePath, err) - } - // Sanity check. - if stat.Mode&^linux.S_IFMT != 0644 { - b.Fatalf("got wrong permissions (%0o)", stat.Mode) - } - } - // Don't include deferred cleanup in benchmark time. - b.StopTimer() - }) - } -} - -func init() { - // Turn off reference leak checking for a fair comparison between vfs1 and - // vfs2. - refs.SetLeakMode(refs.NoLeakChecking) -} diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go deleted file mode 100644 index 0bd82e480..000000000 --- a/pkg/sentry/fsimpl/memfs/directory.go +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -type directory struct { - inode inode - - // childList is a list containing (1) child Dentries and (2) fake Dentries - // (with inode == nil) that represent the iteration position of - // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by filesystem.mu. - childList dentryList -} - -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { - dir := &directory{} - dir.inode.init(dir, fs, creds, mode) - dir.inode.nlink = 2 // from "." and parent directory or ".." for root - return &dir.inode -} - -func (i *inode) isDir() bool { - _, ok := i.impl.(*directory) - return ok -} - -type directoryFD struct { - fileDescription - vfs.DirectoryFileDescriptionDefaultImpl - - // Protected by filesystem.mu. - iter *dentry - off int64 -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { - if fd.iter != nil { - fs := fd.filesystem() - dir := fd.inode().impl.(*directory) - fs.mu.Lock() - dir.childList.Remove(fd.iter) - fs.mu.Unlock() - fd.iter = nil - } -} - -// IterDirents implements vfs.FileDescriptionImpl.IterDirents. -func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - fs := fd.filesystem() - vfsd := fd.vfsfd.VirtualDentry().Dentry() - - fs.mu.Lock() - defer fs.mu.Unlock() - - if fd.off == 0 { - if !cb.Handle(vfs.Dirent{ - Name: ".", - Type: linux.DT_DIR, - Ino: vfsd.Impl().(*dentry).inode.ino, - NextOff: 1, - }) { - return nil - } - fd.off++ - } - if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode - if !cb.Handle(vfs.Dirent{ - Name: "..", - Type: parentInode.direntType(), - Ino: parentInode.ino, - NextOff: 2, - }) { - return nil - } - fd.off++ - } - - dir := vfsd.Impl().(*dentry).inode.impl.(*directory) - var child *dentry - if fd.iter == nil { - // Start iteration at the beginning of dir. - child = dir.childList.Front() - fd.iter = &dentry{} - } else { - // Continue iteration from where we left off. - child = fd.iter.Next() - dir.childList.Remove(fd.iter) - } - for child != nil { - // Skip other directoryFD iterators. - if child.inode != nil { - if !cb.Handle(vfs.Dirent{ - Name: child.vfsd.Name(), - Type: child.inode.direntType(), - Ino: child.inode.ino, - NextOff: fd.off + 1, - }) { - dir.childList.InsertBefore(child, fd.iter) - return nil - } - fd.off++ - } - child = child.Next() - } - dir.childList.PushBack(fd.iter) - return nil -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem() - fs.mu.Lock() - defer fs.mu.Unlock() - - switch whence { - case linux.SEEK_SET: - // Use offset as given. - case linux.SEEK_CUR: - offset += fd.off - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - - // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't - // seek even if doing so might reposition the iterator due to concurrent - // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). - if fd.off == offset { - return offset, nil - } - - fd.off = offset - // Compensate for "." and "..". - remChildren := int64(0) - if offset >= 2 { - remChildren = offset - 2 - } - - dir := fd.inode().impl.(*directory) - - // Ensure that fd.iter exists and is not linked into dir.childList. - if fd.iter == nil { - fd.iter = &dentry{} - } else { - dir.childList.Remove(fd.iter) - } - // Insert fd.iter before the remChildren'th child, or at the end of the - // list if remChildren >= number of children. - child := dir.childList.Front() - for child != nil { - // Skip other directoryFD iterators. - if child.inode != nil { - if remChildren == 0 { - dir.childList.InsertBefore(child, fd.iter) - return offset, nil - } - remChildren-- - } - child = child.Next() - } - dir.childList.PushBack(fd.iter) - return offset, nil -} diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go deleted file mode 100644 index b063e09a3..000000000 --- a/pkg/sentry/fsimpl/memfs/filesystem.go +++ /dev/null @@ -1,698 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "fmt" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// Sync implements vfs.FilesystemImpl.Sync. -func (fs *filesystem) Sync(ctx context.Context) error { - // All filesystem state is in-memory. - return nil -} - -// stepLocked resolves rp.Component() to an existing file, starting from the -// given directory. -// -// stepLocked is loosely analogous to fs/namei.c:walk_component(). -// -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { - if !d.inode.isDir() { - return nil, syserror.ENOTDIR - } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } -afterSymlink: - nextVFSD, err := rp.ResolveComponent(&d.vfsd) - if err != nil { - return nil, err - } - if nextVFSD == nil { - // Since the Dentry tree is the sole source of truth for memfs, if it's - // not in the Dentry tree, it doesn't exist. - return nil, syserror.ENOENT - } - next := nextVFSD.Impl().(*dentry) - if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time - if err := rp.HandleSymlink(symlink.target); err != nil { - return nil, err - } - goto afterSymlink // don't check the current directory again - } - rp.Advance() - return next, nil -} - -// walkParentDirLocked resolves all but the last path component of rp to an -// existing directory, starting from the given directory (which is usually -// rp.Start().Impl().(*dentry)). It does not check that the returned directory -// is searchable by the provider of rp. -// -// walkParentDirLocked is loosely analogous to Linux's -// fs/namei.c:path_parentat(). -// -// Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { - for !rp.Final() { - next, err := stepLocked(rp, d) - if err != nil { - return nil, err - } - d = next - } - if !d.inode.isDir() { - return nil, syserror.ENOTDIR - } - return d, nil -} - -// resolveLocked resolves rp to an existing file. -// -// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). -// -// Preconditions: filesystem.mu must be locked. -func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { - d := rp.Start().Impl().(*dentry) - for !rp.Done() { - next, err := stepLocked(rp, d) - if err != nil { - return nil, err - } - d = next - } - if rp.MustBeDir() && !d.inode.isDir() { - return nil, syserror.ENOTDIR - } - return d, nil -} - -// doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. -// -// doCreateAt is loosely analogous to a conjunction of Linux's -// fs/namei.c:filename_create() and done_path_create(). -// -// Preconditions: !rp.Done(). For the final path component in rp, -// !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { - fs.mu.Lock() - defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) - if err != nil { - return err - } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { - return err - } - name := rp.Component() - if name == "." || name == ".." { - return syserror.EEXIST - } - // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the child exists we want to return EEXIST immediately instead - // of attempting symlink/mount traversal. - if parent.vfsd.Child(name) != nil { - return syserror.EEXIST - } - if !dir && rp.MustBeDir() { - return syserror.ENOENT - } - // In memfs, the only way to cause a dentry to be disowned is by removing - // it from the filesystem, so this check is equivalent to checking if - // parent has been removed. - if parent.vfsd.IsDisowned() { - return syserror.ENOENT - } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - return create(parent, name) -} - -// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. -func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) - if err != nil { - return nil, err - } - if opts.CheckSearchable { - if !d.inode.isDir() { - return nil, syserror.ENOTDIR - } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { - return nil, err - } - } - d.IncRef() - return &d.vfsd, nil -} - -// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. -func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) - if err != nil { - return nil, err - } - d.IncRef() - return &d.vfsd, nil -} - -// LinkAt implements vfs.FilesystemImpl.LinkAt. -func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { - if rp.Mount() != vd.Mount() { - return syserror.EXDEV - } - d := vd.Dentry().Impl().(*dentry) - if d.inode.isDir() { - return syserror.EPERM - } - if d.inode.nlink == 0 { - return syserror.ENOENT - } - if d.inode.nlink == maxLinks { - return syserror.EMLINK - } - d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - }) -} - -// MkdirAt implements vfs.FilesystemImpl.MkdirAt. -func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { - if parent.inode.nlink == maxLinks { - return syserror.EMLINK - } - parent.inode.incLinksLocked() // from child's ".." - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - }) -} - -// MknodAt implements vfs.FilesystemImpl.MknodAt. -func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { - switch opts.Mode.FileType() { - case 0, linux.S_IFREG: - child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - case linux.S_IFIFO: - child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK: - // Not yet supported. - return syserror.EPERM - default: - return syserror.EINVAL - } - }) -} - -// OpenAt implements vfs.FilesystemImpl.OpenAt. -func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - if opts.Flags&linux.O_TMPFILE != 0 { - // Not yet supported. - return nil, syserror.EOPNOTSUPP - } - - // Handle O_CREAT and !O_CREAT separately, since in the latter case we - // don't need fs.mu for writing. - if opts.Flags&linux.O_CREAT == 0 { - fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) - if err != nil { - return nil, err - } - return d.open(ctx, rp, opts.Flags, false /* afterCreate */) - } - - mustCreate := opts.Flags&linux.O_EXCL != 0 - start := rp.Start().Impl().(*dentry) - fs.mu.Lock() - defer fs.mu.Unlock() - if rp.Done() { - // Reject attempts to open directories with O_CREAT. - if rp.MustBeDir() { - return nil, syserror.EISDIR - } - if mustCreate { - return nil, syserror.EEXIST - } - return start.open(ctx, rp, opts.Flags, false /* afterCreate */) - } -afterTrailingSymlink: - parent, err := walkParentDirLocked(rp, start) - if err != nil { - return nil, err - } - // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { - return nil, err - } - // Reject attempts to open directories with O_CREAT. - if rp.MustBeDir() { - return nil, syserror.EISDIR - } - name := rp.Component() - if name == "." || name == ".." { - return nil, syserror.EISDIR - } - // Determine whether or not we need to create a file. - child, err := stepLocked(rp, parent) - if err == syserror.ENOENT { - // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { - return nil, err - } - if err := rp.Mount().CheckBeginWrite(); err != nil { - return nil, err - } - defer rp.Mount().EndWrite() - // Create and open the child. - child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return child.open(ctx, rp, opts.Flags, true) - } - if err != nil { - return nil, err - } - // Do we need to resolve a trailing symlink? - if !rp.Done() { - start = parent - goto afterTrailingSymlink - } - // Open existing file. - if mustCreate { - return nil, syserror.EEXIST - } - return child.open(ctx, rp, opts.Flags, false) -} - -func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) - if !afterCreate { - if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { - return nil, err - } - } - mnt := rp.Mount() - switch impl := d.inode.impl.(type) { - case *regularFile: - var fd regularFileFD - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - if fd.writable { - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - // mnt.EndWrite() is called by regularFileFD.Release(). - } - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) - if flags&linux.O_TRUNC != 0 { - impl.mu.Lock() - impl.data = impl.data[:0] - atomic.StoreInt64(&impl.dataLen, 0) - impl.mu.Unlock() - } - return &fd.vfsfd, nil - case *directory: - // Can't open directories writably. - if ats&vfs.MayWrite != 0 { - return nil, syserror.EISDIR - } - var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil - case *symlink: - // Can't open symlinks without O_PATH (which is unimplemented). - return nil, syserror.ELOOP - case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags) - default: - panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) - } -} - -// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. -func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) - if err != nil { - return "", err - } - symlink, ok := d.inode.impl.(*symlink) - if !ok { - return "", syserror.EINVAL - } - return symlink.target, nil -} - -// RenameAt implements vfs.FilesystemImpl.RenameAt. -func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { - if opts.Flags != 0 { - // TODO(b/145974740): Support renameat2 flags. - return syserror.EINVAL - } - - // Resolve newParent first to verify that it's on this Mount. - fs.mu.Lock() - defer fs.mu.Unlock() - newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) - if err != nil { - return err - } - newName := rp.Component() - if newName == "." || newName == ".." { - return syserror.EBUSY - } - mnt := rp.Mount() - if mnt != oldParentVD.Mount() { - return syserror.EXDEV - } - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - - oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { - return err - } - // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the existing child is a symlink or mount point then we want - // to rename over it rather than follow it. - renamedVFSD := oldParent.vfsd.Child(oldName) - if renamedVFSD == nil { - return syserror.ENOENT - } - renamed := renamedVFSD.Impl().(*dentry) - if renamed.inode.isDir() { - if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { - return syserror.EINVAL - } - if oldParent != newParent { - // Writability is needed to change renamed's "..". - if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { - return err - } - } - } else { - if opts.MustBeDir || rp.MustBeDir() { - return syserror.ENOTDIR - } - } - - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { - return err - } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - if replacedVFSD != nil { - replaced = replacedVFSD.Impl().(*dentry) - if replaced.inode.isDir() { - if !renamed.inode.isDir() { - return syserror.EISDIR - } - if replaced.vfsd.HasChildren() { - return syserror.ENOTEMPTY - } - } else { - if rp.MustBeDir() { - return syserror.ENOTDIR - } - if renamed.inode.isDir() { - return syserror.ENOTDIR - } - } - } else { - if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { - return syserror.EMLINK - } - } - if newParent.vfsd.IsDisowned() { - return syserror.ENOENT - } - - // Linux places this check before some of those above; we do it here for - // simplicity, under the assumption that applications are not intentionally - // doing noop renames expecting them to succeed where non-noop renames - // would fail. - if renamedVFSD == replacedVFSD { - return nil - } - vfsObj := rp.VirtualFilesystem() - oldParentDir := oldParent.inode.impl.(*directory) - newParentDir := newParent.inode.impl.(*directory) - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { - return err - } - if replaced != nil { - newParentDir.childList.Remove(replaced) - if replaced.inode.isDir() { - newParent.inode.decLinksLocked() // from replaced's ".." - } - replaced.inode.decLinksLocked() - } - oldParentDir.childList.Remove(renamed) - newParentDir.childList.PushBack(renamed) - if renamed.inode.isDir() { - oldParent.inode.decLinksLocked() - newParent.inode.incLinksLocked() - } - // TODO: update timestamps and parent directory sizes - vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) - return nil -} - -// RmdirAt implements vfs.FilesystemImpl.RmdirAt. -func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - fs.mu.Lock() - defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) - if err != nil { - return err - } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { - return err - } - name := rp.Component() - if name == "." { - return syserror.EINVAL - } - if name == ".." { - return syserror.ENOTEMPTY - } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { - return syserror.ENOENT - } - child := childVFSD.Impl().(*dentry) - if !child.inode.isDir() { - return syserror.ENOTDIR - } - if childVFSD.HasChildren() { - return syserror.ENOTEMPTY - } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { - return err - } - parent.inode.impl.(*directory).childList.Remove(child) - parent.inode.decLinksLocked() // from child's ".." - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - return nil -} - -// SetStatAt implements vfs.FilesystemImpl.SetStatAt. -func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return err - } - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM -} - -// StatAt implements vfs.FilesystemImpl.StatAt. -func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - d, err := resolveLocked(rp) - if err != nil { - return linux.Statx{}, err - } - var stat linux.Statx - d.inode.statTo(&stat) - return stat, nil -} - -// StatFSAt implements vfs.FilesystemImpl.StatFSAt. -func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return linux.Statfs{}, err - } - // TODO: actually implement statfs - return linux.Statfs{}, syserror.ENOSYS -} - -// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. -func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { - child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - }) -} - -// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. -func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - fs.mu.Lock() - defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) - if err != nil { - return err - } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { - return err - } - name := rp.Component() - if name == "." || name == ".." { - return syserror.EISDIR - } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { - return syserror.ENOENT - } - child := childVFSD.Impl().(*dentry) - if child.inode.isDir() { - return syserror.EISDIR - } - if !rp.MustBeDir() { - return syserror.ENOTDIR - } - mnt := rp.Mount() - if err := mnt.CheckBeginWrite(); err != nil { - return err - } - defer mnt.EndWrite() - vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { - return err - } - parent.inode.impl.(*directory).childList.Remove(child) - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - return nil -} - -// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return nil, err - } - // TODO(b/127675828): support extended attributes - return nil, syserror.ENOTSUP -} - -// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return "", err - } - // TODO(b/127675828): support extended attributes - return "", syserror.ENOTSUP -} - -// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. -func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return err - } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP -} - -// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. -func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { - return err - } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP -} - -// PrependPath implements vfs.FilesystemImpl.PrependPath. -func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { - fs.mu.RLock() - defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) -} diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go deleted file mode 100644 index 8d0167c93..000000000 --- a/pkg/sentry/fsimpl/memfs/memfs.go +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package memfs provides a filesystem implementation that behaves like tmpfs: -// the Dentry tree is the sole source of truth for the state of the filesystem. -// -// memfs is intended primarily to demonstrate filesystem implementation -// patterns. Real uses cases for an in-memory filesystem should use tmpfs -// instead. -// -// Lock order: -// -// filesystem.mu -// regularFileFD.offMu -// regularFile.mu -// inode.mu -package memfs - -import ( - "fmt" - "math" - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -// FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} - -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - vfsfs vfs.Filesystem - - // mu serializes changes to the Dentry tree. - mu sync.RWMutex - - nextInoMinusOne uint64 // accessed using atomic memory operations -} - -// GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - var fs filesystem - fs.vfsfs.Init(vfsObj, &fs) - root := fs.newDentry(fs.newDirectory(creds, 01777)) - return &fs.vfsfs, &root.vfsd, nil -} - -// Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { -} - -// dentry implements vfs.DentryImpl. -type dentry struct { - vfsd vfs.Dentry - - // inode is the inode represented by this dentry. Multiple Dentries may - // share a single non-directory inode (with hard links). inode is - // immutable. - inode *inode - - // memfs doesn't count references on dentries; because the dentry tree is - // the sole source of truth, it is by definition always consistent with the - // state of the filesystem. However, it does count references on inodes, - // because inode resources are released when all references are dropped. - // (memfs doesn't really have resources to release, but we implement - // reference counting because tmpfs regular files will.) - - // dentryEntry (ugh) links dentries into their parent directory.childList. - dentryEntry -} - -func (fs *filesystem) newDentry(inode *inode) *dentry { - d := &dentry{ - inode: inode, - } - d.vfsd.Init(d) - return d -} - -// IncRef implements vfs.DentryImpl.IncRef. -func (d *dentry) IncRef() { - d.inode.incRef() -} - -// TryIncRef implements vfs.DentryImpl.TryIncRef. -func (d *dentry) TryIncRef() bool { - return d.inode.tryIncRef() -} - -// DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - d.inode.decRef() -} - -// inode represents a filesystem object. -type inode struct { - // refs is a reference count. refs is accessed using atomic memory - // operations. - // - // A reference is held on all inodes that are reachable in the filesystem - // tree. For non-directories (which may have multiple hard links), this - // means that a reference is dropped when nlink reaches 0. For directories, - // nlink never reaches 0 due to the "." entry; instead, - // filesystem.RmdirAt() drops the reference. - refs int64 - - // Inode metadata; protected by mu and accessed using atomic memory - // operations unless otherwise specified. - mu sync.RWMutex - mode uint32 // excluding file type bits, which are based on impl - nlink uint32 // protected by filesystem.mu instead of inode.mu - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - ino uint64 // immutable - - impl interface{} // immutable -} - -const maxLinks = math.MaxUint32 - -func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { - i.refs = 1 - i.mode = uint32(mode) - i.uid = uint32(creds.EffectiveKUID) - i.gid = uint32(creds.EffectiveKGID) - i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) - // i.nlink initialized by caller - i.impl = impl -} - -// incLinksLocked increments i's link count. -// -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -// i.nlink < maxLinks. -func (i *inode) incLinksLocked() { - if i.nlink == 0 { - panic("memfs.inode.incLinksLocked() called with no existing links") - } - if i.nlink == maxLinks { - panic("memfs.inode.incLinksLocked() called with maximum link count") - } - atomic.AddUint32(&i.nlink, 1) -} - -// decLinksLocked decrements i's link count. -// -// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -func (i *inode) decLinksLocked() { - if i.nlink == 0 { - panic("memfs.inode.decLinksLocked() called with no existing links") - } - atomic.AddUint32(&i.nlink, ^uint32(0)) -} - -func (i *inode) incRef() { - if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("memfs.inode.incRef() called without holding a reference") - } -} - -func (i *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&i.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { - return true - } - } -} - -func (i *inode) decRef() { - if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - // This is unnecessary; it's mostly to simulate what tmpfs would do. - if regfile, ok := i.impl.(*regularFile); ok { - regfile.mu.Lock() - regfile.data = nil - atomic.StoreInt64(®file.dataLen, 0) - regfile.mu.Unlock() - } - } else if refs < 0 { - panic("memfs.inode.decRef() called without holding a reference") - } -} - -func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) -} - -// Go won't inline this function, and returning linux.Statx (which is quite -// big) means spending a lot of time in runtime.duffcopy(), so instead it's an -// output parameter. -func (i *inode) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO - stat.Blksize = 1 // usermem.PageSize in tmpfs - stat.Nlink = atomic.LoadUint32(&i.nlink) - stat.UID = atomic.LoadUint32(&i.uid) - stat.GID = atomic.LoadUint32(&i.gid) - stat.Mode = uint16(atomic.LoadUint32(&i.mode)) - stat.Ino = i.ino - // TODO: device number - switch impl := i.impl.(type) { - case *regularFile: - stat.Mode |= linux.S_IFREG - stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS - stat.Size = uint64(atomic.LoadInt64(&impl.dataLen)) - // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in - // a uint64 accessed using atomic memory operations to avoid taking - // locks). - stat.Blocks = allocatedBlocksForSize(stat.Size) - case *directory: - stat.Mode |= linux.S_IFDIR - case *symlink: - stat.Mode |= linux.S_IFLNK - stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS - stat.Size = uint64(len(impl.target)) - stat.Blocks = allocatedBlocksForSize(stat.Size) - case *namedPipe: - stat.Mode |= linux.S_IFIFO - default: - panic(fmt.Sprintf("unknown inode type: %T", i.impl)) - } -} - -// allocatedBlocksForSize returns the number of 512B blocks needed to -// accommodate the given size in bytes, as appropriate for struct -// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block -// size is independent of the "preferred block size for I/O", struct -// stat::st_blksize and struct statx::stx_blksize.) -func allocatedBlocksForSize(size uint64) uint64 { - return (size + 511) / 512 -} - -func (i *inode) direntType() uint8 { - switch i.impl.(type) { - case *regularFile: - return linux.DT_REG - case *directory: - return linux.DT_DIR - case *symlink: - return linux.DT_LNK - default: - panic(fmt.Sprintf("unknown inode type: %T", i.impl)) - } -} - -// fileDescription is embedded by memfs implementations of -// vfs.FileDescriptionImpl. -type fileDescription struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl -} - -func (fd *fileDescription) filesystem() *filesystem { - return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) -} - -func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode -} - -// Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { - var stat linux.Statx - fd.inode().statTo(&stat) - return stat, nil -} - -// SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM -} diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go deleted file mode 100644 index b5a204438..000000000 --- a/pkg/sentry/fsimpl/memfs/named_pipe.go +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -type namedPipe struct { - inode inode - - pipe *pipe.VFSPipe -} - -// Preconditions: -// * fs.mu must be locked. -// * rp.Mount().CheckBeginWrite() has been called successfully. -func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { - file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} - file.inode.init(file, fs, creds, mode) - file.inode.nlink = 1 // Only the parent has a link. - return &file.inode -} - -// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented -// entirely via struct embedding. -type namedPipeFD struct { - fileDescription - - *pipe.VFSPipeFD -} - -func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - var err error - var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) - if err != nil { - return nil, err - } - mnt := rp.Mount() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil -} diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go deleted file mode 100644 index 807c1af7a..000000000 --- a/pkg/sentry/fsimpl/memfs/pipe_test.go +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "bytes" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -const fileName = "mypipe" - -func TestSeparateFDs(t *testing.T) { - ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() - - // Open the read side. This is done in a concurrently because opening - // One end the pipe blocks until the other end is opened. - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - FollowFinalSymlink: true, - } - rfdchan := make(chan *vfs.FileDescription) - go func() { - openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY} - rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - rfdchan <- rfd - }() - - // Open the write side. - openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY} - wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != nil { - t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) - } - defer wfd.DecRef() - - rfd, ok := <-rfdchan - if !ok { - t.Fatalf("failed to open pipe for reading %q", fileName) - } - defer rfd.DecRef() - - const msg = "vamos azul" - checkEmpty(ctx, t, rfd) - checkWrite(ctx, t, wfd, msg) - checkRead(ctx, t, rfd, msg) -} - -func TestNonblockingRead(t *testing.T) { - ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() - - // Open the read side as nonblocking. - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - FollowFinalSymlink: true, - } - openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} - rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != nil { - t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) - } - defer rfd.DecRef() - - // Open the write side. - openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} - wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != nil { - t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) - } - defer wfd.DecRef() - - const msg = "geh blau" - checkEmpty(ctx, t, rfd) - checkWrite(ctx, t, wfd, msg) - checkRead(ctx, t, rfd, msg) -} - -func TestNonblockingWriteError(t *testing.T) { - ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() - - // Open the write side as nonblocking, which should return ENXIO. - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - FollowFinalSymlink: true, - } - openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} - _, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != syserror.ENXIO { - t.Fatalf("expected ENXIO, but got error: %v", err) - } -} - -func TestSingleFD(t *testing.T) { - ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() - - // Open the pipe as readable and writable. - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - FollowFinalSymlink: true, - } - openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} - fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) - if err != nil { - t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) - } - defer fd.DecRef() - - const msg = "forza blu" - checkEmpty(ctx, t, fd) - checkWrite(ctx, t, fd, msg) - checkRead(ctx, t, fd, msg) -} - -// setup creates a VFS with a pipe in the root directory at path fileName. The -// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal -// upon failure. -func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) { - ctx := contexttest.Context(t) - creds := auth.CredentialsFromContext(ctx) - - // Create VFS. - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{}) - if err != nil { - t.Fatalf("failed to create tmpfs root mount: %v", err) - } - - // Create the pipe. - root := mntns.Root() - pop := vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - } - mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} - if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { - t.Fatalf("failed to create file %q: %v", fileName, err) - } - - // Sanity check: the file pipe exists and has the correct mode. - stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(fileName), - FollowFinalSymlink: true, - }, &vfs.StatOptions{}) - if err != nil { - t.Fatalf("stat(%q) failed: %v", fileName, err) - } - if stat.Mode&^linux.S_IFMT != 0644 { - t.Errorf("got wrong permissions (%0o)", stat.Mode) - } - if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe { - t.Errorf("got wrong file type (%0o)", stat.Mode) - } - - return ctx, creds, vfsObj, root -} - -// checkEmpty calls t.Fatal if the pipe in fd is not empty. -func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { - readData := make([]byte, 1) - dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) - if err != syserror.ErrWouldBlock { - t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) - } - if bytesRead != 0 { - t.Fatalf("expected to read 0 bytes, but got %d", bytesRead) - } -} - -// checkWrite calls t.Fatal if it fails to write all of msg to fd. -func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { - writeData := []byte(msg) - src := usermem.BytesIOSequence(writeData) - bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) - if err != nil { - t.Fatalf("error writing to pipe %q: %v", fileName, err) - } - if bytesWritten != int64(len(writeData)) { - t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten) - } -} - -// checkRead calls t.Fatal if it fails to read msg from fd. -func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { - readData := make([]byte, len(msg)) - dst := usermem.BytesIOSequence(readData) - bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) - if err != nil { - t.Fatalf("error reading from pipe %q: %v", fileName, err) - } - if bytesRead != int64(len(msg)) { - t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead) - } - if !bytes.Equal(readData, []byte(msg)) { - t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData)) - } -} diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go deleted file mode 100644 index b7f4853b3..000000000 --- a/pkg/sentry/fsimpl/memfs/regular_file.go +++ /dev/null @@ -1,154 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "io" - "sync" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" -) - -type regularFile struct { - inode inode - - mu sync.RWMutex - data []byte - // dataLen is len(data), but accessed using atomic memory operations to - // avoid locking in inode.stat(). - dataLen int64 -} - -func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { - file := ®ularFile{} - file.inode.init(file, fs, creds, mode) - file.inode.nlink = 1 // from parent directory - return &file.inode -} - -type regularFileFD struct { - fileDescription - - // These are immutable. - readable bool - writable bool - - // off is the file offset. off is accessed using atomic memory operations. - // offMu serializes operations that may mutate off. - off int64 - offMu sync.Mutex -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } -} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } - f := fd.inode().impl.(*regularFile) - f.mu.RLock() - if offset >= int64(len(f.data)) { - f.mu.RUnlock() - return 0, io.EOF - } - n, err := dst.CopyOut(ctx, f.data[offset:]) - f.mu.RUnlock() - return int64(n), err -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - fd.offMu.Lock() - n, err := fd.PRead(ctx, dst, fd.off, opts) - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - srclen := src.NumBytes() - if srclen == 0 { - return 0, nil - } - f := fd.inode().impl.(*regularFile) - f.mu.Lock() - end := offset + srclen - if end < offset { - // Overflow. - f.mu.Unlock() - return 0, syserror.EFBIG - } - if end > f.dataLen { - f.data = append(f.data, make([]byte, end-f.dataLen)...) - atomic.StoreInt64(&f.dataLen, end) - } - n, err := src.CopyIn(ctx, f.data[offset:end]) - f.mu.Unlock() - return int64(n), err -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - fd.offMu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n - fd.offMu.Unlock() - return n, err -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fd.offMu.Lock() - defer fd.offMu.Unlock() - switch whence { - case linux.SEEK_SET: - // use offset as specified - case linux.SEEK_CUR: - offset += fd.off - case linux.SEEK_END: - offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen) - default: - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL - } - fd.off = offset - return offset, nil -} - -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *regularFileFD) Sync(ctx context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go deleted file mode 100644 index b2ac2cbeb..000000000 --- a/pkg/sentry/fsimpl/memfs/symlink.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package memfs - -import ( - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" -) - -type symlink struct { - inode inode - target string // immutable -} - -func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode { - link := &symlink{ - target: target, - } - link.inode.init(link, fs, creds, 0777) - link.inode.nlink = 1 // from parent directory - return &link.inode -} - -// O_PATH is unimplemented, so there's no way to get a FileDescription -// representing a symlink yet. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD new file mode 100644 index 000000000..a5b285987 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -0,0 +1,92 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "tmpfs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_library( + name = "tmpfs", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "named_pipe.go", + "regular_file.go", + "symlink.go", + "tmpfs.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs", + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fspath", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "benchmark_test", + size = "small", + srcs = ["benchmark_test.go"], + deps = [ + ":tmpfs", + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "tmpfs_test", + size = "small", + srcs = [ + "pipe_test.go", + "regular_file_test.go", + ], + embed = [":tmpfs"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/contexttest", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go new file mode 100644 index 000000000..d88c83499 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -0,0 +1,487 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package benchmark_test + +import ( + "fmt" + "runtime" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Differences from stat_benchmark: +// +// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are +// not included. +// +// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp. +// Non-MountStat benchmarks use a tmpfs root mount and no submounts. +// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a +// subdirectory /tmp/ (assuming TEST_TMPDIR == "/tmp"). Thus +// stat_benchmark at depth 1 does a comparable amount of work to *MountStat +// benchmarks at depth 2, and non-MountStat benchmarks at depth 3. +var depths = []int{1, 2, 3, 8, 64, 100} + +const ( + mountPointName = "tmp" + filename = "gvisor_test_temp_0_1557494568" +) + +// This is copied from syscalls/linux/sys_file.go, with the dependency on +// kernel.Task stripped out. +func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + rel *fs.Dirent // The relative directory for search (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + rel = wd + } else { + // Need to extract the given FD. + return syserror.EBADF + } + + // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + if resolve { + d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals) + } else { + d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals) + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +func BenchmarkVFS1TmpfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + d := root + d.IncRef() + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS2MemfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef(vfsObj) + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + vd := root + vd.IncRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(name), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + vd.DecRef() + vd = vfs.VirtualDentry{} + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + defer fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS1TmpfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create and mount the submount. + root := mntns.Root() + defer root.DecRef() + if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + mountPoint, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs submount: %v", err) + } + if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + d, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func BenchmarkVFS2MemfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef(vfsObj) + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create the mount point. + root := mntns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mountPointName), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + // Save the mount point for later use. + mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + // Create and mount the submount. + if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(name), + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Verify that we didn't create any directories under the mount + // point (i.e. they were all created on the submount). + firstDirName := fmt.Sprintf("%d", depth) + if child := mountPoint.Dentry().Child(firstDirName); child != nil { + b.Fatalf("created directory %q under root mount, not submount", firstDirName) + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + vd.DecRef() + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filePath), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + // Don't include deferred cleanup in benchmark time. + b.StopTimer() + }) + } +} + +func init() { + // Turn off reference leak checking for a fair comparison between vfs1 and + // vfs2. + refs.SetLeakMode(refs.NoLeakChecking) +} diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go new file mode 100644 index 000000000..887ca2619 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -0,0 +1,187 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type directory struct { + inode inode + + // childList is a list containing (1) child Dentries and (2) fake Dentries + // (with inode == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is protected by filesystem.mu. + childList dentryList +} + +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { + dir := &directory{} + dir.inode.init(dir, fs, creds, mode) + dir.inode.nlink = 2 // from "." and parent directory or ".." for root + return &dir.inode +} + +func (i *inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by filesystem.mu. + iter *dentry + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter != nil { + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + fs.mu.Lock() + dir.childList.Remove(fd.iter) + fs.mu.Unlock() + fd.iter = nil + } +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fs := fd.filesystem() + vfsd := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + if fd.off == 0 { + if !cb.Handle(vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: vfsd.Impl().(*dentry).inode.ino, + NextOff: 1, + }) { + return nil + } + fd.off++ + } + if fd.off == 1 { + parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode + if !cb.Handle(vfs.Dirent{ + Name: "..", + Type: parentInode.direntType(), + Ino: parentInode.ino, + NextOff: 2, + }) { + return nil + } + fd.off++ + } + + dir := vfsd.Impl().(*dentry).inode.impl.(*directory) + var child *dentry + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &dentry{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if !cb.Handle(vfs.Dirent{ + Name: child.vfsd.Name(), + Type: child.inode.direntType(), + Ino: child.inode.ino, + NextOff: fd.off + 1, + }) { + dir.childList.InsertBefore(child, fd.iter) + return nil + } + fd.off++ + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fs := fd.filesystem() + fs.mu.Lock() + defer fs.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + + // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't + // seek even if doing so might reposition the iterator due to concurrent + // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). + if fd.off == offset { + return offset, nil + } + + fd.off = offset + // Compensate for "." and "..". + remChildren := int64(0) + if offset >= 2 { + remChildren = offset - 2 + } + + dir := fd.inode().impl.(*directory) + + // Ensure that fd.iter exists and is not linked into dir.childList. + if fd.iter == nil { + fd.iter = &dentry{} + } else { + dir.childList.Remove(fd.iter) + } + // Insert fd.iter before the remChildren'th child, or at the end of the + // list if remChildren >= number of children. + child := dir.childList.Front() + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if remChildren == 0 { + dir.childList.InsertBefore(child, fd.iter) + return offset, nil + } + remChildren-- + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return offset, nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go new file mode 100644 index 000000000..26979729e --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -0,0 +1,698 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } +afterSymlink: + nextVFSD, err := rp.ResolveComponent(&d.vfsd) + if err != nil { + return nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is the sole source of truth for tmpfs, if it's + // not in the Dentry tree, it doesn't exist. + return nil, syserror.ENOENT + } + next := nextVFSD.Impl().(*dentry) + if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return next, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: filesystem.mu must be locked. !rp.Done(). +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { + for !rp.Final() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). +// +// Preconditions: filesystem.mu must be locked. +func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + next, err := stepLocked(rp, d) + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// doCreateAt is loosely analogous to a conjunction of Linux's +// fs/namei.c:filename_create() and done_path_create(). +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the child exists we want to return EEXIST immediately instead + // of attempting symlink/mount traversal. + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + // In memfs, the only way to cause a dentry to be disowned is by removing + // it from the filesystem, so this check is equivalent to checking if + // parent has been removed. + if parent.vfsd.IsDisowned() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + return create(parent, name) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + d := vd.Dentry().Impl().(*dentry) + if d.inode.isDir() { + return syserror.EPERM + } + if d.inode.nlink == 0 { + return syserror.ENOENT + } + if d.inode.nlink == maxLinks { + return syserror.EMLINK + } + d.inode.incLinksLocked() + child := fs.newDentry(d.inode) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { + if parent.inode.nlink == maxLinks { + return syserror.EMLINK + } + parent.inode.incLinksLocked() // from child's ".." + child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + switch opts.Mode.FileType() { + case 0, linux.S_IFREG: + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFIFO: + child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK: + // Not yet supported. + return syserror.EPERM + default: + return syserror.EINVAL + } + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_TMPFILE != 0 { + // Not yet supported. + return nil, syserror.EOPNOTSUPP + } + + // Handle O_CREAT and !O_CREAT separately, since in the latter case we + // don't need fs.mu for writing. + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + return d.open(ctx, rp, opts.Flags, false /* afterCreate */) + } + + mustCreate := opts.Flags&linux.O_EXCL != 0 + start := rp.Start().Impl().(*dentry) + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + return start.open(ctx, rp, opts.Flags, false /* afterCreate */) + } +afterTrailingSymlink: + parent, err := walkParentDirLocked(rp, start) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + name := rp.Component() + if name == "." || name == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + child, err := stepLocked(rp, parent) + if err == syserror.ENOENT { + // Already checked for searchability above; now check for writability. + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return child.open(ctx, rp, opts.Flags, true) + } + if err != nil { + return nil, err + } + // Do we need to resolve a trailing symlink? + if !rp.Done() { + start = parent + goto afterTrailingSymlink + } + // Open existing file. + if mustCreate { + return nil, syserror.EEXIST + } + return child.open(ctx, rp, opts.Flags, false) +} + +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if !afterCreate { + if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { + return nil, err + } + } + mnt := rp.Mount() + switch impl := d.inode.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + if fd.writable { + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + // mnt.EndWrite() is called by regularFileFD.Release(). + } + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + if flags&linux.O_TRUNC != 0 { + impl.mu.Lock() + impl.data.Truncate(0, impl.memFile) + atomic.StoreUint64(&impl.size, 0) + impl.mu.Unlock() + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + return &fd.vfsfd, nil + case *symlink: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + case *namedPipe: + return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags) + default: + panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) + } +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return "", err + } + symlink, ok := d.inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // TODO(b/145974740): Support renameat2 flags. + return syserror.EINVAL + } + + // Resolve newParent first to verify that it's on this Mount. + fs.mu.Lock() + defer fs.mu.Unlock() + newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), + // because if the existing child is a symlink or mount point then we want + // to rename over it rather than follow it. + renamedVFSD := oldParent.vfsd.Child(oldName) + if renamedVFSD == nil { + return syserror.ENOENT + } + renamed := renamedVFSD.Impl().(*dentry) + if renamed.inode.isDir() { + if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + // Writability is needed to change renamed's "..". + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + if replacedVFSD != nil { + replaced = replacedVFSD.Impl().(*dentry) + if replaced.inode.isDir() { + if !renamed.inode.isDir() { + return syserror.EISDIR + } + if replaced.vfsd.HasChildren() { + return syserror.ENOTEMPTY + } + } else { + if rp.MustBeDir() { + return syserror.ENOTDIR + } + if renamed.inode.isDir() { + return syserror.ENOTDIR + } + } + } else { + if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + return syserror.EMLINK + } + } + if newParent.vfsd.IsDisowned() { + return syserror.ENOENT + } + + // Linux places this check before some of those above; we do it here for + // simplicity, under the assumption that applications are not intentionally + // doing noop renames expecting them to succeed where non-noop renames + // would fail. + if renamedVFSD == replacedVFSD { + return nil + } + vfsObj := rp.VirtualFilesystem() + oldParentDir := oldParent.inode.impl.(*directory) + newParentDir := newParent.inode.impl.(*directory) + if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + return err + } + if replaced != nil { + newParentDir.childList.Remove(replaced) + if replaced.inode.isDir() { + newParent.inode.decLinksLocked() // from replaced's ".." + } + replaced.inode.decLinksLocked() + } + oldParentDir.childList.Remove(renamed) + newParentDir.childList.PushBack(renamed) + if renamed.inode.isDir() { + oldParent.inode.decLinksLocked() + newParent.inode.incLinksLocked() + } + // TODO: update timestamps and parent directory sizes + vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if !child.inode.isDir() { + return syserror.ENOTDIR + } + if childVFSD.HasChildren() { + return syserror.ENOTEMPTY + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + return err + } + parent.inode.impl.(*directory).childList.Remove(child) + parent.inode.decLinksLocked() // from child's ".." + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement inode.setStat + return syserror.EPERM +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + d.inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + if err != nil { + return err + } + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR + } + childVFSD := parent.vfsd.Child(name) + if childVFSD == nil { + return syserror.ENOENT + } + child := childVFSD.Impl().(*dentry) + if child.inode.isDir() { + return syserror.EISDIR + } + if !rp.MustBeDir() { + return syserror.ENOTDIR + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + vfsObj := rp.VirtualFilesystem() + if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + return err + } + parent.inode.impl.(*directory).childList.Remove(child) + child.inode.decLinksLocked() + vfsObj.CommitDeleteDentry(childVFSD) + return nil +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return nil, err + } + // TODO(b/127675828): support extended attributes + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return "", err + } + // TODO(b/127675828): support extended attributes + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + _, err := resolveLocked(rp) + if err != nil { + return err + } + // TODO(b/127675828): support extended attributes + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go new file mode 100644 index 000000000..40bde54de --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -0,0 +1,60 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type namedPipe struct { + inode inode + + pipe *pipe.VFSPipe +} + +// Preconditions: +// * fs.mu must be locked. +// * rp.Mount().CheckBeginWrite() has been called successfully. +func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { + file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // Only the parent has a link. + return &file.inode +} + +// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented +// entirely via struct embedding. +type namedPipeFD struct { + fileDescription + + *pipe.VFSPipeFD +} + +func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + var err error + var fd namedPipeFD + fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) + if err != nil { + return nil, err + } + mnt := rp.Mount() + fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go new file mode 100644 index 000000000..70b42a6ec --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -0,0 +1,235 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const fileName = "mypipe" + +func TestSeparateFDs(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side. This is done in a concurrently because opening + // One end the pipe blocks until the other end is opened. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + rfdchan := make(chan *vfs.FileDescription) + go func() { + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY} + rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + rfdchan <- rfd + }() + + // Open the write side. + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + rfd, ok := <-rfdchan + if !ok { + t.Fatalf("failed to open pipe for reading %q", fileName) + } + defer rfd.DecRef() + + const msg = "vamos azul" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingRead(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the read side as nonblocking. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK} + rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) + } + defer rfd.DecRef() + + // Open the write side. + openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} + wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer wfd.DecRef() + + const msg = "geh blau" + checkEmpty(ctx, t, rfd) + checkWrite(ctx, t, wfd, msg) + checkRead(ctx, t, rfd, msg) +} + +func TestNonblockingWriteError(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the write side as nonblocking, which should return ENXIO. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK} + _, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != syserror.ENXIO { + t.Fatalf("expected ENXIO, but got error: %v", err) + } +} + +func TestSingleFD(t *testing.T) { + ctx, creds, vfsObj, root := setup(t) + defer root.DecRef() + + // Open the pipe as readable and writable. + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + } + openOpts := vfs.OpenOptions{Flags: linux.O_RDWR} + fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) + if err != nil { + t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) + } + defer fd.DecRef() + + const msg = "forza blu" + checkEmpty(ctx, t, fd) + checkWrite(ctx, t, fd, msg) + checkRead(ctx, t, fd, msg) +} + +// setup creates a VFS with a pipe in the root directory at path fileName. The +// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal +// upon failure. +func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("failed to create tmpfs root mount: %v", err) + } + + // Create the pipe. + root := mntns.Root() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + } + mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644} + if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil { + t.Fatalf("failed to create file %q: %v", fileName, err) + } + + // Sanity check: the file pipe exists and has the correct mode. + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(fileName), + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + t.Fatalf("stat(%q) failed: %v", fileName, err) + } + if stat.Mode&^linux.S_IFMT != 0644 { + t.Errorf("got wrong permissions (%0o)", stat.Mode) + } + if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe { + t.Errorf("got wrong file type (%0o)", stat.Mode) + } + + return ctx, creds, vfsObj, root +} + +// checkEmpty calls t.Fatal if the pipe in fd is not empty. +func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) { + readData := make([]byte, 1) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != syserror.ErrWouldBlock { + t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err) + } + if bytesRead != 0 { + t.Fatalf("expected to read 0 bytes, but got %d", bytesRead) + } +} + +// checkWrite calls t.Fatal if it fails to write all of msg to fd. +func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + writeData := []byte(msg) + src := usermem.BytesIOSequence(writeData) + bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("error writing to pipe %q: %v", fileName, err) + } + if bytesWritten != int64(len(writeData)) { + t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten) + } +} + +// checkRead calls t.Fatal if it fails to read msg from fd. +func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) { + readData := make([]byte, len(msg)) + dst := usermem.BytesIOSequence(readData) + bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("error reading from pipe %q: %v", fileName, err) + } + if bytesRead != int64(len(msg)) { + t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead) + } + if !bytes.Equal(readData, []byte(msg)) { + t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData)) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go new file mode 100644 index 000000000..f51e247a7 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -0,0 +1,357 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type regularFile struct { + inode inode + + // memFile is a platform.File used to allocate pages to this regularFile. + memFile *pgalloc.MemoryFile + + // mu protects the fields below. + mu sync.RWMutex + + // data maps offsets into the file to offsets into memFile that store + // the file's data. + data fsutil.FileRangeSet + + // size is the size of data, but accessed using atomic memory + // operations to avoid locking in inode.stat(). + size uint64 + + // seals represents file seals on this inode. + seals uint32 +} + +func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { + file := ®ularFile{ + memFile: fs.memFile, + } + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} + +type regularFileFD struct { + fileDescription + + // These are immutable. + readable bool + writable bool + + // off is the file offset. off is accessed using atomic memory operations. + // offMu serializes operations that may mutate off. + off int64 + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + if dst.NumBytes() == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + rw := getRegularFileReadWriter(f, offset) + n, err := dst.CopyOutFrom(ctx, rw) + putRegularFileReadWriter(rw) + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + srclen := src.NumBytes() + if srclen == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + end := offset + srclen + if end < offset { + // Overflow. + return 0, syserror.EFBIG + } + rw := getRegularFileReadWriter(f, offset) + n, err := src.CopyInTo(ctx, rw) + putRegularFileReadWriter(rw) + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return nil +} + +// regularFileReadWriter implements safemem.Reader and Safemem.Writer. +type regularFileReadWriter struct { + file *regularFile + + // Offset into the file to read/write at. Note that this may be + // different from the FD offset if PRead/PWrite is used. + off uint64 +} + +var regularFileReadWriterPool = sync.Pool{ + New: func() interface{} { + return ®ularFileReadWriter{} + }, +} + +func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter { + rw := regularFileReadWriterPool.Get().(*regularFileReadWriter) + rw.file = file + rw.off = uint64(offset) + return rw +} + +func putRegularFileReadWriter(rw *regularFileReadWriter) { + rw.file = nil + regularFileReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.file.mu.RLock() + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= rw.file.size { + rw.file.mu.RUnlock() + return 0, io.EOF + } + end := rw.file.size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Tmpfs holes are zero-filled. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := safemem.ZeroSeq(dst) + done += n + rw.off += uint64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + rw.file.mu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + rw.file.mu.RUnlock() + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.file.mu.Lock() + + // Compute the range to write (overflow-checked). + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + // Check if seals prevent either file growth or all writes. + switch { + case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed + rw.file.mu.Unlock() + return 0, syserror.EPERM + case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed + // When growth is sealed, Linux effectively allows writes which would + // normally grow the file to partially succeed up to the current EOF, + // rounded down to the page boundary before the EOF. + // + // This happens because writes (and thus the growth check) for tmpfs + // files proceed page-by-page on Linux, and the final write to the page + // containing EOF fails, resulting in a partial write up to the start of + // that page. + // + // To emulate this behaviour, artifically truncate the write to the + // start of the page containing the current EOF. + // + // See Linux, mm/filemap.c:generic_perform_write() and + // mm/shmem.c:shmem_write_begin(). + if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart { + end = pgstart + } + if end <= rw.off { + // Truncation would result in no data being written. + rw.file.mu.Unlock() + return 0, syserror.EPERM + } + } + + // Page-aligned mr for when we need to allocate memory. RoundUp can't + // overflow since end is an int64. + pgstartaddr := usermem.Addr(rw.off).RoundDown() + pgendaddr, _ := usermem.Addr(end).RoundUp() + pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} + + var ( + done uint64 + retErr error + ) + seg, gap := rw.file.data.Find(uint64(rw.off)) + for rw.off < end { + mr := memmap.MappableRange{uint64(rw.off), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += uint64(n) + srcs = srcs.DropFirst64(n) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Allocate memory for the write. + gapMR := gap.Range().Intersect(pgMR) + fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs) + if err != nil { + retErr = err + goto exitLoop + } + + // Write to that memory as usual. + seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.off > rw.file.size { + atomic.StoreUint64(&rw.file.size, rw.off) + } + + rw.file.mu.Unlock() + return done, retErr +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go new file mode 100644 index 000000000..3731c5b6f --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -0,0 +1,224 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "bytes" + "fmt" + "io" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + + // Create the file that will be write/read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + root.DecRef() + mntns.DecRef(vfsObj) + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, func() { + root.DecRef() + mntns.DecRef(vfsObj) + }, nil +} + +// Test that we can write some data to a file and read it back.` +func TestSimpleWriteRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "simpleReadWrite") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write. + data := []byte("foobarbaz") + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } + + // Seek back to beginning. + if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil { + t.Fatalf("fd.Seek failed: %v", err) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want { + t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want) + } + + // Read. + buf := make([]byte, len(data)) + n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) + if err != nil && err != io.EOF { + t.Fatalf("fd.Read failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Read got short read length %d, want %d", n, len(data)) + } + if got, want := string(buf), string(data); got != want { + t.Errorf("Read got %q want %s", got, want) + } + if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want { + t.Errorf("fd.Write left offset at %d, want %d", got, want) + } +} + +func TestPWrite(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "PRead") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill file with 1k 'a's. + data := bytes.Repeat([]byte{'a'}, 1000) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Write "gVisor is awesome" at various offsets. + buf := []byte("gVisor is awesome") + offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + for _, offset := range offsets { + name := fmt.Sprintf("PWrite offset=%d", offset) + t.Run(name, func(t *testing.T) { + n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{}) + if err != nil { + t.Errorf("fd.PWrite got err %v want nil", err) + } + if n != int64(len(buf)) { + t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf)) + } + + // Update data to reflect expected file contents. + if len(data) < offset+len(buf) { + data = append(data, make([]byte, (offset+len(buf))-len(data))...) + } + copy(data[offset:], buf) + + // Read the whole file and compare with data. + readBuf := make([]byte, len(data)) + n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{}) + if err != nil { + t.Fatalf("fd.PRead failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.PRead got short read length %d, want %d", n, len(data)) + } + if got, want := string(readBuf), string(data); got != want { + t.Errorf("PRead got %q want %s", got, want) + } + + }) + } +} + +func TestPRead(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, "PRead") + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Write 100 sequences of 'gVisor is awesome'. + data := bytes.Repeat([]byte("gVisor is awsome"), 100) + n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + if n != int64(len(data)) { + t.Errorf("fd.Write got short write length %d, want %d", n, len(data)) + } + + // Read various sizes from various offsets. + sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000} + offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1} + + for _, size := range sizes { + for _, offset := range offsets { + name := fmt.Sprintf("PRead offset=%d size=%d", offset, size) + t.Run(name, func(t *testing.T) { + var ( + wantRead []byte + wantErr error + ) + if offset < len(data) { + wantRead = data[offset:] + } else if size > 0 { + wantErr = io.EOF + } + if offset+size < len(data) { + wantRead = wantRead[:size] + } + buf := make([]byte, size) + n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{}) + if err != wantErr { + t.Errorf("fd.PRead got err %v want %v", err, wantErr) + } + if n != int64(len(wantRead)) { + t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead)) + } + if got := string(buf[:n]); got != string(wantRead) { + t.Errorf("fd.PRead got %q want %q", got, string(wantRead)) + } + }) + } + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go new file mode 100644 index 000000000..5246aca84 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -0,0 +1,36 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type symlink struct { + inode inode + target string // immutable +} + +func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode { + link := &symlink{ + target: target, + } + link.inode.init(link, fs, creds, 0777) + link.inode.nlink = 1 // from parent directory + return &link.inode +} + +// O_PATH is unimplemented, so there's no way to get a FileDescription +// representing a symlink yet. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go new file mode 100644 index 000000000..7be6faa5b --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -0,0 +1,299 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tmpfs provides a filesystem implementation that behaves like tmpfs: +// the Dentry tree is the sole source of truth for the state of the filesystem. +// +// Lock order: +// +// filesystem.mu +// regularFileFD.offMu +// regularFile.mu +// inode.mu +package tmpfs + +import ( + "fmt" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // memFile is used to allocate pages to for regular files. + memFile *pgalloc.MemoryFile + + // mu serializes changes to the Dentry tree. + mu sync.RWMutex + + nextInoMinusOne uint64 // accessed using atomic memory operations +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) + if memFileProvider == nil { + panic("MemoryFileProviderFromContext returned nil") + } + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + } + fs.vfsfs.Init(vfsObj, &fs) + root := fs.newDentry(fs.newDirectory(creds, 01777)) + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // inode is the inode represented by this dentry. Multiple Dentries may + // share a single non-directory inode (with hard links). inode is + // immutable. + inode *inode + + // tmpfs doesn't count references on dentries; because the dentry tree is + // the sole source of truth, it is by definition always consistent with the + // state of the filesystem. However, it does count references on inodes, + // because inode resources are released when all references are dropped. + // (tmpfs doesn't really have resources to release, but we implement + // reference counting because tmpfs regular files will.) + + // dentryEntry (ugh) links dentries into their parent directory.childList. + dentryEntry +} + +func (fs *filesystem) newDentry(inode *inode) *dentry { + d := &dentry{ + inode: inode, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + d.inode.decRef() +} + +// inode represents a filesystem object. +type inode struct { + // refs is a reference count. refs is accessed using atomic memory + // operations. + // + // A reference is held on all inodes that are reachable in the filesystem + // tree. For non-directories (which may have multiple hard links), this + // means that a reference is dropped when nlink reaches 0. For directories, + // nlink never reaches 0 due to the "." entry; instead, + // filesystem.RmdirAt() drops the reference. + refs int64 + + // Inode metadata; protected by mu and accessed using atomic memory + // operations unless otherwise specified. + mu sync.RWMutex + mode uint32 // excluding file type bits, which are based on impl + nlink uint32 // protected by filesystem.mu instead of inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable + + impl interface{} // immutable +} + +const maxLinks = math.MaxUint32 + +func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { + i.refs = 1 + i.mode = uint32(mode) + i.uid = uint32(creds.EffectiveKUID) + i.gid = uint32(creds.EffectiveKGID) + i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // i.nlink initialized by caller + i.impl = impl +} + +// incLinksLocked increments i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +// i.nlink < maxLinks. +func (i *inode) incLinksLocked() { + if i.nlink == 0 { + panic("tmpfs.inode.incLinksLocked() called with no existing links") + } + if i.nlink == maxLinks { + panic("memfs.inode.incLinksLocked() called with maximum link count") + } + atomic.AddUint32(&i.nlink, 1) +} + +// decLinksLocked decrements i's link count. +// +// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. +func (i *inode) decLinksLocked() { + if i.nlink == 0 { + panic("tmpfs.inode.decLinksLocked() called with no existing links") + } + atomic.AddUint32(&i.nlink, ^uint32(0)) +} + +func (i *inode) incRef() { + if atomic.AddInt64(&i.refs, 1) <= 1 { + panic("tmpfs.inode.incRef() called without holding a reference") + } +} + +func (i *inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&i.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { + return true + } + } +} + +func (i *inode) decRef() { + if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + // This is unnecessary; it's mostly to simulate what tmpfs would do. + if regFile, ok := i.impl.(*regularFile); ok { + regFile.mu.Lock() + regFile.data.DropAll(regFile.memFile) + atomic.StoreUint64(®File.size, 0) + regFile.mu.Unlock() + } + } else if refs < 0 { + panic("tmpfs.inode.decRef() called without holding a reference") + } +} + +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +} + +// Go won't inline this function, and returning linux.Statx (which is quite +// big) means spending a lot of time in runtime.duffcopy(), so instead it's an +// output parameter. +func (i *inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + stat.Blksize = 1 // usermem.PageSize in tmpfs + stat.Nlink = atomic.LoadUint32(&i.nlink) + stat.UID = atomic.LoadUint32(&i.uid) + stat.GID = atomic.LoadUint32(&i.gid) + stat.Mode = uint16(atomic.LoadUint32(&i.mode)) + stat.Ino = i.ino + // TODO: device number + switch impl := i.impl.(type) { + case *regularFile: + stat.Mode |= linux.S_IFREG + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(atomic.LoadUint64(&impl.size)) + // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in + // a uint64 accessed using atomic memory operations to avoid taking + // locks). + stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + stat.Mode |= linux.S_IFDIR + case *symlink: + stat.Mode |= linux.S_IFLNK + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(len(impl.target)) + stat.Blocks = allocatedBlocksForSize(stat.Size) + case *namedPipe: + stat.Mode |= linux.S_IFIFO + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// allocatedBlocksForSize returns the number of 512B blocks needed to +// accommodate the given size in bytes, as appropriate for struct +// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block +// size is independent of the "preferred block size for I/O", struct +// stat::st_blksize and struct statx::stx_blksize.) +func allocatedBlocksForSize(size uint64) uint64 { + return (size + 511) / 512 +} + +func (i *inode) direntType() uint8 { + switch i.impl.(type) { + case *regularFile: + return linux.DT_REG + case *directory: + return linux.DT_DIR + case *symlink: + return linux.DT_LNK + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// fileDescription is embedded by tmpfs implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) inode() *inode { + return fd.vfsfd.Dentry().Impl().(*dentry).inode +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement inode.setStat + return syserror.EPERM +} -- cgit v1.2.3 From db376e13924be59182ed4df95762328febf26298 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Jan 2020 10:30:50 -0800 Subject: Make /proc/[pid] offset start at TGID_OFFSET Updates #1195 PiperOrigin-RevId: 288725745 --- pkg/sentry/fsimpl/proc/tasks.go | 102 ++++++++++++++++++------ pkg/sentry/fsimpl/proc/tasks_test.go | 147 ++++++++++++++++++++++++++++++++++- 2 files changed, 223 insertions(+), 26 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 50b2a832f..d8f92d52f 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -27,7 +27,11 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -const defaultPermission = 0444 +const ( + defaultPermission = 0444 + selfName = "self" + threadSelfName = "thread-self" +) // InoGenerator generates unique inode numbers for a given filesystem. type InoGenerator interface { @@ -45,6 +49,11 @@ type tasksInode struct { inoGen InoGenerator pidns *kernel.PIDNamespace + + // '/proc/self' and '/proc/thread-self' have custom directory offsets in + // Linux. So handle them outside of OrderedChildren. + selfSymlink *vfs.Dentry + threadSelfSymlink *vfs.Dentry } var _ kernfs.Inode = (*tasksInode)(nil) @@ -54,20 +63,20 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames contents := map[string]*kernfs.Dentry{ //"cpuinfo": newCPUInfo(ctx, msrc), //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), - "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), - "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), - "self": newSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), - "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), - "thread-self": newThreadSelfSymlink(root, inoGen.NextIno(), defaultPermission, pidns), + "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), + "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), + "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), //"uptime": newUptime(ctx, msrc), //"version": newVersionData(root, inoGen.NextIno(), k), "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), } inode := &tasksInode{ - pidns: pidns, - inoGen: inoGen, + pidns: pidns, + inoGen: inoGen, + selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -86,6 +95,13 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro // Try to lookup a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) if err != nil { + // If it failed to parse, check if it's one of the special handled files. + switch name { + case selfName: + return i.selfSymlink, nil + case threadSelfName: + return i.threadSelfSymlink, nil + } return nil, syserror.ENOENT } @@ -104,41 +120,81 @@ func (i *tasksInode) Valid(ctx context.Context) bool { } // IterDirents implements kernfs.inodeDynamicLookup. -// -// TODO(gvisor.dev/issue/1195): Use tgid N offset = TGID_OFFSET + N. -func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { - var tids []int +func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { + // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 + const FIRST_PROCESS_ENTRY = 256 + + // Use maxTaskID to shortcut searches that will result in 0 entries. + const maxTaskID = kernel.TasksLimit + 1 + if offset >= maxTaskID { + return offset, nil + } + + // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories + // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by + // '/proc/thread-self' and then '/proc/[pid]'. + if offset < FIRST_PROCESS_ENTRY { + offset = FIRST_PROCESS_ENTRY + } + + if offset == FIRST_PROCESS_ENTRY { + dirent := vfs.Dirent{ + Name: selfName, + Type: linux.DT_LNK, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + if offset == FIRST_PROCESS_ENTRY+1 { + dirent := vfs.Dirent{ + Name: threadSelfName, + Type: linux.DT_LNK, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } - // Collect all tasks. Per linux we only include it in directory listings if - // it's the leader. But for whatever crazy reason, you can still walk to the - // given node. + // Collect all tasks that TGIDs are greater than the offset specified. Per + // Linux we only include in directory listings if it's the leader. But for + // whatever crazy reason, you can still walk to the given node. + var tids []int + startTid := offset - FIRST_PROCESS_ENTRY - 2 for _, tg := range i.pidns.ThreadGroups() { + tid := i.pidns.IDOfThreadGroup(tg) + if int64(tid) < startTid { + continue + } if leader := tg.Leader(); leader != nil { - tids = append(tids, int(i.pidns.IDOfThreadGroup(tg))) + tids = append(tids, int(tid)) } } if len(tids) == 0 { return offset, nil } - if relOffset >= int64(len(tids)) { - return offset, nil - } sort.Ints(tids) - for _, tid := range tids[relOffset:] { + for _, tid := range tids { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, Ino: i.inoGen.NextIno(), - NextOff: offset + 1, + NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if !cb.Handle(dirent) { return offset, nil } offset++ } - return offset, nil + return maxTaskID, nil } // Open implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 2560fcef9..ca8c87ec2 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -16,6 +16,7 @@ package proc import ( "fmt" + "math" "path" "strconv" "testing" @@ -30,6 +31,18 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +var ( + // Next offset 256 by convention. Adds 1 for the next offset. + selfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1} + threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1} + + // /proc/[pid] next offset starts at 256+2 (files above), then adds the + // PID, and adds 1 for the next offset. + proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1} + proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1} + proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1} +) + type testIterDirentsCallback struct { dirents []vfs.Dirent } @@ -59,9 +72,9 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { "loadavg": {Type: linux.DT_REG}, "meminfo": {Type: linux.DT_REG}, "mounts": {Type: linux.DT_LNK}, - "self": {Type: linux.DT_LNK}, + "self": selfLink, "stat": {Type: linux.DT_REG}, - "thread-self": {Type: linux.DT_LNK}, + "thread-self": threadSelfLink, "version": {Type: linux.DT_REG}, } return checkFiles(gots, wants) @@ -93,6 +106,9 @@ func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, e if want.Type != got.Type { return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got) } + if want.NextOff != 0 && want.NextOff != got.NextOff { + return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got) + } delete(wants, got.Name) gots = append(gots[0:i], gots[i+1:]...) @@ -154,7 +170,7 @@ func TestTasksEmpty(t *testing.T) { t.Error(err.Error()) } if len(cb.dirents) != 0 { - t.Error("found more files than expected: %+v", cb.dirents) + t.Errorf("found more files than expected: %+v", cb.dirents) } } @@ -216,6 +232,11 @@ func TestTasks(t *testing.T) { if !found { t.Errorf("Additional task ID %d listed: %v", pid, tasks) } + // Next offset starts at 256+2 ('self' and 'thread-self'), then adds the + // PID, and adds 1 for the next offset. + if want := int64(256 + 2 + pid + 1); d.NextOff != want { + t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d) + } } // Test lookup. @@ -246,6 +267,126 @@ func TestTasks(t *testing.T) { } } +func TestTasksOffset(t *testing.T) { + ctx, vfsObj, root, err := setup() + if err != nil { + t.Fatalf("Setup failed: %v", err) + } + defer root.DecRef() + + k := kernel.KernelFromContext(ctx) + for i := 0; i < 3; i++ { + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + t.Fatalf("CreateTask(): %v", err) + } + } + + for _, tc := range []struct { + name string + offset int64 + wants map[string]vfs.Dirent + }{ + { + name: "small offset", + offset: 100, + wants: map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "offset at start", + offset: 256, + wants: map[string]vfs.Dirent{ + "self": selfLink, + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip /proc/self", + offset: 257, + wants: map[string]vfs.Dirent{ + "thread-self": threadSelfLink, + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip symlinks", + offset: 258, + wants: map[string]vfs.Dirent{ + "1": proc1, + "2": proc2, + "3": proc3, + }, + }, + { + name: "skip first process", + offset: 260, + wants: map[string]vfs.Dirent{ + "2": proc2, + "3": proc3, + }, + }, + { + name: "last process", + offset: 261, + wants: map[string]vfs.Dirent{ + "3": proc3, + }, + }, + { + name: "after last", + offset: 262, + wants: nil, + }, + { + name: "TaskLimit+1", + offset: kernel.TasksLimit + 1, + wants: nil, + }, + { + name: "max", + offset: math.MaxInt64, + wants: nil, + }, + } { + t.Run(tc.name, func(t *testing.T) { + fd, err := vfsObj.OpenAt( + ctx, + auth.CredentialsFromContext(ctx), + &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + &vfs.OpenOptions{}, + ) + if err != nil { + t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + } + if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil { + t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) + } + + cb := testIterDirentsCallback{} + if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + t.Fatalf("IterDirents(): %v", err) + } + if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil { + t.Error(err.Error()) + } + if len(cb.dirents) != 0 { + t.Errorf("found more files than expected: %+v", cb.dirents) + } + }) + } +} + func TestTask(t *testing.T) { ctx, vfsObj, root, err := setup() if err != nil { -- cgit v1.2.3 From 565b64148314018e1234196182b55c4f01772e77 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 8 Jan 2020 16:32:50 -0800 Subject: Define sizes for extent headers and entries separately to improve clarity. PiperOrigin-RevId: 288799694 --- pkg/sentry/fsimpl/ext/disklayout/extent.go | 10 +++++++--- pkg/sentry/fsimpl/ext/disklayout/extent_test.go | 6 +++--- pkg/sentry/fsimpl/ext/extent_file.go | 8 ++++---- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go index 567523d32..4110649ab 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go @@ -29,8 +29,12 @@ package disklayout // byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()). const ( - // ExtentStructsSize is the size of all the three extent on-disk structs. - ExtentStructsSize = 12 + // ExtentHeaderSize is the size of the header of an extent tree node. + ExtentHeaderSize = 12 + + // ExtentEntrySize is the size of an entry in an extent tree node. + // This size is the same for both leaf and internal nodes. + ExtentEntrySize = 12 // ExtentMagic is the magic number which must be present in the header. ExtentMagic = 0xf30a @@ -57,7 +61,7 @@ type ExtentNode struct { Entries []ExtentEntryPair } -// ExtentEntry reprsents an extent tree node entry. The entry can either be +// ExtentEntry represents an extent tree node entry. The entry can either be // an ExtentIdx or Extent itself. This exists to simplify navigation logic. type ExtentEntry interface { // FileBlock returns the first file block number covered by this entry. diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go index b0fad9b71..8762b90db 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go +++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go @@ -21,7 +21,7 @@ import ( // TestExtentSize tests that the extent structs are of the correct // size. func TestExtentSize(t *testing.T) { - assertSize(t, ExtentHeader{}, ExtentStructsSize) - assertSize(t, ExtentIdx{}, ExtentStructsSize) - assertSize(t, Extent{}, ExtentStructsSize) + assertSize(t, ExtentHeader{}, ExtentHeaderSize) + assertSize(t, ExtentIdx{}, ExtentEntrySize) + assertSize(t, Extent{}, ExtentEntrySize) } diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 3d3ebaca6..11dcc0346 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -57,7 +57,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) { func (f *extentFile) buildExtTree() error { rootNodeData := f.regFile.inode.diskInode.Data() - binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header) + binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header) // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries. if f.root.Header.NumEntries > 4 { @@ -67,7 +67,7 @@ func (f *extentFile) buildExtTree() error { } f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries) - for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { var curEntry disklayout.ExtentEntry if f.root.Header.Height == 0 { // Leaf node. @@ -76,7 +76,7 @@ func (f *extentFile) buildExtTree() error { // Internal node. curEntry = &disklayout.ExtentIdx{} } - binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry) + binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry) f.root.Entries[i].Entry = curEntry } @@ -105,7 +105,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla } entries := make([]disklayout.ExtentEntryPair, header.NumEntries) - for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize { + for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize { var curEntry disklayout.ExtentEntry if header.Height == 0 { // Leaf node. -- cgit v1.2.3 From 27500d529f7fb87eef8812278fd1bbca67bcba72 Mon Sep 17 00:00:00 2001 From: Ian Gudger Date: Thu, 9 Jan 2020 22:00:42 -0800 Subject: New sync package. * Rename syncutil to sync. * Add aliases to sync types. * Replace existing usage of standard library sync package. This will make it easier to swap out synchronization primitives. For example, this will allow us to use primitives from github.com/sasha-s/go-deadlock to check for lock ordering violations. Updates #1472 PiperOrigin-RevId: 289033387 --- pkg/amutex/BUILD | 1 + pkg/amutex/amutex_test.go | 3 +- pkg/atomicbitops/BUILD | 1 + pkg/atomicbitops/atomic_bitops_test.go | 3 +- pkg/compressio/BUILD | 5 +- pkg/compressio/compressio.go | 2 +- pkg/control/server/BUILD | 1 + pkg/control/server/server.go | 2 +- pkg/eventchannel/BUILD | 2 + pkg/eventchannel/event.go | 2 +- pkg/eventchannel/event_test.go | 2 +- pkg/fdchannel/BUILD | 1 + pkg/fdchannel/fdchannel_test.go | 3 +- pkg/fdnotifier/BUILD | 1 + pkg/fdnotifier/fdnotifier.go | 2 +- pkg/flipcall/BUILD | 3 +- pkg/flipcall/flipcall_example_test.go | 3 +- pkg/flipcall/flipcall_test.go | 3 +- pkg/flipcall/flipcall_unsafe.go | 10 +- pkg/gate/BUILD | 1 + pkg/gate/gate_test.go | 2 +- pkg/linewriter/BUILD | 1 + pkg/linewriter/linewriter.go | 3 +- pkg/log/BUILD | 5 +- pkg/log/log.go | 2 +- pkg/metric/BUILD | 1 + pkg/metric/metric.go | 2 +- pkg/p9/BUILD | 1 + pkg/p9/client.go | 2 +- pkg/p9/p9test/BUILD | 2 + pkg/p9/p9test/client_test.go | 2 +- pkg/p9/p9test/p9test.go | 2 +- pkg/p9/path_tree.go | 3 +- pkg/p9/pool.go | 2 +- pkg/p9/server.go | 2 +- pkg/p9/transport.go | 2 +- pkg/procid/BUILD | 2 + pkg/procid/procid_test.go | 3 +- pkg/rand/BUILD | 5 +- pkg/rand/rand_linux.go | 2 +- pkg/refs/BUILD | 2 + pkg/refs/refcounter.go | 2 +- pkg/refs/refcounter_test.go | 3 +- pkg/sentry/arch/BUILD | 1 + pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/control/BUILD | 1 + pkg/sentry/control/pprof.go | 2 +- pkg/sentry/device/BUILD | 5 +- pkg/sentry/device/device.go | 2 +- pkg/sentry/fs/BUILD | 3 +- pkg/sentry/fs/copy_up.go | 2 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_cache.go | 3 +- pkg/sentry/fs/dirent_cache_limiter.go | 3 +- pkg/sentry/fs/fdpipe/BUILD | 1 + pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/file_overlay.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/fs.go | 3 +- pkg/sentry/fs/fsutil/BUILD | 1 + pkg/sentry/fs/fsutil/host_file_mapper.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 2 +- pkg/sentry/fs/fsutil/inode.go | 3 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fs/gofer/BUILD | 1 + pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/inode.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/tty.go | 3 +- pkg/sentry/fs/inode.go | 3 +- pkg/sentry/fs/inode_inotify.go | 3 +- pkg/sentry/fs/inotify.go | 2 +- pkg/sentry/fs/inotify_watch.go | 2 +- pkg/sentry/fs/lock/BUILD | 1 + pkg/sentry/fs/lock/lock.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/overlay.go | 5 +- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/seqfile/BUILD | 1 + pkg/sentry/fs/proc/seqfile/seqfile.go | 2 +- pkg/sentry/fs/proc/sys_net.go | 2 +- pkg/sentry/fs/ramfs/BUILD | 1 + pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/restore.go | 2 +- pkg/sentry/fs/tmpfs/BUILD | 1 + pkg/sentry/fs/tmpfs/inode_file.go | 2 +- pkg/sentry/fs/tty/BUILD | 1 + pkg/sentry/fs/tty/dir.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 2 +- pkg/sentry/fs/tty/queue.go | 3 +- pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/directory.go | 3 +- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/regular_file.go | 2 +- pkg/sentry/fsimpl/kernfs/BUILD | 2 + pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 5 +- pkg/sentry/kernel/abstract_socket_namespace.go | 2 +- pkg/sentry/kernel/auth/BUILD | 3 +- pkg/sentry/kernel/auth/user_namespace.go | 2 +- pkg/sentry/kernel/epoll/BUILD | 1 + pkg/sentry/kernel/epoll/epoll.go | 2 +- pkg/sentry/kernel/eventfd/BUILD | 1 + pkg/sentry/kernel/eventfd/eventfd.go | 2 +- pkg/sentry/kernel/fasync/BUILD | 1 + pkg/sentry/kernel/fasync/fasync.go | 3 +- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/kernel/fd_table_test.go | 2 +- pkg/sentry/kernel/fs_context.go | 2 +- pkg/sentry/kernel/futex/BUILD | 8 +- pkg/sentry/kernel/futex/futex.go | 3 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/memevent/BUILD | 1 + pkg/sentry/kernel/memevent/memory_events.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 1 + pkg/sentry/kernel/pipe/buffer.go | 2 +- pkg/sentry/kernel/pipe/node.go | 3 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/pipe/vfs.go | 3 +- pkg/sentry/kernel/semaphore/BUILD | 1 + pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/shm/BUILD | 1 + pkg/sentry/kernel/shm/shm.go | 2 +- pkg/sentry/kernel/signal_handlers.go | 3 +- pkg/sentry/kernel/signalfd/BUILD | 1 + pkg/sentry/kernel/signalfd/signalfd.go | 3 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/syslog.go | 3 +- pkg/sentry/kernel/task.go | 5 +- pkg/sentry/kernel/thread_group.go | 2 +- pkg/sentry/kernel/threads.go | 2 +- pkg/sentry/kernel/time/BUILD | 1 + pkg/sentry/kernel/time/time.go | 2 +- pkg/sentry/kernel/timekeeper.go | 2 +- pkg/sentry/kernel/tty.go | 2 +- pkg/sentry/kernel/uts_namespace.go | 3 +- pkg/sentry/limits/BUILD | 1 + pkg/sentry/limits/limits.go | 3 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/aio_context.go | 3 +- pkg/sentry/mm/mm.go | 8 +- pkg/sentry/pgalloc/BUILD | 1 + pkg/sentry/pgalloc/pgalloc.go | 2 +- pkg/sentry/platform/interrupt/BUILD | 1 + pkg/sentry/platform/interrupt/interrupt.go | 3 +- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go | 2 - pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- .../platform/ptrace/subprocess_linux_unsafe.go | 2 +- pkg/sentry/platform/ring0/defs.go | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 1 + pkg/sentry/platform/ring0/defs_arm64.go | 1 + pkg/sentry/platform/ring0/pagetables/BUILD | 5 +- pkg/sentry/platform/ring0/pagetables/pcids_x86.go | 2 +- pkg/sentry/socket/netlink/BUILD | 1 + pkg/sentry/socket/netlink/port/BUILD | 1 + pkg/sentry/socket/netlink/port/port.go | 3 +- pkg/sentry/socket/netlink/socket.go | 2 +- pkg/sentry/socket/netstack/BUILD | 1 + pkg/sentry/socket/netstack/netstack.go | 2 +- pkg/sentry/socket/rpcinet/conn/BUILD | 1 + pkg/sentry/socket/rpcinet/conn/conn.go | 2 +- pkg/sentry/socket/rpcinet/notifier/BUILD | 1 + pkg/sentry/socket/rpcinet/notifier/notifier.go | 2 +- pkg/sentry/socket/unix/transport/BUILD | 1 + pkg/sentry/socket/unix/transport/connectioned.go | 3 +- pkg/sentry/socket/unix/transport/queue.go | 3 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/error.go | 2 +- pkg/sentry/time/BUILD | 4 +- pkg/sentry/time/calibrated_clock.go | 2 +- pkg/sentry/usage/BUILD | 1 + pkg/sentry/usage/memory.go | 2 +- pkg/sentry/vfs/BUILD | 3 +- pkg/sentry/vfs/dentry.go | 2 +- pkg/sentry/vfs/file_description_impl_util.go | 2 +- pkg/sentry/vfs/mount_test.go | 3 +- pkg/sentry/vfs/mount_unsafe.go | 4 +- pkg/sentry/vfs/pathname.go | 3 +- pkg/sentry/vfs/resolving_path.go | 2 +- pkg/sentry/vfs/vfs.go | 2 +- pkg/sentry/watchdog/BUILD | 1 + pkg/sentry/watchdog/watchdog.go | 2 +- pkg/sync/BUILD | 53 +++++++ pkg/sync/LICENSE | 27 ++++ pkg/sync/README.md | 5 + pkg/sync/aliases.go | 37 +++++ pkg/sync/atomicptr_unsafe.go | 47 +++++++ pkg/sync/atomicptrtest/BUILD | 29 ++++ pkg/sync/atomicptrtest/atomicptr_test.go | 31 +++++ pkg/sync/downgradable_rwmutex_test.go | 150 ++++++++++++++++++++ pkg/sync/downgradable_rwmutex_unsafe.go | 146 ++++++++++++++++++++ pkg/sync/memmove_unsafe.go | 28 ++++ pkg/sync/norace_unsafe.go | 35 +++++ pkg/sync/race_unsafe.go | 41 ++++++ pkg/sync/seqatomic_unsafe.go | 72 ++++++++++ pkg/sync/seqatomictest/BUILD | 33 +++++ pkg/sync/seqatomictest/seqatomic_test.go | 132 ++++++++++++++++++ pkg/sync/seqcount.go | 149 ++++++++++++++++++++ pkg/sync/seqcount_test.go | 153 +++++++++++++++++++++ pkg/sync/syncutil.go | 7 + pkg/syncutil/BUILD | 52 ------- pkg/syncutil/LICENSE | 27 ---- pkg/syncutil/README.md | 5 - pkg/syncutil/atomicptr_unsafe.go | 47 ------- pkg/syncutil/atomicptrtest/BUILD | 29 ---- pkg/syncutil/atomicptrtest/atomicptr_test.go | 31 ----- pkg/syncutil/downgradable_rwmutex_test.go | 150 -------------------- pkg/syncutil/downgradable_rwmutex_unsafe.go | 146 -------------------- pkg/syncutil/memmove_unsafe.go | 28 ---- pkg/syncutil/norace_unsafe.go | 35 ----- pkg/syncutil/race_unsafe.go | 41 ------ pkg/syncutil/seqatomic_unsafe.go | 72 ---------- pkg/syncutil/seqatomictest/BUILD | 35 ----- pkg/syncutil/seqatomictest/seqatomic_test.go | 132 ------------------ pkg/syncutil/seqcount.go | 149 -------------------- pkg/syncutil/seqcount_test.go | 153 --------------------- pkg/syncutil/syncutil.go | 7 - pkg/tcpip/BUILD | 1 + pkg/tcpip/adapters/gonet/BUILD | 1 + pkg/tcpip/adapters/gonet/gonet.go | 2 +- pkg/tcpip/link/fdbased/BUILD | 1 + pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/link/sharedmem/BUILD | 2 + pkg/tcpip/link/sharedmem/pipe/BUILD | 1 + pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 3 +- pkg/tcpip/link/sharedmem/sharedmem.go | 2 +- pkg/tcpip/link/sharedmem/sharedmem_test.go | 2 +- pkg/tcpip/network/fragmentation/BUILD | 1 + pkg/tcpip/network/fragmentation/fragmentation.go | 2 +- pkg/tcpip/network/fragmentation/reassembler.go | 2 +- pkg/tcpip/ports/BUILD | 1 + pkg/tcpip/ports/ports.go | 2 +- pkg/tcpip/stack/BUILD | 2 + pkg/tcpip/stack/linkaddrcache.go | 2 +- pkg/tcpip/stack/linkaddrcache_test.go | 2 +- pkg/tcpip/stack/nic.go | 2 +- pkg/tcpip/stack/stack.go | 2 +- pkg/tcpip/stack/transport_demuxer.go | 2 +- pkg/tcpip/tcpip.go | 2 +- pkg/tcpip/transport/icmp/BUILD | 1 + pkg/tcpip/transport/icmp/endpoint.go | 3 +- pkg/tcpip/transport/packet/BUILD | 1 + pkg/tcpip/transport/packet/endpoint.go | 3 +- pkg/tcpip/transport/raw/BUILD | 1 + pkg/tcpip/transport/raw/endpoint.go | 3 +- pkg/tcpip/transport/tcp/BUILD | 1 + pkg/tcpip/transport/tcp/accept.go | 2 +- pkg/tcpip/transport/tcp/connect.go | 2 +- pkg/tcpip/transport/tcp/endpoint.go | 2 +- pkg/tcpip/transport/tcp/endpoint_state.go | 2 +- pkg/tcpip/transport/tcp/forwarder.go | 3 +- pkg/tcpip/transport/tcp/protocol.go | 2 +- pkg/tcpip/transport/tcp/segment_queue.go | 2 +- pkg/tcpip/transport/tcp/snd.go | 2 +- pkg/tcpip/transport/udp/BUILD | 1 + pkg/tcpip/transport/udp/endpoint.go | 3 +- pkg/tmutex/BUILD | 1 + pkg/tmutex/tmutex_test.go | 3 +- pkg/unet/BUILD | 1 + pkg/unet/unet_test.go | 3 +- pkg/urpc/BUILD | 1 + pkg/urpc/urpc.go | 2 +- pkg/waiter/BUILD | 1 + pkg/waiter/waiter.go | 2 +- runsc/boot/BUILD | 2 + runsc/boot/compat.go | 2 +- runsc/boot/limits.go | 2 +- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/cmd/BUILD | 1 + runsc/cmd/create.go | 1 + runsc/cmd/gofer.go | 2 +- runsc/cmd/start.go | 1 + runsc/container/BUILD | 2 + runsc/container/console_test.go | 2 +- runsc/container/container_test.go | 2 +- runsc/container/multi_container_test.go | 2 +- runsc/container/state_file.go | 2 +- runsc/fsgofer/BUILD | 1 + runsc/fsgofer/fsgofer.go | 2 +- runsc/sandbox/BUILD | 1 + runsc/sandbox/sandbox.go | 2 +- runsc/testutil/BUILD | 1 + runsc/testutil/testutil.go | 2 +- 303 files changed, 1507 insertions(+), 1368 deletions(-) create mode 100644 pkg/sync/BUILD create mode 100644 pkg/sync/LICENSE create mode 100644 pkg/sync/README.md create mode 100644 pkg/sync/aliases.go create mode 100644 pkg/sync/atomicptr_unsafe.go create mode 100644 pkg/sync/atomicptrtest/BUILD create mode 100644 pkg/sync/atomicptrtest/atomicptr_test.go create mode 100644 pkg/sync/downgradable_rwmutex_test.go create mode 100644 pkg/sync/downgradable_rwmutex_unsafe.go create mode 100644 pkg/sync/memmove_unsafe.go create mode 100644 pkg/sync/norace_unsafe.go create mode 100644 pkg/sync/race_unsafe.go create mode 100644 pkg/sync/seqatomic_unsafe.go create mode 100644 pkg/sync/seqatomictest/BUILD create mode 100644 pkg/sync/seqatomictest/seqatomic_test.go create mode 100644 pkg/sync/seqcount.go create mode 100644 pkg/sync/seqcount_test.go create mode 100644 pkg/sync/syncutil.go delete mode 100644 pkg/syncutil/BUILD delete mode 100644 pkg/syncutil/LICENSE delete mode 100644 pkg/syncutil/README.md delete mode 100644 pkg/syncutil/atomicptr_unsafe.go delete mode 100644 pkg/syncutil/atomicptrtest/BUILD delete mode 100644 pkg/syncutil/atomicptrtest/atomicptr_test.go delete mode 100644 pkg/syncutil/downgradable_rwmutex_test.go delete mode 100644 pkg/syncutil/downgradable_rwmutex_unsafe.go delete mode 100644 pkg/syncutil/memmove_unsafe.go delete mode 100644 pkg/syncutil/norace_unsafe.go delete mode 100644 pkg/syncutil/race_unsafe.go delete mode 100644 pkg/syncutil/seqatomic_unsafe.go delete mode 100644 pkg/syncutil/seqatomictest/BUILD delete mode 100644 pkg/syncutil/seqatomictest/seqatomic_test.go delete mode 100644 pkg/syncutil/seqcount.go delete mode 100644 pkg/syncutil/seqcount_test.go delete mode 100644 pkg/syncutil/syncutil.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index 6bc486b62..d99e37b40 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -15,4 +15,5 @@ go_test( size = "small", srcs = ["amutex_test.go"], embed = [":amutex"], + deps = ["//pkg/sync"], ) diff --git a/pkg/amutex/amutex_test.go b/pkg/amutex/amutex_test.go index 1d7f45641..8a3952f2a 100644 --- a/pkg/amutex/amutex_test.go +++ b/pkg/amutex/amutex_test.go @@ -15,9 +15,10 @@ package amutex import ( - "sync" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) type sleeper struct { diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 36beaade9..6403c60c2 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -20,4 +20,5 @@ go_test( size = "small", srcs = ["atomic_bitops_test.go"], embed = [":atomicbitops"], + deps = ["//pkg/sync"], ) diff --git a/pkg/atomicbitops/atomic_bitops_test.go b/pkg/atomicbitops/atomic_bitops_test.go index 965e9be79..9466d3e23 100644 --- a/pkg/atomicbitops/atomic_bitops_test.go +++ b/pkg/atomicbitops/atomic_bitops_test.go @@ -16,8 +16,9 @@ package atomicbitops import ( "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) const iterations = 100 diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index a0b21d4bd..2bb581b18 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -8,7 +8,10 @@ go_library( srcs = ["compressio.go"], importpath = "gvisor.dev/gvisor/pkg/compressio", visibility = ["//:sandbox"], - deps = ["//pkg/binary"], + deps = [ + "//pkg/binary", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/compressio/compressio.go b/pkg/compressio/compressio.go index 3b0bb086e..5f52cbe74 100644 --- a/pkg/compressio/compressio.go +++ b/pkg/compressio/compressio.go @@ -52,9 +52,9 @@ import ( "hash" "io" "runtime" - "sync" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sync" ) var bufPool = sync.Pool{ diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index 21adf3adf..adbd1e3f8 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -9,6 +9,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", ], diff --git a/pkg/control/server/server.go b/pkg/control/server/server.go index a56152d10..41abe1f2d 100644 --- a/pkg/control/server/server.go +++ b/pkg/control/server/server.go @@ -22,9 +22,9 @@ package server import ( "os" - "sync" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" ) diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 0b4b7cc44..9d68682c7 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ ":eventchannel_go_proto", "//pkg/log", + "//pkg/sync", "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_golang_protobuf//ptypes:go_default_library_gen", @@ -40,6 +41,7 @@ go_test( srcs = ["event_test.go"], embed = [":eventchannel"], deps = [ + "//pkg/sync", "@com_github_golang_protobuf//proto:go_default_library", ], ) diff --git a/pkg/eventchannel/event.go b/pkg/eventchannel/event.go index d37ad0428..9a29c58bd 100644 --- a/pkg/eventchannel/event.go +++ b/pkg/eventchannel/event.go @@ -22,13 +22,13 @@ package eventchannel import ( "encoding/binary" "fmt" - "sync" "syscall" "github.com/golang/protobuf/proto" "github.com/golang/protobuf/ptypes" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go index 3649097d6..7f41b4a27 100644 --- a/pkg/eventchannel/event_test.go +++ b/pkg/eventchannel/event_test.go @@ -16,11 +16,11 @@ package eventchannel import ( "fmt" - "sync" "testing" "time" "github.com/golang/protobuf/proto" + "gvisor.dev/gvisor/pkg/sync" ) // testEmitter is an emitter that can be used in tests. It records all events diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD index 56495cbd9..b0478c672 100644 --- a/pkg/fdchannel/BUILD +++ b/pkg/fdchannel/BUILD @@ -15,4 +15,5 @@ go_test( size = "small", srcs = ["fdchannel_test.go"], embed = [":fdchannel"], + deps = ["//pkg/sync"], ) diff --git a/pkg/fdchannel/fdchannel_test.go b/pkg/fdchannel/fdchannel_test.go index 5d01dc636..7a8a63a59 100644 --- a/pkg/fdchannel/fdchannel_test.go +++ b/pkg/fdchannel/fdchannel_test.go @@ -17,10 +17,11 @@ package fdchannel import ( "io/ioutil" "os" - "sync" "syscall" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func TestSendRecvFD(t *testing.T) { diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index aca2d8a82..91a202a30 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -11,6 +11,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/fdnotifier", visibility = ["//:sandbox"], deps = [ + "//pkg/sync", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/fdnotifier/fdnotifier.go b/pkg/fdnotifier/fdnotifier.go index f4aae1953..a6b63c982 100644 --- a/pkg/fdnotifier/fdnotifier.go +++ b/pkg/fdnotifier/fdnotifier.go @@ -22,10 +22,10 @@ package fdnotifier import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index e590a71ba..85bd83af1 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -19,7 +19,7 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/memutil", - "//pkg/syncutil", + "//pkg/sync", ], ) @@ -31,4 +31,5 @@ go_test( "flipcall_test.go", ], embed = [":flipcall"], + deps = ["//pkg/sync"], ) diff --git a/pkg/flipcall/flipcall_example_test.go b/pkg/flipcall/flipcall_example_test.go index 8d88b845d..2e28a149a 100644 --- a/pkg/flipcall/flipcall_example_test.go +++ b/pkg/flipcall/flipcall_example_test.go @@ -17,7 +17,8 @@ package flipcall import ( "bytes" "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) func Example() { diff --git a/pkg/flipcall/flipcall_test.go b/pkg/flipcall/flipcall_test.go index 168a487ec..33fd55a44 100644 --- a/pkg/flipcall/flipcall_test.go +++ b/pkg/flipcall/flipcall_test.go @@ -16,9 +16,10 @@ package flipcall import ( "runtime" - "sync" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) var testPacketWindowSize = pageSize diff --git a/pkg/flipcall/flipcall_unsafe.go b/pkg/flipcall/flipcall_unsafe.go index 27b8939fc..ac974b232 100644 --- a/pkg/flipcall/flipcall_unsafe.go +++ b/pkg/flipcall/flipcall_unsafe.go @@ -18,7 +18,7 @@ import ( "reflect" "unsafe" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // Packets consist of a 16-byte header followed by an arbitrarily-sized @@ -75,13 +75,13 @@ func (ep *Endpoint) Data() []byte { var ioSync int64 func raceBecomeActive() { - if syncutil.RaceEnabled { - syncutil.RaceAcquire((unsafe.Pointer)(&ioSync)) + if sync.RaceEnabled { + sync.RaceAcquire((unsafe.Pointer)(&ioSync)) } } func raceBecomeInactive() { - if syncutil.RaceEnabled { - syncutil.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) + if sync.RaceEnabled { + sync.RaceReleaseMerge((unsafe.Pointer)(&ioSync)) } } diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index 4b9321711..f22bd070d 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -19,5 +19,6 @@ go_test( ], deps = [ ":gate", + "//pkg/sync", ], ) diff --git a/pkg/gate/gate_test.go b/pkg/gate/gate_test.go index 5dbd8d712..850693df8 100644 --- a/pkg/gate/gate_test.go +++ b/pkg/gate/gate_test.go @@ -15,11 +15,11 @@ package gate_test import ( - "sync" "testing" "time" "gvisor.dev/gvisor/pkg/gate" + "gvisor.dev/gvisor/pkg/sync" ) func TestBasicEnter(t *testing.T) { diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index a5d980d14..bcde6d308 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -8,6 +8,7 @@ go_library( srcs = ["linewriter.go"], importpath = "gvisor.dev/gvisor/pkg/linewriter", visibility = ["//visibility:public"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/linewriter/linewriter.go b/pkg/linewriter/linewriter.go index cd6e4e2ce..a1b1285d4 100644 --- a/pkg/linewriter/linewriter.go +++ b/pkg/linewriter/linewriter.go @@ -17,7 +17,8 @@ package linewriter import ( "bytes" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Writer is an io.Writer which buffers input, flushing diff --git a/pkg/log/BUILD b/pkg/log/BUILD index fc5f5779b..0df0f2849 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -16,7 +16,10 @@ go_library( visibility = [ "//visibility:public", ], - deps = ["//pkg/linewriter"], + deps = [ + "//pkg/linewriter", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/log/log.go b/pkg/log/log.go index 9387586e6..91a81b288 100644 --- a/pkg/log/log.go +++ b/pkg/log/log.go @@ -25,12 +25,12 @@ import ( stdlog "log" "os" "runtime" - "sync" "sync/atomic" "syscall" "time" "gvisor.dev/gvisor/pkg/linewriter" + "gvisor.dev/gvisor/pkg/sync" ) // Level is the log level. diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index dd6ca6d39..9145f3233 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -14,6 +14,7 @@ go_library( ":metric_go_proto", "//pkg/eventchannel", "//pkg/log", + "//pkg/sync", ], ) diff --git a/pkg/metric/metric.go b/pkg/metric/metric.go index eadde06e4..93d4f2b8c 100644 --- a/pkg/metric/metric.go +++ b/pkg/metric/metric.go @@ -18,12 +18,12 @@ package metric import ( "errors" "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index f32244c69..a3e05c96d 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/fdchannel", "//pkg/flipcall", "//pkg/log", + "//pkg/sync", "//pkg/unet", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/p9/client.go b/pkg/p9/client.go index 221516c6c..4045e41fa 100644 --- a/pkg/p9/client.go +++ b/pkg/p9/client.go @@ -17,12 +17,12 @@ package p9 import ( "errors" "fmt" - "sync" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index 28707c0ca..f4edd68b2 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -70,6 +70,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/sync", "//pkg/unet", "@com_github_golang_mock//gomock:go_default_library", ], @@ -83,6 +84,7 @@ go_test( deps = [ "//pkg/fd", "//pkg/p9", + "//pkg/sync", "@com_github_golang_mock//gomock:go_default_library", ], ) diff --git a/pkg/p9/p9test/client_test.go b/pkg/p9/p9test/client_test.go index 6e758148d..6e7bb3db2 100644 --- a/pkg/p9/p9test/client_test.go +++ b/pkg/p9/p9test/client_test.go @@ -22,7 +22,6 @@ import ( "os" "reflect" "strings" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( "github.com/golang/mock/gomock" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" ) func TestPanic(t *testing.T) { diff --git a/pkg/p9/p9test/p9test.go b/pkg/p9/p9test/p9test.go index 4d3271b37..dd8b01b6d 100644 --- a/pkg/p9/p9test/p9test.go +++ b/pkg/p9/p9test/p9test.go @@ -17,13 +17,13 @@ package p9test import ( "fmt" - "sync" "sync/atomic" "syscall" "testing" "github.com/golang/mock/gomock" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/path_tree.go b/pkg/p9/path_tree.go index 865459411..72ef53313 100644 --- a/pkg/p9/path_tree.go +++ b/pkg/p9/path_tree.go @@ -16,7 +16,8 @@ package p9 import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // pathNode is a single node in a path traversal. diff --git a/pkg/p9/pool.go b/pkg/p9/pool.go index 52de889e1..2b14a5ce3 100644 --- a/pkg/p9/pool.go +++ b/pkg/p9/pool.go @@ -15,7 +15,7 @@ package p9 import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // pool is a simple allocator. diff --git a/pkg/p9/server.go b/pkg/p9/server.go index 40b8fa023..fdfa83648 100644 --- a/pkg/p9/server.go +++ b/pkg/p9/server.go @@ -17,7 +17,6 @@ package p9 import ( "io" "runtime/debug" - "sync" "sync/atomic" "syscall" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/p9/transport.go b/pkg/p9/transport.go index 6e8b4bbcd..9c11e28ce 100644 --- a/pkg/p9/transport.go +++ b/pkg/p9/transport.go @@ -19,11 +19,11 @@ import ( "fmt" "io" "io/ioutil" - "sync" "syscall" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD index 078f084b2..b506813f0 100644 --- a/pkg/procid/BUILD +++ b/pkg/procid/BUILD @@ -21,6 +21,7 @@ go_test( "procid_test.go", ], embed = [":procid"], + deps = ["//pkg/sync"], ) go_test( @@ -31,4 +32,5 @@ go_test( "procid_test.go", ], embed = [":procid"], + deps = ["//pkg/sync"], ) diff --git a/pkg/procid/procid_test.go b/pkg/procid/procid_test.go index 88dd0b3ae..9ec08c3d6 100644 --- a/pkg/procid/procid_test.go +++ b/pkg/procid/procid_test.go @@ -17,9 +17,10 @@ package procid import ( "os" "runtime" - "sync" "syscall" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) // runOnMain is used to send functions to run on the main (initial) thread. diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index f4f2001f3..9d5b4859b 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -10,5 +10,8 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/rand", visibility = ["//:sandbox"], - deps = ["@org_golang_x_sys//unix:go_default_library"], + deps = [ + "//pkg/sync", + "@org_golang_x_sys//unix:go_default_library", + ], ) diff --git a/pkg/rand/rand_linux.go b/pkg/rand/rand_linux.go index 2b92db3e6..0bdad5fad 100644 --- a/pkg/rand/rand_linux.go +++ b/pkg/rand/rand_linux.go @@ -19,9 +19,9 @@ package rand import ( "crypto/rand" "io" - "sync" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" ) // reader implements an io.Reader that returns pseudorandom bytes. diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 7ad59dfd7..974d9af9b 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -27,6 +27,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", ], ) @@ -35,4 +36,5 @@ go_test( size = "small", srcs = ["refcounter_test.go"], embed = [":refs"], + deps = ["//pkg/sync"], ) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index ad69e0757..c45ba8200 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -21,10 +21,10 @@ import ( "fmt" "reflect" "runtime" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) // RefCounter is the interface to be implemented by objects that are reference diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index ffd3d3f07..1ab4a4440 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -16,8 +16,9 @@ package refs import ( "reflect" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) type testCounter struct { diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 18c73cc24..ae3e364cd 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/limits", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 9294ac773..9f41e566f 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -19,7 +19,6 @@ package arch import ( "fmt" "io" - "sync" "syscall" "gvisor.dev/gvisor/pkg/binary" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 5522cecd0..2561a6109 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/sentry/strace", "//pkg/sentry/usage", "//pkg/sentry/watchdog", + "//pkg/sync", "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go index e1f2fea60..151808911 100644 --- a/pkg/sentry/control/pprof.go +++ b/pkg/sentry/control/pprof.go @@ -19,10 +19,10 @@ import ( "runtime" "runtime/pprof" "runtime/trace" - "sync" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" ) diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 1098ed777..97fa1512c 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -8,7 +8,10 @@ go_library( srcs = ["device.go"], importpath = "gvisor.dev/gvisor/pkg/sentry/device", visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/abi/linux"], + deps = [ + "//pkg/abi/linux", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/sentry/device/device.go b/pkg/sentry/device/device.go index 47945d1a7..69e71e322 100644 --- a/pkg/sentry/device/device.go +++ b/pkg/sentry/device/device.go @@ -19,10 +19,10 @@ package device import ( "bytes" "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" ) // Registry tracks all simple devices and related state on the system for diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index c035ffff7..7d5d72d5a 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -68,7 +68,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], @@ -115,6 +115,7 @@ go_test( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index 9ac62c84d..734177e90 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -17,12 +17,12 @@ package fs import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 1d80bf15a..738580c5f 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -19,13 +19,13 @@ import ( "crypto/rand" "fmt" "io" - "sync" "testing" "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) const ( diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 3cb73bd78..31fc4d87b 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -18,7 +18,6 @@ import ( "fmt" "path" "sort" - "sync" "sync/atomic" "syscall" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 60a15a275..25514ace4 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // DirentCache is an LRU cache of Dirents. The Dirent's refCount is diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go index ebb80bd50..525ee25f9 100644 --- a/pkg/sentry/fs/dirent_cache_limiter.go +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // DirentCacheLimiter acts as a global limit for all dirent caches in the diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 277ee4c31..cc43de69d 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -23,6 +23,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/safemem", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 669ffcb75..5b6cfeb0a 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -17,7 +17,6 @@ package fdpipe import ( "os" - "sync" "syscall" "gvisor.dev/gvisor/pkg/fd" @@ -29,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index 29175fb3d..cee87f726 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -17,10 +17,10 @@ package fdpipe import ( "fmt" "io/ioutil" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" ) // beforeSave is invoked by stateify. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index a2f966cb6..7c4586296 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -16,7 +16,6 @@ package fs import ( "math" - "sync" "sync/atomic" "time" @@ -29,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 225e40186..8a633b1ba 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -16,13 +16,13 @@ package fs import ( "io" - "sync" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index b157fd228..c5b51620a 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -18,9 +18,9 @@ import ( "fmt" "sort" "strings" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" ) // FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags. diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 8b2a5e6b2..26abf49e2 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -54,10 +54,9 @@ package fs import ( - "sync" - "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 9ca695a95..945b6270d 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -93,6 +93,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index b06a71cc2..837fc70b5 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -16,7 +16,6 @@ package fsutil import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/log" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // HostFileMapper caches mappings of an arbitrary host file descriptor. It is diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 30475f340..a625f0e26 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -16,7 +16,6 @@ package fsutil import ( "math" - "sync" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // HostMappable implements memmap.Mappable and platform.File over a diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index 4e100a402..adf5ec69c 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -15,13 +15,12 @@ package fsutil import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 798920d18..20a014402 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -17,7 +17,6 @@ package fsutil import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // Lock order (compare the lock order model in mm/mm.go): diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index 4a005c605..fd870e8e1 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 91263ebdc..245fe2ef1 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -16,7 +16,6 @@ package gofer import ( "errors" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index 4e358a46a..edc796ce0 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -16,7 +16,6 @@ package gofer import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 23daeb528..2b581aa69 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index a6e4a09e3..873a1c52d 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -15,7 +15,6 @@ package host import ( - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 107336a3e..c076d5bdd 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -16,7 +16,6 @@ package host import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -30,6 +29,7 @@ import ( unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 90331e3b2..753ef8cd6 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -15,8 +15,6 @@ package host import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index 91e2fde2f..468043df0 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -15,8 +15,6 @@ package fs import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" @@ -26,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index 0f2a66a79..efd3c962b 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -16,7 +16,8 @@ package fs import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Watches is the collection of inotify watches on an inode. diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index ba3e0233d..cc7dd1c92 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -16,7 +16,6 @@ package fs import ( "io" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 0aa0a5e9b..900cba3ca 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -15,10 +15,10 @@ package fs import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" ) // Watch represent a particular inotify watch created by inotify_add_watch. diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 8d62642e7..2c332a82a 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -44,6 +44,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 636484424..41b040818 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -52,9 +52,9 @@ package lock import ( "fmt" "math" - "sync" "syscall" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index ac0398bd9..db3dfd096 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -19,7 +19,6 @@ import ( "math" "path" "strings" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index 25573e986..4cad55327 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -17,13 +17,12 @@ package fs import ( "fmt" "strings" - "sync" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -199,7 +198,7 @@ type overlayEntry struct { upper *Inode // dirCacheMu protects dirCache. - dirCacheMu syncutil.DowngradableRWMutex `state:"nosave"` + dirCacheMu sync.DowngradableRWMutex `state:"nosave"` // dirCache is cache of DentAttrs from upper and lower Inodes. dirCache *SortedDentryMap diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 75cbb0622..94d46ab1b 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -51,6 +51,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/waiter", diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index fe7067be1..38b246dff 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index 5fe823000..f9af191d5 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -17,7 +17,6 @@ package seqfile import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -26,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index bd93f83fa..a37e1fa06 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -17,7 +17,6 @@ package proc import ( "fmt" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 012cb3e44..3fb7b0633 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index 78e082b8e..dcbb8eb2e 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -17,7 +17,6 @@ package ramfs import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go index f10168125..64c6a6ae9 100644 --- a/pkg/sentry/fs/restore.go +++ b/pkg/sentry/fs/restore.go @@ -15,7 +15,7 @@ package fs import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // RestoreEnvironment is the restore environment for file systems. It consists diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 59ce400c2..3400b940c 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f86dfaa36..f1c87fe41 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -17,7 +17,6 @@ package tmpfs import ( "fmt" "io" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 95ad98cb0..f6f60d0cf 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 2f639c823..88aa66b24 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -19,7 +19,6 @@ import ( "fmt" "math" "strconv" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 7cc0eb409..894964260 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -16,13 +16,13 @@ package tty import ( "bytes" - "sync" "unicode/utf8" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 231e4e6eb..8b5d4699a 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -15,13 +15,12 @@ package tty import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index bc90330bc..903874141 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -50,6 +50,7 @@ go_library( "//pkg/sentry/syscalls/linux", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 91802dc1e..8944171c8 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -15,8 +15,6 @@ package ext import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" @@ -25,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 616fc002a..9afb1a84c 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -17,13 +17,13 @@ package ext import ( "errors" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index aec33e00a..d11153c90 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -16,7 +16,6 @@ package ext import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 39c03ee9d..809178250 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -39,6 +39,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", ], ) @@ -56,6 +57,7 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "@com_github_google_go-cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 752e0f659..1d469a0db 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -16,7 +16,6 @@ package kernfs import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index d69b299ae..bb12f39a2 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -53,7 +53,6 @@ package kernfs import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -61,6 +60,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" ) // FilesystemType implements vfs.FilesystemType. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 4b6b95f5f..5c9d580e1 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -19,7 +19,6 @@ import ( "fmt" "io" "runtime" - "sync" "testing" "github.com/google/go-cmp/cmp" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index a5b285987..82f5c2f41 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -47,6 +47,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index f51e247a7..f200e767d 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -17,7 +17,6 @@ package tmpfs import ( "io" "math" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 7be6faa5b..701826f90 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -26,7 +26,6 @@ package tmpfs import ( "fmt" "math" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 2706927ff..ac85ba0c8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -35,7 +35,7 @@ go_template_instance( out = "seqatomic_taskgoroutineschedinfo_unsafe.go", package = "kernel", suffix = "TaskGoroutineSchedInfo", - template = "//pkg/syncutil:generic_seqatomic", + template = "//pkg/sync:generic_seqatomic", types = { "Value": "TaskGoroutineSchedInfo", }, @@ -209,7 +209,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", @@ -241,6 +241,7 @@ go_test( "//pkg/sentry/time", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 244655b5c..920fe4329 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -15,11 +15,11 @@ package kernel import ( - "sync" "syscall" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 04c244447..1aa72fa47 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -8,7 +8,7 @@ go_template_instance( out = "atomicptr_credentials_unsafe.go", package = "auth", suffix = "Credentials", - template = "//pkg/syncutil:generic_atomicptr", + template = "//pkg/sync:generic_atomicptr", types = { "Value": "Credentials", }, @@ -64,6 +64,7 @@ go_library( "//pkg/bits", "//pkg/log", "//pkg/sentry/context", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/auth/user_namespace.go b/pkg/sentry/kernel/auth/user_namespace.go index af28ccc65..9dd52c860 100644 --- a/pkg/sentry/kernel/auth/user_namespace.go +++ b/pkg/sentry/kernel/auth/user_namespace.go @@ -16,8 +16,8 @@ package auth import ( "math" - "sync" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 3361e8b7d..c47f6b6fc 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 9c0a4e1b4..430311cc0 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -18,7 +18,6 @@ package epoll import ( "fmt" - "sync" "syscall" "gvisor.dev/gvisor/pkg/refs" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index e65b961e8..c831fbab2 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 12f0d429b..687690679 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -18,7 +18,6 @@ package eventfd import ( "math" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 49d81b712..6b36bc63e 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/fasync/fasync.go b/pkg/sentry/kernel/fasync/fasync.go index 6b0bb0324..d32c3e90a 100644 --- a/pkg/sentry/kernel/fasync/fasync.go +++ b/pkg/sentry/kernel/fasync/fasync.go @@ -16,12 +16,11 @@ package fasync import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 11f613a11..cd1501f85 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -18,7 +18,6 @@ import ( "bytes" "fmt" "math" - "sync" "sync/atomic" "syscall" @@ -28,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) // FDFlags define flags for an individual descriptor. diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 2bcb6216a..eccb7d1e7 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -16,7 +16,6 @@ package kernel import ( "runtime" - "sync" "testing" "gvisor.dev/gvisor/pkg/sentry/context" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) const ( diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index ded27d668..2448c1d99 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -16,10 +16,10 @@ package kernel import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" ) // FSContext contains filesystem context. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 75ec31761..50db443ce 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "atomicptr_bucket_unsafe.go", package = "futex", suffix = "Bucket", - template = "//pkg/syncutil:generic_atomicptr", + template = "//pkg/sync:generic_atomicptr", types = { "Value": "bucket", }, @@ -42,6 +42,7 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/memmap", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) @@ -51,5 +52,8 @@ go_test( size = "small", srcs = ["futex_test.go"], embed = [":futex"], - deps = ["//pkg/sentry/usermem"], + deps = [ + "//pkg/sentry/usermem", + "//pkg/sync", + ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index 278cc8143..d1931c8f4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -18,11 +18,10 @@ package futex import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 65e5d1428..c23126ca5 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -17,13 +17,13 @@ package futex import ( "math" "runtime" - "sync" "sync/atomic" "syscall" "testing" "unsafe" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // testData implements the Target interface, and allows us to diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 8653d2f63..c85e97fef 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -36,7 +36,6 @@ import ( "fmt" "io" "path/filepath" - "sync" "sync/atomic" "time" @@ -67,6 +66,7 @@ import ( uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index d7a7d1169..7f36252a9 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/metric", "//pkg/sentry/kernel", "//pkg/sentry/usage", + "//pkg/sync", ], ) diff --git a/pkg/sentry/kernel/memevent/memory_events.go b/pkg/sentry/kernel/memevent/memory_events.go index b0d98e7f0..200565bb8 100644 --- a/pkg/sentry/kernel/memevent/memory_events.go +++ b/pkg/sentry/kernel/memevent/memory_events.go @@ -17,7 +17,6 @@ package memevent import ( - "sync" "time" "gvisor.dev/gvisor/pkg/eventchannel" @@ -26,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" ) var totalTicks = metric.MustCreateNewUint64Metric("/memory_events/ticks", false /*sync*/, "Total number of memory event periods that have elapsed since startup.") diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 9d34f6d4d..5eeaeff66 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usermem", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 95bee2d37..1c0f34269 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -16,9 +16,9 @@ package pipe import ( "io" - "sync" "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/sync" ) // buffer encapsulates a queueable byte buffer. diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4a19ab7ce..716f589af 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -15,12 +15,11 @@ package pipe import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 1a1b38f83..e4fd7d420 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -17,12 +17,12 @@ package pipe import ( "fmt" - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index ef9641e6a..8394eb78b 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -17,7 +17,6 @@ package pipe import ( "io" "math" - "sync" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 6416e0dd8..bf7461cbb 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -15,13 +15,12 @@ package pipe import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index f4c00cd86..13a961594 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index de9617e9d..18299814e 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -17,7 +17,6 @@ package semaphore import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -25,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index cd48945e6..7321b22ed 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -24,6 +24,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 19034a21e..8ddef7eb8 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -35,7 +35,6 @@ package shm import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" @@ -49,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/signal_handlers.go b/pkg/sentry/kernel/signal_handlers.go index a16f3d57f..768fda220 100644 --- a/pkg/sentry/kernel/signal_handlers.go +++ b/pkg/sentry/kernel/signal_handlers.go @@ -15,10 +15,9 @@ package kernel import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" ) // SignalHandlers holds information about signal actions. diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 9f7e19b4d..89e4d84b1 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -16,6 +16,7 @@ go_library( "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 4b08d7d72..28be4a939 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -16,8 +16,6 @@ package signalfd import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/context" @@ -26,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index 2fdee0282..d2d01add4 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -16,13 +16,13 @@ package kernel import ( "fmt" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // maxSyscallNum is the highest supported syscall number. diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 8227ecf1d..4607cde2f 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -17,7 +17,8 @@ package kernel import ( "fmt" "math/rand" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // syslog represents a sentry-global kernel log. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index d25a7903b..978d66da8 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -17,7 +17,6 @@ package kernel import ( gocontext "context" "runtime/trace" - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -37,7 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) @@ -85,7 +84,7 @@ type Task struct { // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. - goschedSeq syncutil.SeqCount `state:"nosave"` + goschedSeq sync.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index c0197a563..768e958d2 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -15,7 +15,6 @@ package kernel import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,6 +24,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/threads.go b/pkg/sentry/kernel/threads.go index 8267929a6..bf2dabb6e 100644 --- a/pkg/sentry/kernel/threads.go +++ b/pkg/sentry/kernel/threads.go @@ -16,9 +16,9 @@ package kernel import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 31847e1df..4e4de0512 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -13,6 +13,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/time/time.go b/pkg/sentry/kernel/time/time.go index 107394183..706de83ef 100644 --- a/pkg/sentry/kernel/time/time.go +++ b/pkg/sentry/kernel/time/time.go @@ -19,10 +19,10 @@ package time import ( "fmt" "math" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 76417342a..dc99301de 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -16,7 +16,6 @@ package kernel import ( "fmt" - "sync" "time" "gvisor.dev/gvisor/pkg/log" @@ -24,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sync" ) // Timekeeper manages all of the kernel clocks. diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go index 048de26dc..464d2306a 100644 --- a/pkg/sentry/kernel/tty.go +++ b/pkg/sentry/kernel/tty.go @@ -14,7 +14,7 @@ package kernel -import "sync" +import "gvisor.dev/gvisor/pkg/sync" // TTY defines the relationship between a thread group and its controlling // terminal. diff --git a/pkg/sentry/kernel/uts_namespace.go b/pkg/sentry/kernel/uts_namespace.go index 0a563e715..8ccf04bd1 100644 --- a/pkg/sentry/kernel/uts_namespace.go +++ b/pkg/sentry/kernel/uts_namespace.go @@ -15,9 +15,8 @@ package kernel import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" ) // UTSNamespace represents a UTS namespace, a holder of two system identifiers: diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 156e67bf8..9fa841e8b 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -15,6 +15,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/sentry/context", + "//pkg/sync", ], ) diff --git a/pkg/sentry/limits/limits.go b/pkg/sentry/limits/limits.go index b6c22656b..31b9e9ff6 100644 --- a/pkg/sentry/limits/limits.go +++ b/pkg/sentry/limits/limits.go @@ -16,8 +16,9 @@ package limits import ( - "sync" "syscall" + + "gvisor.dev/gvisor/pkg/sync" ) // LimitType defines a type of resource limit. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 839931f67..83e248431 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -118,7 +118,7 @@ go_library( "//pkg/sentry/safemem", "//pkg/sentry/usage", "//pkg/sentry/usermem", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", ], diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 1b746d030..4b48866ad 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -15,8 +15,6 @@ package mm import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/context" @@ -25,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 58a5c186d..fa86ebced 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -35,8 +35,6 @@ package mm import ( - "sync" - "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -44,7 +42,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usermem" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // MemoryManager implements a virtual address space. @@ -82,7 +80,7 @@ type MemoryManager struct { users int32 // mappingMu is analogous to Linux's struct mm_struct::mmap_sem. - mappingMu syncutil.DowngradableRWMutex `state:"nosave"` + mappingMu sync.DowngradableRWMutex `state:"nosave"` // vmas stores virtual memory areas. Since vmas are stored by value, // clients should usually use vmaIterator.ValuePtr() instead of @@ -125,7 +123,7 @@ type MemoryManager struct { // activeMu is loosely analogous to Linux's struct // mm_struct::page_table_lock. - activeMu syncutil.DowngradableRWMutex `state:"nosave"` + activeMu sync.DowngradableRWMutex `state:"nosave"` // pmas stores platform mapping areas used to implement vmas. Since pmas // are stored by value, clients should usually use pmaIterator.ValuePtr() diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index f404107af..a9a2642c5 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -73,6 +73,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/state", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index f7f7298c4..c99e023d9 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -25,7 +25,6 @@ import ( "fmt" "math" "os" - "sync" "sync/atomic" "syscall" "time" @@ -37,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index b6d008dbe..85e882df9 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -10,6 +10,7 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt", visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/sentry/platform/interrupt/interrupt.go b/pkg/sentry/platform/interrupt/interrupt.go index a4651f500..57be41647 100644 --- a/pkg/sentry/platform/interrupt/interrupt.go +++ b/pkg/sentry/platform/interrupt/interrupt.go @@ -17,7 +17,8 @@ package interrupt import ( "fmt" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // Receiver receives interrupt notifications from a Forwarder. diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index f3afd98da..6a358d1d4 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -55,6 +55,7 @@ go_library( "//pkg/sentry/platform/safecopy", "//pkg/sentry/time", "//pkg/sentry/usermem", + "//pkg/sync", ], ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index ea8b9632e..a25f3c449 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -15,13 +15,13 @@ package kvm import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // dirtySet tracks vCPUs for invalidation. diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go index e5fac0d6a..2f02c03cf 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go @@ -17,8 +17,6 @@ package kvm import ( - "unsafe" - "gvisor.dev/gvisor/pkg/sentry/arch" ) diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index f2c2c059e..a7850faed 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -18,13 +18,13 @@ package kvm import ( "fmt" "os" - "sync" "syscall" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // KVM represents a lightweight VM context. diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index 7d02ebf19..e6d912168 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -17,7 +17,6 @@ package kvm import ( "fmt" "runtime" - "sync" "sync/atomic" "syscall" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // machine contains state associated with the VM as a whole. diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 0df8cfa0f..cd13390c3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/safecopy", "//pkg/sentry/usermem", + "//pkg/sync", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index 7b120a15d..bb0e03880 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -46,13 +46,13 @@ package ptrace import ( "os" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) var ( diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 20244fd95..15dc46a5b 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -18,7 +18,6 @@ import ( "fmt" "os" "runtime" - "sync" "syscall" "golang.org/x/sys/unix" @@ -27,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" ) // Linux kernel errnos which "should never be seen by user programs", but will diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go index 2e6fbe488..245b20722 100644 --- a/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go +++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go @@ -18,7 +18,6 @@ package ptrace import ( - "sync" "sync/atomic" "syscall" "unsafe" @@ -26,6 +25,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" + "gvisor.dev/gvisor/pkg/sync" ) // maskPool contains reusable CPU masks for setting affinity. Unfortunately, diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go index 3f094c2a7..86fd5ed58 100644 --- a/pkg/sentry/platform/ring0/defs.go +++ b/pkg/sentry/platform/ring0/defs.go @@ -17,7 +17,7 @@ package ring0 import ( "syscall" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" ) // Kernel is a global kernel object. diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 10dbd381f..9dae0dccb 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -18,6 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index dc0eeec01..a850ce6cf 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -18,6 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" + "gvisor.dev/gvisor/pkg/sentry/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index e2e15ba5c..387a7f6c3 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -96,7 +96,10 @@ go_library( "//pkg/sentry/platform/kvm:__subpackages__", "//pkg/sentry/platform/ring0:__subpackages__", ], - deps = ["//pkg/sentry/usermem"], + deps = [ + "//pkg/sentry/usermem", + "//pkg/sync", + ], ) go_test( diff --git a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go index 0f029f25d..e199bae18 100644 --- a/pkg/sentry/platform/ring0/pagetables/pcids_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pcids_x86.go @@ -17,7 +17,7 @@ package pagetables import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // limitPCID is the number of valid PCIDs. diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 136821963..103933144 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 463544c1a..2d9f4ba9b 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -8,6 +8,7 @@ go_library( srcs = ["port.go"], importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/sentry/socket/netlink/port/port.go b/pkg/sentry/socket/netlink/port/port.go index e9d3275b1..2cd3afc22 100644 --- a/pkg/sentry/socket/netlink/port/port.go +++ b/pkg/sentry/socket/netlink/port/port.go @@ -24,7 +24,8 @@ import ( "fmt" "math" "math/rand" - "sync" + + "gvisor.dev/gvisor/pkg/sync" ) // maxPorts is a sanity limit on the maximum number of ports to allocate per diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index d2e3644a6..cea56f4ed 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -17,7 +17,6 @@ package netlink import ( "math" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index e414d8055..f78784569 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 764f11a6b..0affb8071 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -29,7 +29,6 @@ import ( "io" "math" "reflect" - "sync" "syscall" "time" @@ -49,6 +48,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/rpcinet/conn/BUILD b/pkg/sentry/socket/rpcinet/conn/BUILD index 23eadcb1b..b2677c659 100644 --- a/pkg/sentry/socket/rpcinet/conn/BUILD +++ b/pkg/sentry/socket/rpcinet/conn/BUILD @@ -10,6 +10,7 @@ go_library( deps = [ "//pkg/binary", "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", + "//pkg/sync", "//pkg/syserr", "//pkg/unet", "@com_github_golang_protobuf//proto:go_default_library", diff --git a/pkg/sentry/socket/rpcinet/conn/conn.go b/pkg/sentry/socket/rpcinet/conn/conn.go index 356adad99..02f39c767 100644 --- a/pkg/sentry/socket/rpcinet/conn/conn.go +++ b/pkg/sentry/socket/rpcinet/conn/conn.go @@ -17,12 +17,12 @@ package conn import ( "fmt" - "sync" "sync/atomic" "syscall" "github.com/golang/protobuf/proto" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/unet" diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD index a3585e10d..a5954f22b 100644 --- a/pkg/sentry/socket/rpcinet/notifier/BUILD +++ b/pkg/sentry/socket/rpcinet/notifier/BUILD @@ -10,6 +10,7 @@ go_library( deps = [ "//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto", "//pkg/sentry/socket/rpcinet/conn", + "//pkg/sync", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go index 7efe4301f..82b75d6dd 100644 --- a/pkg/sentry/socket/rpcinet/notifier/notifier.go +++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go @@ -17,12 +17,12 @@ package notifier import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn" pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 788ad70d2..d7ba95dff 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -32,6 +32,7 @@ go_library( "//pkg/ilist", "//pkg/refs", "//pkg/sentry/context", + "//pkg/sync", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index dea11e253..9e6fbc111 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -15,10 +15,9 @@ package transport import ( - "sync" - "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index e27b1c714..5dcd3d95e 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,9 +15,8 @@ package transport import ( - "sync" - "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 37c7ac3c1..fcc0da332 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -16,11 +16,11 @@ package transport import ( - "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index a76975cee..aa05e208a 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -91,6 +91,7 @@ go_library( "//pkg/sentry/syscalls", "//pkg/sentry/usage", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go index 1d9018c96..60469549d 100644 --- a/pkg/sentry/syscalls/linux/error.go +++ b/pkg/sentry/syscalls/linux/error.go @@ -16,13 +16,13 @@ package linux import ( "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 18e212dff..3cde3a0be 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -9,7 +9,7 @@ go_template_instance( out = "seqatomic_parameters_unsafe.go", package = "time", suffix = "Parameters", - template = "//pkg/syncutil:generic_seqatomic", + template = "//pkg/sync:generic_seqatomic", types = { "Value": "Parameters", }, @@ -36,7 +36,7 @@ go_library( deps = [ "//pkg/log", "//pkg/metric", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/time/calibrated_clock.go b/pkg/sentry/time/calibrated_clock.go index 318503277..f9a93115d 100644 --- a/pkg/sentry/time/calibrated_clock.go +++ b/pkg/sentry/time/calibrated_clock.go @@ -17,11 +17,11 @@ package time import ( - "sync" "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index c32fe3241..5518ac3d0 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -18,5 +18,6 @@ go_library( deps = [ "//pkg/bits", "//pkg/memutil", + "//pkg/sync", ], ) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index d6ef644d8..538c645eb 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -17,12 +17,12 @@ package usage import ( "fmt" "os" - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sync" ) // MemoryKind represents a type of memory used by the application. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 4c6aa04a1..35c7be259 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -34,7 +34,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/usermem", - "//pkg/syncutil", + "//pkg/sync", "//pkg/syserror", "//pkg/waiter", ], @@ -54,6 +54,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/kernel/auth", "//pkg/sentry/usermem", + "//pkg/sync", "//pkg/syserror", ], ) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 1bc9c4a38..486a76475 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -16,9 +16,9 @@ package vfs import ( "fmt" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 66eb57bc2..c00b3c84b 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -17,13 +17,13 @@ package vfs import ( "bytes" "io" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index adff0b94b..3b933468d 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -17,8 +17,9 @@ package vfs import ( "fmt" "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) func TestMountTableLookupEmpty(t *testing.T) { diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index ab13fa461..bd90d36c4 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,7 +26,7 @@ import ( "sync/atomic" "unsafe" - "gvisor.dev/gvisor/pkg/syncutil" + "gvisor.dev/gvisor/pkg/sync" ) // mountKey represents the location at which a Mount is mounted. It is @@ -75,7 +75,7 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) - seq syncutil.SeqCount + seq sync.SeqCount seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index 8e155654f..cf80df90e 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -15,10 +15,9 @@ package vfs import ( - "sync" - "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index f0641d314..8a0b382f6 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -16,11 +16,11 @@ package vfs import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index ea2db7031..1f21b0b31 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -29,12 +29,12 @@ package vfs import ( "fmt" - "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 4d8435265..28f21f13d 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -13,5 +13,6 @@ go_library( "//pkg/metric", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", + "//pkg/sync", ], ) diff --git a/pkg/sentry/watchdog/watchdog.go b/pkg/sentry/watchdog/watchdog.go index 5e4611333..bfb2fac26 100644 --- a/pkg/sentry/watchdog/watchdog.go +++ b/pkg/sentry/watchdog/watchdog.go @@ -32,7 +32,6 @@ package watchdog import ( "bytes" "fmt" - "sync" "time" "gvisor.dev/gvisor/pkg/abi/linux" @@ -40,6 +39,7 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sync" ) // Opts configures the watchdog. diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD new file mode 100644 index 000000000..e8cd16b8f --- /dev/null +++ b/pkg/sync/BUILD @@ -0,0 +1,53 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +exports_files(["LICENSE"]) + +go_template( + name = "generic_atomicptr", + srcs = ["atomicptr_unsafe.go"], + types = [ + "Value", + ], +) + +go_template( + name = "generic_seqatomic", + srcs = ["seqatomic_unsafe.go"], + types = [ + "Value", + ], + deps = [ + ":sync", + ], +) + +go_library( + name = "sync", + srcs = [ + "aliases.go", + "downgradable_rwmutex_unsafe.go", + "memmove_unsafe.go", + "norace_unsafe.go", + "race_unsafe.go", + "seqcount.go", + "syncutil.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sync", +) + +go_test( + name = "sync_test", + size = "small", + srcs = [ + "downgradable_rwmutex_test.go", + "seqcount_test.go", + ], + embed = [":sync"], +) diff --git a/pkg/sync/LICENSE b/pkg/sync/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/sync/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sync/README.md b/pkg/sync/README.md new file mode 100644 index 000000000..2183c4e20 --- /dev/null +++ b/pkg/sync/README.md @@ -0,0 +1,5 @@ +# Syncutil + +This package provides additional synchronization primitives not provided by the +Go stdlib 'sync' package. It is partially derived from the upstream 'sync' +package from go1.10. diff --git a/pkg/sync/aliases.go b/pkg/sync/aliases.go new file mode 100644 index 000000000..20c7ca041 --- /dev/null +++ b/pkg/sync/aliases.go @@ -0,0 +1,37 @@ +// Copyright 2020 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync + +import ( + "sync" +) + +// Aliases of standard library types. +type ( + // Mutex is an alias of sync.Mutex. + Mutex = sync.Mutex + + // RWMutex is an alias of sync.RWMutex. + RWMutex = sync.RWMutex + + // Cond is an alias of sync.Cond. + Cond = sync.Cond + + // Locker is an alias of sync.Locker. + Locker = sync.Locker + + // Once is an alias of sync.Once. + Once = sync.Once + + // Pool is an alias of sync.Pool. + Pool = sync.Pool + + // WaitGroup is an alias of sync.WaitGroup. + WaitGroup = sync.WaitGroup + + // Map is an alias of sync.Map. + Map = sync.Map +) diff --git a/pkg/sync/atomicptr_unsafe.go b/pkg/sync/atomicptr_unsafe.go new file mode 100644 index 000000000..525c4beed --- /dev/null +++ b/pkg/sync/atomicptr_unsafe.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "sync/atomic" + "unsafe" +) + +// Value is a required type parameter. +type Value struct{} + +// An AtomicPtr is a pointer to a value of type Value that can be atomically +// loaded and stored. The zero value of an AtomicPtr represents nil. +// +// Note that copying AtomicPtr by value performs a non-atomic read of the +// stored pointer, which is unsafe if Store() can be called concurrently; in +// this case, do `dst.Store(src.Load())` instead. +// +// +stateify savable +type AtomicPtr struct { + ptr unsafe.Pointer `state:".(*Value)"` +} + +func (p *AtomicPtr) savePtr() *Value { + return p.Load() +} + +func (p *AtomicPtr) loadPtr(v *Value) { + p.Store(v) +} + +// Load returns the value set by the most recent Store. It returns nil if there +// has been no previous call to Store. +func (p *AtomicPtr) Load() *Value { + return (*Value)(atomic.LoadPointer(&p.ptr)) +} + +// Store sets the value returned by Load to x. +func (p *AtomicPtr) Store(x *Value) { + atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) +} diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD new file mode 100644 index 000000000..418eda29c --- /dev/null +++ b/pkg/sync/atomicptrtest/BUILD @@ -0,0 +1,29 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "atomicptr_int", + out = "atomicptr_int_unsafe.go", + package = "atomicptr", + suffix = "Int", + template = "//pkg/sync:generic_atomicptr", + types = { + "Value": "int", + }, +) + +go_library( + name = "atomicptr", + srcs = ["atomicptr_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr", +) + +go_test( + name = "atomicptr_test", + size = "small", + srcs = ["atomicptr_test.go"], + embed = [":atomicptr"], +) diff --git a/pkg/sync/atomicptrtest/atomicptr_test.go b/pkg/sync/atomicptrtest/atomicptr_test.go new file mode 100644 index 000000000..8fdc5112e --- /dev/null +++ b/pkg/sync/atomicptrtest/atomicptr_test.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package atomicptr + +import ( + "testing" +) + +func newInt(val int) *int { + return &val +} + +func TestAtomicPtr(t *testing.T) { + var p AtomicPtrInt + if got := p.Load(); got != nil { + t.Errorf("initial value is %p (%v), wanted nil", got, got) + } + want := newInt(42) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } + want = newInt(100) + p.Store(want) + if got := p.Load(); got != want { + t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) + } +} diff --git a/pkg/sync/downgradable_rwmutex_test.go b/pkg/sync/downgradable_rwmutex_test.go new file mode 100644 index 000000000..f04496bc5 --- /dev/null +++ b/pkg/sync/downgradable_rwmutex_test.go @@ -0,0 +1,150 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// GOMAXPROCS=10 go test + +// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the +// addition of downgradingWriter and the renaming of num_iterations to +// numIterations to shut up Golint. + +package sync + +import ( + "fmt" + "runtime" + "sync/atomic" + "testing" +) + +func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { + m.RLock() + clocked <- true + <-cunlock + m.RUnlock() + cdone <- true +} + +func doTestParallelReaders(numReaders, gomaxprocs int) { + runtime.GOMAXPROCS(gomaxprocs) + var m DowngradableRWMutex + clocked := make(chan bool) + cunlock := make(chan bool) + cdone := make(chan bool) + for i := 0; i < numReaders; i++ { + go parallelReader(&m, clocked, cunlock, cdone) + } + // Wait for all parallel RLock()s to succeed. + for i := 0; i < numReaders; i++ { + <-clocked + } + for i := 0; i < numReaders; i++ { + cunlock <- true + } + // Wait for the goroutines to finish. + for i := 0; i < numReaders; i++ { + <-cdone + } +} + +func TestParallelReaders(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + doTestParallelReaders(1, 4) + doTestParallelReaders(3, 4) + doTestParallelReaders(4, 2) +} + +func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.RLock() + n := atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.Unlock() + } + cdone <- true +} + +func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { + for i := 0; i < numIterations; i++ { + rwm.Lock() + n := atomic.AddInt32(activity, 10000) + if n != 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + atomic.AddInt32(activity, -10000) + rwm.DowngradeLock() + n = atomic.AddInt32(activity, 1) + if n < 1 || n >= 10000 { + panic(fmt.Sprintf("wlock(%d)\n", n)) + } + for i := 0; i < 100; i++ { + } + n = atomic.AddInt32(activity, -1) + rwm.RUnlock() + } + cdone <- true +} + +func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { + runtime.GOMAXPROCS(gomaxprocs) + // Number of active readers + 10000 * number of active writers. + var activity int32 + var rwm DowngradableRWMutex + cdone := make(chan bool) + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + var i int + for i = 0; i < numReaders/2; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + go writer(&rwm, numIterations, &activity, cdone) + go downgradingWriter(&rwm, numIterations, &activity, cdone) + for ; i < numReaders; i++ { + go reader(&rwm, numIterations, &activity, cdone) + } + // Wait for the 4 writers and all readers to finish. + for i := 0; i < 4+numReaders; i++ { + <-cdone + } +} + +func TestDowngradableRWMutex(t *testing.T) { + defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) + n := 1000 + if testing.Short() { + n = 5 + } + HammerDowngradableRWMutex(1, 1, n) + HammerDowngradableRWMutex(1, 3, n) + HammerDowngradableRWMutex(1, 10, n) + HammerDowngradableRWMutex(4, 1, n) + HammerDowngradableRWMutex(4, 3, n) + HammerDowngradableRWMutex(4, 10, n) + HammerDowngradableRWMutex(10, 1, n) + HammerDowngradableRWMutex(10, 3, n) + HammerDowngradableRWMutex(10, 10, n) + HammerDowngradableRWMutex(10, 5, n) +} diff --git a/pkg/sync/downgradable_rwmutex_unsafe.go b/pkg/sync/downgradable_rwmutex_unsafe.go new file mode 100644 index 000000000..9bb55cd3a --- /dev/null +++ b/pkg/sync/downgradable_rwmutex_unsafe.go @@ -0,0 +1,146 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Copyright 2019 The gVisor Authors. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.13 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +// This is mostly copied from the standard library's sync/rwmutex.go. +// +// Happens-before relationships indicated to the race detector: +// - Unlock -> Lock (via writerSem) +// - Unlock -> RLock (via readerSem) +// - RUnlock -> Lock (via writerSem) +// - DowngradeLock -> RLock (via readerSem) + +package sync + +import ( + "sync" + "sync/atomic" + "unsafe" +) + +//go:linkname runtimeSemacquire sync.runtime_Semacquire +func runtimeSemacquire(s *uint32) + +//go:linkname runtimeSemrelease sync.runtime_Semrelease +func runtimeSemrelease(s *uint32, handoff bool, skipframes int) + +// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock +// method. +type DowngradableRWMutex struct { + w sync.Mutex // held if there are pending writers + writerSem uint32 // semaphore for writers to wait for completing readers + readerSem uint32 // semaphore for readers to wait for completing writers + readerCount int32 // number of pending readers + readerWait int32 // number of departing readers +} + +const rwmutexMaxReaders = 1 << 30 + +// RLock locks rw for reading. +func (rw *DowngradableRWMutex) RLock() { + if RaceEnabled { + RaceDisable() + } + if atomic.AddInt32(&rw.readerCount, 1) < 0 { + // A writer is pending, wait for it. + runtimeSemacquire(&rw.readerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.readerSem)) + } +} + +// RUnlock undoes a single RLock call. +func (rw *DowngradableRWMutex) RUnlock() { + if RaceEnabled { + RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) + RaceDisable() + } + if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { + if r+1 == 0 || r+1 == -rwmutexMaxReaders { + panic("RUnlock of unlocked DowngradableRWMutex") + } + // A writer is pending. + if atomic.AddInt32(&rw.readerWait, -1) == 0 { + // The last reader unblocks the writer. + runtimeSemrelease(&rw.writerSem, false, 0) + } + } + if RaceEnabled { + RaceEnable() + } +} + +// Lock locks rw for writing. +func (rw *DowngradableRWMutex) Lock() { + if RaceEnabled { + RaceDisable() + } + // First, resolve competition with other writers. + rw.w.Lock() + // Announce to readers there is a pending writer. + r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders + // Wait for active readers. + if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { + runtimeSemacquire(&rw.writerSem) + } + if RaceEnabled { + RaceEnable() + RaceAcquire(unsafe.Pointer(&rw.writerSem)) + } +} + +// Unlock unlocks rw for writing. +func (rw *DowngradableRWMutex) Unlock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.writerSem)) + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) + if r >= rwmutexMaxReaders { + panic("Unlock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. + for i := 0; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed. + rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} + +// DowngradeLock atomically unlocks rw for writing and locks it for reading. +func (rw *DowngradableRWMutex) DowngradeLock() { + if RaceEnabled { + RaceRelease(unsafe.Pointer(&rw.readerSem)) + RaceDisable() + } + // Announce to readers there is no active writer and one additional reader. + r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) + if r >= rwmutexMaxReaders+1 { + panic("DowngradeLock of unlocked DowngradableRWMutex") + } + // Unblock blocked readers, if any. Note that this loop starts as 1 since r + // includes this goroutine. + for i := 1; i < int(r); i++ { + runtimeSemrelease(&rw.readerSem, false, 0) + } + // Allow other writers to proceed to rw.w.Lock(). Note that they will still + // block on rw.writerSem since at least this reader exists, such that + // DowngradeLock() is atomic with the previous write lock. + rw.w.Unlock() + if RaceEnabled { + RaceEnable() + } +} diff --git a/pkg/sync/memmove_unsafe.go b/pkg/sync/memmove_unsafe.go new file mode 100644 index 000000000..ad4a3a37e --- /dev/null +++ b/pkg/sync/memmove_unsafe.go @@ -0,0 +1,28 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build go1.12 +// +build !go1.15 + +// Check go:linkname function signatures when updating Go version. + +package sync + +import ( + "unsafe" +) + +//go:linkname memmove runtime.memmove +//go:noescape +func memmove(to, from unsafe.Pointer, n uintptr) + +// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't +// define it because go_generics can't update the go:linkname annotation. +// Furthermore, go:linkname silently doesn't work if the local name is exported +// (this is of course undocumented), which is why this indirection is +// necessary. +func Memmove(to, from unsafe.Pointer, n uintptr) { + memmove(to, from, n) +} diff --git a/pkg/sync/norace_unsafe.go b/pkg/sync/norace_unsafe.go new file mode 100644 index 000000000..006055dd6 --- /dev/null +++ b/pkg/sync/norace_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !race + +package sync + +import ( + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = false + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { +} diff --git a/pkg/sync/race_unsafe.go b/pkg/sync/race_unsafe.go new file mode 100644 index 000000000..31d8fa9a6 --- /dev/null +++ b/pkg/sync/race_unsafe.go @@ -0,0 +1,41 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build race + +package sync + +import ( + "runtime" + "unsafe" +) + +// RaceEnabled is true if the Go data race detector is enabled. +const RaceEnabled = true + +// RaceDisable has the same semantics as runtime.RaceDisable. +func RaceDisable() { + runtime.RaceDisable() +} + +// RaceEnable has the same semantics as runtime.RaceEnable. +func RaceEnable() { + runtime.RaceEnable() +} + +// RaceAcquire has the same semantics as runtime.RaceAcquire. +func RaceAcquire(addr unsafe.Pointer) { + runtime.RaceAcquire(addr) +} + +// RaceRelease has the same semantics as runtime.RaceRelease. +func RaceRelease(addr unsafe.Pointer) { + runtime.RaceRelease(addr) +} + +// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. +func RaceReleaseMerge(addr unsafe.Pointer) { + runtime.RaceReleaseMerge(addr) +} diff --git a/pkg/sync/seqatomic_unsafe.go b/pkg/sync/seqatomic_unsafe.go new file mode 100644 index 000000000..eda6fb131 --- /dev/null +++ b/pkg/sync/seqatomic_unsafe.go @@ -0,0 +1,72 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package template doesn't exist. This file must be instantiated using the +// go_template_instance rule in tools/go_generics/defs.bzl. +package template + +import ( + "fmt" + "reflect" + "strings" + "unsafe" + + "gvisor.dev/gvisor/pkg/sync" +) + +// Value is a required type parameter. +// +// Value must not contain any pointers, including interface objects, function +// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs +// containing any of the above. An init() function will panic if this property +// does not hold. +type Value struct{} + +// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race +// with any writer critical sections in sc. +func SeqAtomicLoad(sc *sync.SeqCount, ptr *Value) Value { + // This function doesn't use SeqAtomicTryLoad because doing so is + // measurably, significantly (~20%) slower; Go is awful at inlining. + var val Value + for { + epoch := sc.BeginRead() + if sync.RaceEnabled { + // runtime.RaceDisable() doesn't actually stop the race detector, + // so it can't help us here. Instead, call runtime.memmove + // directly, which is not instrumented by the race detector. + sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + // This is ~40% faster for short reads than going through memmove. + val = *ptr + } + if sc.ReadOk(epoch) { + break + } + } + return val +} + +// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section +// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read +// would race with a writer critical section, SeqAtomicTryLoad returns +// (unspecified, false). +func SeqAtomicTryLoad(sc *sync.SeqCount, epoch sync.SeqCountEpoch, ptr *Value) (Value, bool) { + var val Value + if sync.RaceEnabled { + sync.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) + } else { + val = *ptr + } + return val, sc.ReadOk(epoch) +} + +func init() { + var val Value + typ := reflect.TypeOf(val) + name := typ.Name() + if ptrs := sync.PointersInType(typ, name); len(ptrs) != 0 { + panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) + } +} diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD new file mode 100644 index 000000000..eba21518d --- /dev/null +++ b/pkg/sync/seqatomictest/BUILD @@ -0,0 +1,33 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "seqatomic_int", + out = "seqatomic_int_unsafe.go", + package = "seqatomic", + suffix = "Int", + template = "//pkg/sync:generic_seqatomic", + types = { + "Value": "int", + }, +) + +go_library( + name = "seqatomic", + srcs = ["seqatomic_int_unsafe.go"], + importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic", + deps = [ + "//pkg/sync", + ], +) + +go_test( + name = "seqatomic_test", + size = "small", + srcs = ["seqatomic_test.go"], + embed = [":seqatomic"], + deps = ["//pkg/sync"], +) diff --git a/pkg/sync/seqatomictest/seqatomic_test.go b/pkg/sync/seqatomictest/seqatomic_test.go new file mode 100644 index 000000000..2c4568b07 --- /dev/null +++ b/pkg/sync/seqatomictest/seqatomic_test.go @@ -0,0 +1,132 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqatomic + +import ( + "sync/atomic" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/sync" +) + +func TestSeqAtomicLoadUncontended(t *testing.T) { + var seq sync.SeqCount + const want = 1 + data := want + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadAfterWrite(t *testing.T) { + var seq sync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicLoadDuringWrite(t *testing.T) { + var seq sync.SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + if got := SeqAtomicLoadInt(&seq, &data); got != want { + t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } +} + +func TestSeqAtomicTryLoadUncontended(t *testing.T) { + var seq sync.SeqCount + const want = 1 + data := want + epoch := seq.BeginRead() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } +} + +func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { + var seq sync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } + seq.EndWrite() +} + +func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { + var seq sync.SeqCount + var data int + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { + t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) + } +} + +func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { + var seq sync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := SeqAtomicLoadInt(&seq, &data); got != want { + b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) + } + } + }) +} + +func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { + var seq sync.SeqCount + const want = 42 + data := want + b.RunParallel(func(pb *testing.PB) { + epoch := seq.BeginRead() + for pb.Next() { + if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { + b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) + } + } + }) +} + +// For comparison: +func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { + var a atomic.Value + const want = 42 + a.Store(int(want)) + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + if got := a.Load().(int); got != want { + b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) + } + } + }) +} diff --git a/pkg/sync/seqcount.go b/pkg/sync/seqcount.go new file mode 100644 index 000000000..a1e895352 --- /dev/null +++ b/pkg/sync/seqcount.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync + +import ( + "fmt" + "reflect" + "runtime" + "sync/atomic" +) + +// SeqCount is a synchronization primitive for optimistic reader/writer +// synchronization in cases where readers can work with stale data and +// therefore do not need to block writers. +// +// Compared to sync/atomic.Value: +// +// - Mutation of SeqCount-protected data does not require memory allocation, +// whereas atomic.Value generally does. This is a significant advantage when +// writes are common. +// +// - Atomic reads of SeqCount-protected data require copying. This is a +// disadvantage when atomic reads are common. +// +// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other +// operations to be made atomic with reads of SeqCount-protected data. +// +// - SeqCount may be less flexible: as of this writing, SeqCount-protected data +// cannot include pointers. +// +// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected +// data require instantiating function templates using go_generics (see +// seqatomic.go). +type SeqCount struct { + // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd + // if a writer critical section is active, and a read from data protected + // by this SeqCount is atomic iff epoch is the same even value before and + // after the read. + epoch uint32 +} + +// SeqCountEpoch tracks writer critical sections in a SeqCount. +type SeqCountEpoch struct { + val uint32 +} + +// We assume that: +// +// - All functions in sync/atomic that perform a memory read are at least a +// read fence: memory reads before calls to such functions cannot be reordered +// after the call, and memory reads after calls to such functions cannot be +// reordered before the call, even if those reads do not use sync/atomic. +// +// - All functions in sync/atomic that perform a memory write are at least a +// write fence: memory writes before calls to such functions cannot be +// reordered after the call, and memory writes after calls to such functions +// cannot be reordered before the call, even if those writes do not use +// sync/atomic. +// +// As of this writing, the Go memory model completely fails to describe +// sync/atomic, but these properties are implied by +// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. + +// BeginRead indicates the beginning of a reader critical section. Reader +// critical sections DO NOT BLOCK writer critical sections, so operations in a +// reader critical section MAY RACE with writer critical sections. Races are +// detected by ReadOk at the end of the reader critical section. Thus, the +// low-level structure of readers is generally: +// +// for { +// epoch := seq.BeginRead() +// // do something idempotent with seq-protected data +// if seq.ReadOk(epoch) { +// break +// } +// } +// +// However, since reader critical sections may race with writer critical +// sections, the Go race detector will (accurately) flag data races in readers +// using this pattern. Most users of SeqCount will need to use the +// SeqAtomicLoad function template in seqatomic.go. +func (s *SeqCount) BeginRead() SeqCountEpoch { + epoch := atomic.LoadUint32(&s.epoch) + for epoch&1 != 0 { + runtime.Gosched() + epoch = atomic.LoadUint32(&s.epoch) + } + return SeqCountEpoch{epoch} +} + +// ReadOk returns true if the reader critical section initiated by a previous +// call to BeginRead() that returned epoch did not race with any writer critical +// sections. +// +// ReadOk may be called any number of times during a reader critical section. +// Reader critical sections do not need to be explicitly terminated; the last +// call to ReadOk is implicitly the end of the reader critical section. +func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { + return atomic.LoadUint32(&s.epoch) == epoch.val +} + +// BeginWrite indicates the beginning of a writer critical section. +// +// SeqCount does not support concurrent writer critical sections; clients with +// concurrent writers must synchronize them using e.g. sync.Mutex. +func (s *SeqCount) BeginWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { + panic("SeqCount.BeginWrite during writer critical section") + } +} + +// EndWrite ends the effect of a preceding BeginWrite. +func (s *SeqCount) EndWrite() { + if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { + panic("SeqCount.EndWrite outside writer critical section") + } +} + +// PointersInType returns a list of pointers reachable from values named +// valName of the given type. +// +// PointersInType is not exhaustive, but it is guaranteed that if typ contains +// at least one pointer, then PointersInTypeOf returns a non-empty list. +func PointersInType(typ reflect.Type, valName string) []string { + switch kind := typ.Kind(); kind { + case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: + return nil + + case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: + return []string{valName} + + case reflect.Array: + return PointersInType(typ.Elem(), valName+"[]") + + case reflect.Struct: + var ptrs []string + for i, n := 0, typ.NumField(); i < n; i++ { + field := typ.Field(i) + ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) + } + return ptrs + + default: + return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} + } +} diff --git a/pkg/sync/seqcount_test.go b/pkg/sync/seqcount_test.go new file mode 100644 index 000000000..6eb7b4b59 --- /dev/null +++ b/pkg/sync/seqcount_test.go @@ -0,0 +1,153 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package sync + +import ( + "reflect" + "testing" + "time" +) + +func TestSeqCountWriteUncontended(t *testing.T) { + var seq SeqCount + seq.BeginWrite() + seq.EndWrite() +} + +func TestSeqCountReadUncontended(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadAfterWrite(t *testing.T) { + var seq SeqCount + var data int32 + const want = 1 + seq.BeginWrite() + data = want + seq.EndWrite() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountBeginReadDuringWrite(t *testing.T) { + var seq SeqCount + var data int + const want = 1 + seq.BeginWrite() + go func() { + time.Sleep(time.Second) + data = want + seq.EndWrite() + }() + epoch := seq.BeginRead() + if data != want { + t.Errorf("Reader: got %v, wanted %v", data, want) + } + if !seq.ReadOk(epoch) { + t.Errorf("ReadOk: got false, wanted true") + } +} + +func TestSeqCountReadOkAfterWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + seq.EndWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } +} + +func TestSeqCountReadOkDuringWrite(t *testing.T) { + var seq SeqCount + epoch := seq.BeginRead() + seq.BeginWrite() + if seq.ReadOk(epoch) { + t.Errorf("ReadOk: got true, wanted false") + } + seq.EndWrite() +} + +func BenchmarkSeqCountWriteUncontended(b *testing.B) { + var seq SeqCount + for i := 0; i < b.N; i++ { + seq.BeginWrite() + seq.EndWrite() + } +} + +func BenchmarkSeqCountReadUncontended(b *testing.B) { + var seq SeqCount + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + epoch := seq.BeginRead() + if !seq.ReadOk(epoch) { + b.Fatalf("ReadOk: got false, wanted true") + } + } + }) +} + +func TestPointersInType(t *testing.T) { + for _, test := range []struct { + name string // used for both test and value name + val interface{} + ptrs []string + }{ + { + name: "EmptyStruct", + val: struct{}{}, + }, + { + name: "Int", + val: int(0), + }, + { + name: "MixedStruct", + val: struct { + b bool + I int + ExportedPtr *struct{} + unexportedPtr *struct{} + arr [2]int + ptrArr [2]*int + nestedStruct struct { + nestedNonptr int + nestedPtr *int + } + structArr [1]struct { + nonptr int + ptr *int + } + }{}, + ptrs: []string{ + "MixedStruct.ExportedPtr", + "MixedStruct.unexportedPtr", + "MixedStruct.ptrArr[]", + "MixedStruct.nestedStruct.nestedPtr", + "MixedStruct.structArr[].ptr", + }, + }, + } { + t.Run(test.name, func(t *testing.T) { + typ := reflect.TypeOf(test.val) + ptrs := PointersInType(typ, test.name) + t.Logf("Found pointers: %v", ptrs) + if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { + t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) + } + }) + } +} diff --git a/pkg/sync/syncutil.go b/pkg/sync/syncutil.go new file mode 100644 index 000000000..b16cf5333 --- /dev/null +++ b/pkg/sync/syncutil.go @@ -0,0 +1,7 @@ +// Copyright 2019 The gVisor Authors. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package sync provides synchronization primitives. +package sync diff --git a/pkg/syncutil/BUILD b/pkg/syncutil/BUILD deleted file mode 100644 index cb1f41628..000000000 --- a/pkg/syncutil/BUILD +++ /dev/null @@ -1,52 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template") - -package( - default_visibility = ["//:sandbox"], - licenses = ["notice"], -) - -exports_files(["LICENSE"]) - -go_template( - name = "generic_atomicptr", - srcs = ["atomicptr_unsafe.go"], - types = [ - "Value", - ], -) - -go_template( - name = "generic_seqatomic", - srcs = ["seqatomic_unsafe.go"], - types = [ - "Value", - ], - deps = [ - ":sync", - ], -) - -go_library( - name = "syncutil", - srcs = [ - "downgradable_rwmutex_unsafe.go", - "memmove_unsafe.go", - "norace_unsafe.go", - "race_unsafe.go", - "seqcount.go", - "syncutil.go", - ], - importpath = "gvisor.dev/gvisor/pkg/syncutil", -) - -go_test( - name = "syncutil_test", - size = "small", - srcs = [ - "downgradable_rwmutex_test.go", - "seqcount_test.go", - ], - embed = [":syncutil"], -) diff --git a/pkg/syncutil/LICENSE b/pkg/syncutil/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/pkg/syncutil/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/syncutil/README.md b/pkg/syncutil/README.md deleted file mode 100644 index 2183c4e20..000000000 --- a/pkg/syncutil/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Syncutil - -This package provides additional synchronization primitives not provided by the -Go stdlib 'sync' package. It is partially derived from the upstream 'sync' -package from go1.10. diff --git a/pkg/syncutil/atomicptr_unsafe.go b/pkg/syncutil/atomicptr_unsafe.go deleted file mode 100644 index 525c4beed..000000000 --- a/pkg/syncutil/atomicptr_unsafe.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "sync/atomic" - "unsafe" -) - -// Value is a required type parameter. -type Value struct{} - -// An AtomicPtr is a pointer to a value of type Value that can be atomically -// loaded and stored. The zero value of an AtomicPtr represents nil. -// -// Note that copying AtomicPtr by value performs a non-atomic read of the -// stored pointer, which is unsafe if Store() can be called concurrently; in -// this case, do `dst.Store(src.Load())` instead. -// -// +stateify savable -type AtomicPtr struct { - ptr unsafe.Pointer `state:".(*Value)"` -} - -func (p *AtomicPtr) savePtr() *Value { - return p.Load() -} - -func (p *AtomicPtr) loadPtr(v *Value) { - p.Store(v) -} - -// Load returns the value set by the most recent Store. It returns nil if there -// has been no previous call to Store. -func (p *AtomicPtr) Load() *Value { - return (*Value)(atomic.LoadPointer(&p.ptr)) -} - -// Store sets the value returned by Load to x. -func (p *AtomicPtr) Store(x *Value) { - atomic.StorePointer(&p.ptr, (unsafe.Pointer)(x)) -} diff --git a/pkg/syncutil/atomicptrtest/BUILD b/pkg/syncutil/atomicptrtest/BUILD deleted file mode 100644 index 63f411a90..000000000 --- a/pkg/syncutil/atomicptrtest/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "atomicptr_int", - out = "atomicptr_int_unsafe.go", - package = "atomicptr", - suffix = "Int", - template = "//pkg/syncutil:generic_atomicptr", - types = { - "Value": "int", - }, -) - -go_library( - name = "atomicptr", - srcs = ["atomicptr_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/syncutil/atomicptr", -) - -go_test( - name = "atomicptr_test", - size = "small", - srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], -) diff --git a/pkg/syncutil/atomicptrtest/atomicptr_test.go b/pkg/syncutil/atomicptrtest/atomicptr_test.go deleted file mode 100644 index 8fdc5112e..000000000 --- a/pkg/syncutil/atomicptrtest/atomicptr_test.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package atomicptr - -import ( - "testing" -) - -func newInt(val int) *int { - return &val -} - -func TestAtomicPtr(t *testing.T) { - var p AtomicPtrInt - if got := p.Load(); got != nil { - t.Errorf("initial value is %p (%v), wanted nil", got, got) - } - want := newInt(42) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } - want = newInt(100) - p.Store(want) - if got := p.Load(); got != want { - t.Errorf("wrong value: got %p (%v), wanted %p (%v)", got, got, want, want) - } -} diff --git a/pkg/syncutil/downgradable_rwmutex_test.go b/pkg/syncutil/downgradable_rwmutex_test.go deleted file mode 100644 index ffaf7ecc7..000000000 --- a/pkg/syncutil/downgradable_rwmutex_test.go +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// GOMAXPROCS=10 go test - -// Copy/pasted from the standard library's sync/rwmutex_test.go, except for the -// addition of downgradingWriter and the renaming of num_iterations to -// numIterations to shut up Golint. - -package syncutil - -import ( - "fmt" - "runtime" - "sync/atomic" - "testing" -) - -func parallelReader(m *DowngradableRWMutex, clocked, cunlock, cdone chan bool) { - m.RLock() - clocked <- true - <-cunlock - m.RUnlock() - cdone <- true -} - -func doTestParallelReaders(numReaders, gomaxprocs int) { - runtime.GOMAXPROCS(gomaxprocs) - var m DowngradableRWMutex - clocked := make(chan bool) - cunlock := make(chan bool) - cdone := make(chan bool) - for i := 0; i < numReaders; i++ { - go parallelReader(&m, clocked, cunlock, cdone) - } - // Wait for all parallel RLock()s to succeed. - for i := 0; i < numReaders; i++ { - <-clocked - } - for i := 0; i < numReaders; i++ { - cunlock <- true - } - // Wait for the goroutines to finish. - for i := 0; i < numReaders; i++ { - <-cdone - } -} - -func TestParallelReaders(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - doTestParallelReaders(1, 4) - doTestParallelReaders(3, 4) - doTestParallelReaders(4, 2) -} - -func reader(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.RLock() - n := atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func writer(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.Unlock() - } - cdone <- true -} - -func downgradingWriter(rwm *DowngradableRWMutex, numIterations int, activity *int32, cdone chan bool) { - for i := 0; i < numIterations; i++ { - rwm.Lock() - n := atomic.AddInt32(activity, 10000) - if n != 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - atomic.AddInt32(activity, -10000) - rwm.DowngradeLock() - n = atomic.AddInt32(activity, 1) - if n < 1 || n >= 10000 { - panic(fmt.Sprintf("wlock(%d)\n", n)) - } - for i := 0; i < 100; i++ { - } - n = atomic.AddInt32(activity, -1) - rwm.RUnlock() - } - cdone <- true -} - -func HammerDowngradableRWMutex(gomaxprocs, numReaders, numIterations int) { - runtime.GOMAXPROCS(gomaxprocs) - // Number of active readers + 10000 * number of active writers. - var activity int32 - var rwm DowngradableRWMutex - cdone := make(chan bool) - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - var i int - for i = 0; i < numReaders/2; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - go writer(&rwm, numIterations, &activity, cdone) - go downgradingWriter(&rwm, numIterations, &activity, cdone) - for ; i < numReaders; i++ { - go reader(&rwm, numIterations, &activity, cdone) - } - // Wait for the 4 writers and all readers to finish. - for i := 0; i < 4+numReaders; i++ { - <-cdone - } -} - -func TestDowngradableRWMutex(t *testing.T) { - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(-1)) - n := 1000 - if testing.Short() { - n = 5 - } - HammerDowngradableRWMutex(1, 1, n) - HammerDowngradableRWMutex(1, 3, n) - HammerDowngradableRWMutex(1, 10, n) - HammerDowngradableRWMutex(4, 1, n) - HammerDowngradableRWMutex(4, 3, n) - HammerDowngradableRWMutex(4, 10, n) - HammerDowngradableRWMutex(10, 1, n) - HammerDowngradableRWMutex(10, 3, n) - HammerDowngradableRWMutex(10, 10, n) - HammerDowngradableRWMutex(10, 5, n) -} diff --git a/pkg/syncutil/downgradable_rwmutex_unsafe.go b/pkg/syncutil/downgradable_rwmutex_unsafe.go deleted file mode 100644 index 51e11555d..000000000 --- a/pkg/syncutil/downgradable_rwmutex_unsafe.go +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2009 The Go Authors. All rights reserved. -// Copyright 2019 The gVisor Authors. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.13 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -// This is mostly copied from the standard library's sync/rwmutex.go. -// -// Happens-before relationships indicated to the race detector: -// - Unlock -> Lock (via writerSem) -// - Unlock -> RLock (via readerSem) -// - RUnlock -> Lock (via writerSem) -// - DowngradeLock -> RLock (via readerSem) - -package syncutil - -import ( - "sync" - "sync/atomic" - "unsafe" -) - -//go:linkname runtimeSemacquire sync.runtime_Semacquire -func runtimeSemacquire(s *uint32) - -//go:linkname runtimeSemrelease sync.runtime_Semrelease -func runtimeSemrelease(s *uint32, handoff bool, skipframes int) - -// DowngradableRWMutex is identical to sync.RWMutex, but adds the DowngradeLock -// method. -type DowngradableRWMutex struct { - w sync.Mutex // held if there are pending writers - writerSem uint32 // semaphore for writers to wait for completing readers - readerSem uint32 // semaphore for readers to wait for completing writers - readerCount int32 // number of pending readers - readerWait int32 // number of departing readers -} - -const rwmutexMaxReaders = 1 << 30 - -// RLock locks rw for reading. -func (rw *DowngradableRWMutex) RLock() { - if RaceEnabled { - RaceDisable() - } - if atomic.AddInt32(&rw.readerCount, 1) < 0 { - // A writer is pending, wait for it. - runtimeSemacquire(&rw.readerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.readerSem)) - } -} - -// RUnlock undoes a single RLock call. -func (rw *DowngradableRWMutex) RUnlock() { - if RaceEnabled { - RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) - RaceDisable() - } - if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { - if r+1 == 0 || r+1 == -rwmutexMaxReaders { - panic("RUnlock of unlocked DowngradableRWMutex") - } - // A writer is pending. - if atomic.AddInt32(&rw.readerWait, -1) == 0 { - // The last reader unblocks the writer. - runtimeSemrelease(&rw.writerSem, false, 0) - } - } - if RaceEnabled { - RaceEnable() - } -} - -// Lock locks rw for writing. -func (rw *DowngradableRWMutex) Lock() { - if RaceEnabled { - RaceDisable() - } - // First, resolve competition with other writers. - rw.w.Lock() - // Announce to readers there is a pending writer. - r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders - // Wait for active readers. - if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { - runtimeSemacquire(&rw.writerSem) - } - if RaceEnabled { - RaceEnable() - RaceAcquire(unsafe.Pointer(&rw.writerSem)) - } -} - -// Unlock unlocks rw for writing. -func (rw *DowngradableRWMutex) Unlock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.writerSem)) - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) - if r >= rwmutexMaxReaders { - panic("Unlock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. - for i := 0; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed. - rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} - -// DowngradeLock atomically unlocks rw for writing and locks it for reading. -func (rw *DowngradableRWMutex) DowngradeLock() { - if RaceEnabled { - RaceRelease(unsafe.Pointer(&rw.readerSem)) - RaceDisable() - } - // Announce to readers there is no active writer and one additional reader. - r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) - if r >= rwmutexMaxReaders+1 { - panic("DowngradeLock of unlocked DowngradableRWMutex") - } - // Unblock blocked readers, if any. Note that this loop starts as 1 since r - // includes this goroutine. - for i := 1; i < int(r); i++ { - runtimeSemrelease(&rw.readerSem, false, 0) - } - // Allow other writers to proceed to rw.w.Lock(). Note that they will still - // block on rw.writerSem since at least this reader exists, such that - // DowngradeLock() is atomic with the previous write lock. - rw.w.Unlock() - if RaceEnabled { - RaceEnable() - } -} diff --git a/pkg/syncutil/memmove_unsafe.go b/pkg/syncutil/memmove_unsafe.go deleted file mode 100644 index 348675baa..000000000 --- a/pkg/syncutil/memmove_unsafe.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build go1.12 -// +build !go1.15 - -// Check go:linkname function signatures when updating Go version. - -package syncutil - -import ( - "unsafe" -) - -//go:linkname memmove runtime.memmove -//go:noescape -func memmove(to, from unsafe.Pointer, n uintptr) - -// Memmove is exported for SeqAtomicLoad/SeqAtomicTryLoad, which can't -// define it because go_generics can't update the go:linkname annotation. -// Furthermore, go:linkname silently doesn't work if the local name is exported -// (this is of course undocumented), which is why this indirection is -// necessary. -func Memmove(to, from unsafe.Pointer, n uintptr) { - memmove(to, from, n) -} diff --git a/pkg/syncutil/norace_unsafe.go b/pkg/syncutil/norace_unsafe.go deleted file mode 100644 index 0a0a9deda..000000000 --- a/pkg/syncutil/norace_unsafe.go +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build !race - -package syncutil - -import ( - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = false - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { -} diff --git a/pkg/syncutil/race_unsafe.go b/pkg/syncutil/race_unsafe.go deleted file mode 100644 index 206067ec1..000000000 --- a/pkg/syncutil/race_unsafe.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// +build race - -package syncutil - -import ( - "runtime" - "unsafe" -) - -// RaceEnabled is true if the Go data race detector is enabled. -const RaceEnabled = true - -// RaceDisable has the same semantics as runtime.RaceDisable. -func RaceDisable() { - runtime.RaceDisable() -} - -// RaceEnable has the same semantics as runtime.RaceEnable. -func RaceEnable() { - runtime.RaceEnable() -} - -// RaceAcquire has the same semantics as runtime.RaceAcquire. -func RaceAcquire(addr unsafe.Pointer) { - runtime.RaceAcquire(addr) -} - -// RaceRelease has the same semantics as runtime.RaceRelease. -func RaceRelease(addr unsafe.Pointer) { - runtime.RaceRelease(addr) -} - -// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge. -func RaceReleaseMerge(addr unsafe.Pointer) { - runtime.RaceReleaseMerge(addr) -} diff --git a/pkg/syncutil/seqatomic_unsafe.go b/pkg/syncutil/seqatomic_unsafe.go deleted file mode 100644 index cb6d2eb22..000000000 --- a/pkg/syncutil/seqatomic_unsafe.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package template doesn't exist. This file must be instantiated using the -// go_template_instance rule in tools/go_generics/defs.bzl. -package template - -import ( - "fmt" - "reflect" - "strings" - "unsafe" - - "gvisor.dev/gvisor/pkg/syncutil" -) - -// Value is a required type parameter. -// -// Value must not contain any pointers, including interface objects, function -// objects, slices, maps, channels, unsafe.Pointer, and arrays or structs -// containing any of the above. An init() function will panic if this property -// does not hold. -type Value struct{} - -// SeqAtomicLoad returns a copy of *ptr, ensuring that the read does not race -// with any writer critical sections in sc. -func SeqAtomicLoad(sc *syncutil.SeqCount, ptr *Value) Value { - // This function doesn't use SeqAtomicTryLoad because doing so is - // measurably, significantly (~20%) slower; Go is awful at inlining. - var val Value - for { - epoch := sc.BeginRead() - if syncutil.RaceEnabled { - // runtime.RaceDisable() doesn't actually stop the race detector, - // so it can't help us here. Instead, call runtime.memmove - // directly, which is not instrumented by the race detector. - syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - // This is ~40% faster for short reads than going through memmove. - val = *ptr - } - if sc.ReadOk(epoch) { - break - } - } - return val -} - -// SeqAtomicTryLoad returns a copy of *ptr while in a reader critical section -// in sc initiated by a call to sc.BeginRead() that returned epoch. If the read -// would race with a writer critical section, SeqAtomicTryLoad returns -// (unspecified, false). -func SeqAtomicTryLoad(sc *syncutil.SeqCount, epoch syncutil.SeqCountEpoch, ptr *Value) (Value, bool) { - var val Value - if syncutil.RaceEnabled { - syncutil.Memmove(unsafe.Pointer(&val), unsafe.Pointer(ptr), unsafe.Sizeof(val)) - } else { - val = *ptr - } - return val, sc.ReadOk(epoch) -} - -func init() { - var val Value - typ := reflect.TypeOf(val) - name := typ.Name() - if ptrs := syncutil.PointersInType(typ, name); len(ptrs) != 0 { - panic(fmt.Sprintf("SeqAtomicLoad<%s> is invalid since values %s of type %s contain pointers:\n%s", typ, name, typ, strings.Join(ptrs, "\n"))) - } -} diff --git a/pkg/syncutil/seqatomictest/BUILD b/pkg/syncutil/seqatomictest/BUILD deleted file mode 100644 index ba18f3238..000000000 --- a/pkg/syncutil/seqatomictest/BUILD +++ /dev/null @@ -1,35 +0,0 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "seqatomic_int", - out = "seqatomic_int_unsafe.go", - package = "seqatomic", - suffix = "Int", - template = "//pkg/syncutil:generic_seqatomic", - types = { - "Value": "int", - }, -) - -go_library( - name = "seqatomic", - srcs = ["seqatomic_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/syncutil/seqatomic", - deps = [ - "//pkg/syncutil", - ], -) - -go_test( - name = "seqatomic_test", - size = "small", - srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], - deps = [ - "//pkg/syncutil", - ], -) diff --git a/pkg/syncutil/seqatomictest/seqatomic_test.go b/pkg/syncutil/seqatomictest/seqatomic_test.go deleted file mode 100644 index b0db44999..000000000 --- a/pkg/syncutil/seqatomictest/seqatomic_test.go +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package seqatomic - -import ( - "sync/atomic" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/syncutil" -) - -func TestSeqAtomicLoadUncontended(t *testing.T) { - var seq syncutil.SeqCount - const want = 1 - data := want - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadAfterWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicLoadDuringWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - if got := SeqAtomicLoadInt(&seq, &data); got != want { - t.Errorf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } -} - -func TestSeqAtomicTryLoadUncontended(t *testing.T) { - var seq syncutil.SeqCount - const want = 1 - data := want - epoch := seq.BeginRead() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - t.Errorf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } -} - -func TestSeqAtomicTryLoadDuringWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } - seq.EndWrite() -} - -func TestSeqAtomicTryLoadAfterWrite(t *testing.T) { - var seq syncutil.SeqCount - var data int - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); ok { - t.Errorf("SeqAtomicTryLoadInt: got (%v, true), wanted (_, false)", got) - } -} - -func BenchmarkSeqAtomicLoadIntUncontended(b *testing.B) { - var seq syncutil.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := SeqAtomicLoadInt(&seq, &data); got != want { - b.Fatalf("SeqAtomicLoadInt: got %v, wanted %v", got, want) - } - } - }) -} - -func BenchmarkSeqAtomicTryLoadIntUncontended(b *testing.B) { - var seq syncutil.SeqCount - const want = 42 - data := want - b.RunParallel(func(pb *testing.PB) { - epoch := seq.BeginRead() - for pb.Next() { - if got, ok := SeqAtomicTryLoadInt(&seq, epoch, &data); !ok || got != want { - b.Fatalf("SeqAtomicTryLoadInt: got (%v, %v), wanted (%v, true)", got, ok, want) - } - } - }) -} - -// For comparison: -func BenchmarkAtomicValueLoadIntUncontended(b *testing.B) { - var a atomic.Value - const want = 42 - a.Store(int(want)) - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - if got := a.Load().(int); got != want { - b.Fatalf("atomic.Value.Load: got %v, wanted %v", got, want) - } - } - }) -} diff --git a/pkg/syncutil/seqcount.go b/pkg/syncutil/seqcount.go deleted file mode 100644 index 11d8dbfaa..000000000 --- a/pkg/syncutil/seqcount.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syncutil - -import ( - "fmt" - "reflect" - "runtime" - "sync/atomic" -) - -// SeqCount is a synchronization primitive for optimistic reader/writer -// synchronization in cases where readers can work with stale data and -// therefore do not need to block writers. -// -// Compared to sync/atomic.Value: -// -// - Mutation of SeqCount-protected data does not require memory allocation, -// whereas atomic.Value generally does. This is a significant advantage when -// writes are common. -// -// - Atomic reads of SeqCount-protected data require copying. This is a -// disadvantage when atomic reads are common. -// -// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other -// operations to be made atomic with reads of SeqCount-protected data. -// -// - SeqCount may be less flexible: as of this writing, SeqCount-protected data -// cannot include pointers. -// -// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected -// data require instantiating function templates using go_generics (see -// seqatomic.go). -type SeqCount struct { - // epoch is incremented by BeginWrite and EndWrite, such that epoch is odd - // if a writer critical section is active, and a read from data protected - // by this SeqCount is atomic iff epoch is the same even value before and - // after the read. - epoch uint32 -} - -// SeqCountEpoch tracks writer critical sections in a SeqCount. -type SeqCountEpoch struct { - val uint32 -} - -// We assume that: -// -// - All functions in sync/atomic that perform a memory read are at least a -// read fence: memory reads before calls to such functions cannot be reordered -// after the call, and memory reads after calls to such functions cannot be -// reordered before the call, even if those reads do not use sync/atomic. -// -// - All functions in sync/atomic that perform a memory write are at least a -// write fence: memory writes before calls to such functions cannot be -// reordered after the call, and memory writes after calls to such functions -// cannot be reordered before the call, even if those writes do not use -// sync/atomic. -// -// As of this writing, the Go memory model completely fails to describe -// sync/atomic, but these properties are implied by -// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8. - -// BeginRead indicates the beginning of a reader critical section. Reader -// critical sections DO NOT BLOCK writer critical sections, so operations in a -// reader critical section MAY RACE with writer critical sections. Races are -// detected by ReadOk at the end of the reader critical section. Thus, the -// low-level structure of readers is generally: -// -// for { -// epoch := seq.BeginRead() -// // do something idempotent with seq-protected data -// if seq.ReadOk(epoch) { -// break -// } -// } -// -// However, since reader critical sections may race with writer critical -// sections, the Go race detector will (accurately) flag data races in readers -// using this pattern. Most users of SeqCount will need to use the -// SeqAtomicLoad function template in seqatomic.go. -func (s *SeqCount) BeginRead() SeqCountEpoch { - epoch := atomic.LoadUint32(&s.epoch) - for epoch&1 != 0 { - runtime.Gosched() - epoch = atomic.LoadUint32(&s.epoch) - } - return SeqCountEpoch{epoch} -} - -// ReadOk returns true if the reader critical section initiated by a previous -// call to BeginRead() that returned epoch did not race with any writer critical -// sections. -// -// ReadOk may be called any number of times during a reader critical section. -// Reader critical sections do not need to be explicitly terminated; the last -// call to ReadOk is implicitly the end of the reader critical section. -func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool { - return atomic.LoadUint32(&s.epoch) == epoch.val -} - -// BeginWrite indicates the beginning of a writer critical section. -// -// SeqCount does not support concurrent writer critical sections; clients with -// concurrent writers must synchronize them using e.g. sync.Mutex. -func (s *SeqCount) BeginWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 { - panic("SeqCount.BeginWrite during writer critical section") - } -} - -// EndWrite ends the effect of a preceding BeginWrite. -func (s *SeqCount) EndWrite() { - if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 { - panic("SeqCount.EndWrite outside writer critical section") - } -} - -// PointersInType returns a list of pointers reachable from values named -// valName of the given type. -// -// PointersInType is not exhaustive, but it is guaranteed that if typ contains -// at least one pointer, then PointersInTypeOf returns a non-empty list. -func PointersInType(typ reflect.Type, valName string) []string { - switch kind := typ.Kind(); kind { - case reflect.Bool, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.Float32, reflect.Float64, reflect.Complex64, reflect.Complex128: - return nil - - case reflect.Chan, reflect.Func, reflect.Interface, reflect.Map, reflect.Ptr, reflect.Slice, reflect.String, reflect.UnsafePointer: - return []string{valName} - - case reflect.Array: - return PointersInType(typ.Elem(), valName+"[]") - - case reflect.Struct: - var ptrs []string - for i, n := 0, typ.NumField(); i < n; i++ { - field := typ.Field(i) - ptrs = append(ptrs, PointersInType(field.Type, fmt.Sprintf("%s.%s", valName, field.Name))...) - } - return ptrs - - default: - return []string{fmt.Sprintf("%s (of type %s with unknown kind %s)", valName, typ, kind)} - } -} diff --git a/pkg/syncutil/seqcount_test.go b/pkg/syncutil/seqcount_test.go deleted file mode 100644 index 14d6aedea..000000000 --- a/pkg/syncutil/seqcount_test.go +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package syncutil - -import ( - "reflect" - "testing" - "time" -) - -func TestSeqCountWriteUncontended(t *testing.T) { - var seq SeqCount - seq.BeginWrite() - seq.EndWrite() -} - -func TestSeqCountReadUncontended(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadAfterWrite(t *testing.T) { - var seq SeqCount - var data int32 - const want = 1 - seq.BeginWrite() - data = want - seq.EndWrite() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountBeginReadDuringWrite(t *testing.T) { - var seq SeqCount - var data int - const want = 1 - seq.BeginWrite() - go func() { - time.Sleep(time.Second) - data = want - seq.EndWrite() - }() - epoch := seq.BeginRead() - if data != want { - t.Errorf("Reader: got %v, wanted %v", data, want) - } - if !seq.ReadOk(epoch) { - t.Errorf("ReadOk: got false, wanted true") - } -} - -func TestSeqCountReadOkAfterWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - seq.EndWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } -} - -func TestSeqCountReadOkDuringWrite(t *testing.T) { - var seq SeqCount - epoch := seq.BeginRead() - seq.BeginWrite() - if seq.ReadOk(epoch) { - t.Errorf("ReadOk: got true, wanted false") - } - seq.EndWrite() -} - -func BenchmarkSeqCountWriteUncontended(b *testing.B) { - var seq SeqCount - for i := 0; i < b.N; i++ { - seq.BeginWrite() - seq.EndWrite() - } -} - -func BenchmarkSeqCountReadUncontended(b *testing.B) { - var seq SeqCount - b.RunParallel(func(pb *testing.PB) { - for pb.Next() { - epoch := seq.BeginRead() - if !seq.ReadOk(epoch) { - b.Fatalf("ReadOk: got false, wanted true") - } - } - }) -} - -func TestPointersInType(t *testing.T) { - for _, test := range []struct { - name string // used for both test and value name - val interface{} - ptrs []string - }{ - { - name: "EmptyStruct", - val: struct{}{}, - }, - { - name: "Int", - val: int(0), - }, - { - name: "MixedStruct", - val: struct { - b bool - I int - ExportedPtr *struct{} - unexportedPtr *struct{} - arr [2]int - ptrArr [2]*int - nestedStruct struct { - nestedNonptr int - nestedPtr *int - } - structArr [1]struct { - nonptr int - ptr *int - } - }{}, - ptrs: []string{ - "MixedStruct.ExportedPtr", - "MixedStruct.unexportedPtr", - "MixedStruct.ptrArr[]", - "MixedStruct.nestedStruct.nestedPtr", - "MixedStruct.structArr[].ptr", - }, - }, - } { - t.Run(test.name, func(t *testing.T) { - typ := reflect.TypeOf(test.val) - ptrs := PointersInType(typ, test.name) - t.Logf("Found pointers: %v", ptrs) - if (len(ptrs) != 0 || len(test.ptrs) != 0) && !reflect.DeepEqual(ptrs, test.ptrs) { - t.Errorf("Got %v, wanted %v", ptrs, test.ptrs) - } - }) - } -} diff --git a/pkg/syncutil/syncutil.go b/pkg/syncutil/syncutil.go deleted file mode 100644 index 66e750d06..000000000 --- a/pkg/syncutil/syncutil.go +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package syncutil provides synchronization primitives. -package syncutil diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index e07ebd153..db06d02c6 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -15,6 +15,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip/buffer", "//pkg/tcpip/iptables", "//pkg/waiter", diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 78df5a0b1..3df7d18d3 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/stack", diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index cd6ce930a..a2f44b496 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -20,9 +20,9 @@ import ( "errors" "io" "net" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/stack" diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 897c94821..66cc53ed4 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -16,6 +16,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index fa8a703d9..b7f60178e 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -41,10 +41,10 @@ package fdbased import ( "fmt" - "sync" "syscall" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index a4f9cdd69..09165dd4c 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -15,6 +15,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", @@ -31,6 +32,7 @@ go_test( ], embed = [":sharedmem"], deps = [ + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index 6b5bc542c..a0d4ad0be 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -21,4 +21,5 @@ go_test( "pipe_test.go", ], embed = [":pipe"], + deps = ["//pkg/sync"], ) diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go index 59ef69a8b..dc239a0d0 100644 --- a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go +++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go @@ -18,8 +18,9 @@ import ( "math/rand" "reflect" "runtime" - "sync" "testing" + + "gvisor.dev/gvisor/pkg/sync" ) func TestSimpleReadWrite(t *testing.T) { diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 080f9d667..655e537c4 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -23,11 +23,11 @@ package sharedmem import ( - "sync" "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 89603c48f..5c729a439 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -22,11 +22,11 @@ import ( "math/rand" "os" "strings" - "sync" "syscall" "testing" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index acf1e022c..ed16076fd 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -28,6 +28,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/log", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", ], diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index 6da5238ec..92f2aa13a 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -19,9 +19,9 @@ package fragmentation import ( "fmt" "log" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 9e002e396..0a83d81f2 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -18,9 +18,9 @@ import ( "container/heap" "fmt" "math" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index e156b01f6..a6ef3bdcc 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -9,6 +9,7 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/tcpip/ports", visibility = ["//visibility:public"], deps = [ + "//pkg/sync", "//pkg/tcpip", ], ) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index 6c5e19e8f..b937cb84b 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -18,9 +18,9 @@ package ports import ( "math" "math/rand" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 826fca4de..6a8654105 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/ilist", "//pkg/rand", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", @@ -80,6 +81,7 @@ go_test( embed = [":stack"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", ], ) diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 267df60d1..403557fd7 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -16,10 +16,10 @@ package stack import ( "fmt" - "sync" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 9946b8fe8..1baa498d0 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -16,12 +16,12 @@ package stack import ( "fmt" - "sync" "sync/atomic" "testing" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 3810c6602..fe557ccbd 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -16,9 +16,9 @@ package stack import ( "strings" - "sync" "sync/atomic" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 41bf9fd9b..a47ceba54 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -21,13 +21,13 @@ package stack import ( "encoding/binary" - "sync" "sync/atomic" "time" "golang.org/x/time/rate" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 67c21be42..f384a91de 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -18,8 +18,8 @@ import ( "fmt" "math/rand" "sort" - "sync" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index 72b5ce179..4a090ac86 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -35,10 +35,10 @@ import ( "reflect" "strconv" "strings" - "sync" "sync/atomic" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index d8c5b5058..3aa23d529 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -28,6 +28,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index c7ce74cdd..330786f4c 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -15,8 +15,7 @@ package icmp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD index 44b58ff6b..4858d150c 100644 --- a/pkg/tcpip/transport/packet/BUILD +++ b/pkg/tcpip/transport/packet/BUILD @@ -28,6 +28,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 07ffa8aba..fc5bc69fa 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -25,8 +25,7 @@ package packet import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 00991ac8e..2f2131ff7 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -29,6 +29,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index 85f7eb76b..ee9c4c58b 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -26,8 +26,7 @@ package raw import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 3b353d56c..353bd06f4 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -48,6 +48,7 @@ go_library( "//pkg/log", "//pkg/rand", "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/hash/jenkins", diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index 5422ae80c..1ea996936 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -19,11 +19,11 @@ import ( "encoding/binary" "hash" "io" - "sync" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index cdd69f360..613ec1775 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -16,11 +16,11 @@ package tcp import ( "encoding/binary" - "sync" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index 830bc1e3e..cca511fb9 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -19,12 +19,12 @@ import ( "fmt" "math" "strings" - "sync" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index 7aa4c3f0e..4b8d867bc 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -16,9 +16,9 @@ package tcp import ( "fmt" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 4983bca81..7eb613be5 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -15,8 +15,7 @@ package tcp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index bc718064c..9a8f64aa6 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -22,9 +22,9 @@ package tcp import ( "strings" - "sync" "time" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go index e0759225e..bd20a7ee9 100644 --- a/pkg/tcpip/transport/tcp/segment_queue.go +++ b/pkg/tcpip/transport/tcp/segment_queue.go @@ -15,7 +15,7 @@ package tcp import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // segmentQueue is a bounded, thread-safe queue of TCP segments. diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 8a947dc66..79f2d274b 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -16,11 +16,11 @@ package tcp import ( "math" - "sync" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 97e4d5825..57ff123e3 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -30,6 +30,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/sleep", + "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 864dc8733..a4ff29a7d 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,8 +15,7 @@ package udp import ( - "sync" - + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index 6afdb29b7..07778e4f7 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -15,4 +15,5 @@ go_test( size = "medium", srcs = ["tmutex_test.go"], embed = [":tmutex"], + deps = ["//pkg/sync"], ) diff --git a/pkg/tmutex/tmutex_test.go b/pkg/tmutex/tmutex_test.go index ce34c7962..05540696a 100644 --- a/pkg/tmutex/tmutex_test.go +++ b/pkg/tmutex/tmutex_test.go @@ -17,10 +17,11 @@ package tmutex import ( "fmt" "runtime" - "sync" "sync/atomic" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func TestBasicLock(t *testing.T) { diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index 8f6f180e5..d1885ae66 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -24,4 +24,5 @@ go_test( "unet_test.go", ], embed = [":unet"], + deps = ["//pkg/sync"], ) diff --git a/pkg/unet/unet_test.go b/pkg/unet/unet_test.go index a3cc6f5d3..5c4b9e8e9 100644 --- a/pkg/unet/unet_test.go +++ b/pkg/unet/unet_test.go @@ -19,10 +19,11 @@ import ( "os" "path/filepath" "reflect" - "sync" "syscall" "testing" "time" + + "gvisor.dev/gvisor/pkg/sync" ) func randomFilename() (string, error) { diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index b6bbb0ea2..b8fdc3125 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -11,6 +11,7 @@ go_library( deps = [ "//pkg/fd", "//pkg/log", + "//pkg/sync", "//pkg/unet", ], ) diff --git a/pkg/urpc/urpc.go b/pkg/urpc/urpc.go index df59ffab1..13b2ea314 100644 --- a/pkg/urpc/urpc.go +++ b/pkg/urpc/urpc.go @@ -27,10 +27,10 @@ import ( "os" "reflect" "runtime" - "sync" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 0427bc41f..1c6890e52 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -24,6 +24,7 @@ go_library( ], importpath = "gvisor.dev/gvisor/pkg/waiter", visibility = ["//visibility:public"], + deps = ["//pkg/sync"], ) go_test( diff --git a/pkg/waiter/waiter.go b/pkg/waiter/waiter.go index 8a65ed164..f708e95fa 100644 --- a/pkg/waiter/waiter.go +++ b/pkg/waiter/waiter.go @@ -58,7 +58,7 @@ package waiter import ( - "sync" + "gvisor.dev/gvisor/pkg/sync" ) // EventMask represents io events as used in the poll() syscall. diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 6226b63f8..3e20f8f2f 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -74,6 +74,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/usermem", "//pkg/sentry/watchdog", + "//pkg/sync", "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/link/fdbased", @@ -114,6 +115,7 @@ go_test( "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/compat.go b/runsc/boot/compat.go index 352e710d2..9c23b9553 100644 --- a/runsc/boot/compat.go +++ b/runsc/boot/compat.go @@ -17,7 +17,6 @@ package boot import ( "fmt" "os" - "sync" "syscall" "github.com/golang/protobuf/proto" @@ -27,6 +26,7 @@ import ( ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" "gvisor.dev/gvisor/pkg/sentry/strace" spb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" + "gvisor.dev/gvisor/pkg/sync" ) func initCompatLogs(fd int) error { diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index d1c0bb9b5..ce62236e5 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,12 +16,12 @@ package boot import ( "fmt" - "sync" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sync" ) // Mapping from linux resource names to limits.LimitType. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index bc1d0c1bb..fad72f4ab 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -20,7 +20,6 @@ import ( mrand "math/rand" "os" "runtime" - "sync" "sync/atomic" "syscall" gtime "time" @@ -46,6 +45,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/watchdog" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 147ff7703..bec0dc292 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -19,7 +19,6 @@ import ( "math/rand" "os" "reflect" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 250845ad7..b94bc4fa0 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -44,6 +44,7 @@ go_library( "//pkg/sentry/control", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", diff --git a/runsc/cmd/create.go b/runsc/cmd/create.go index a4e3071b3..1815c93b9 100644 --- a/runsc/cmd/create.go +++ b/runsc/cmd/create.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/boot" diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 4831210c0..7df7995f0 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -21,7 +21,6 @@ import ( "os" "path/filepath" "strings" - "sync" "syscall" "flag" @@ -30,6 +29,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/fsgofer" diff --git a/runsc/cmd/start.go b/runsc/cmd/start.go index de2115dff..5e9bc53ab 100644 --- a/runsc/cmd/start.go +++ b/runsc/cmd/start.go @@ -16,6 +16,7 @@ package cmd import ( "context" + "flag" "github.com/google/subcommands" "gvisor.dev/gvisor/runsc/boot" diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 2bd12120d..6dea179e4 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -18,6 +18,7 @@ go_library( deps = [ "//pkg/log", "//pkg/sentry/control", + "//pkg/sync", "//runsc/boot", "//runsc/cgroup", "//runsc/sandbox", @@ -53,6 +54,7 @@ go_test( "//pkg/sentry/control", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sync", "//pkg/unet", "//pkg/urpc", "//runsc/boot", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index 5ed131a7f..060b63bf3 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -20,7 +20,6 @@ import ( "io" "os" "path/filepath" - "sync" "syscall" "testing" "time" @@ -29,6 +28,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/testutil" diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index c10f85992..b54d8f712 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -26,7 +26,6 @@ import ( "reflect" "strconv" "strings" - "sync" "syscall" "testing" "time" @@ -39,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index 4ad09ceab..2da93ec5b 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -22,7 +22,6 @@ import ( "path" "path/filepath" "strings" - "sync" "syscall" "testing" "time" @@ -30,6 +29,7 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" "gvisor.dev/gvisor/runsc/testutil" diff --git a/runsc/container/state_file.go b/runsc/container/state_file.go index d95151ea5..17a251530 100644 --- a/runsc/container/state_file.go +++ b/runsc/container/state_file.go @@ -20,10 +20,10 @@ import ( "io/ioutil" "os" "path/filepath" - "sync" "github.com/gofrs/flock" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" ) const stateFileExtension = ".state" diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index afcb41801..a9582d92b 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/fd", "//pkg/log", "//pkg/p9", + "//pkg/sync", "//pkg/syserr", "//runsc/specutils", "@org_golang_x_sys//unix:go_default_library", diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index b59e1a70e..93606d051 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -29,7 +29,6 @@ import ( "path/filepath" "runtime" "strconv" - "sync" "syscall" "golang.org/x/sys/unix" @@ -37,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/specutils" ) diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index 8001949d5..ddbc37456 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -19,6 +19,7 @@ go_library( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/platform", + "//pkg/sync", "//pkg/tcpip/header", "//pkg/tcpip/stack", "//pkg/urpc", diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go index ce1452b87..ec72bdbfd 100644 --- a/runsc/sandbox/sandbox.go +++ b/runsc/sandbox/sandbox.go @@ -22,7 +22,6 @@ import ( "os" "os/exec" "strconv" - "sync" "syscall" "time" @@ -34,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/urpc" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index c96ca2eb6..3c3027cb5 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -10,6 +10,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/sync", "//runsc/boot", "//runsc/specutils", "@com_github_cenkalti_backoff//:go_default_library", diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go index 9632776d2..fb22eae39 100644 --- a/runsc/testutil/testutil.go +++ b/runsc/testutil/testutil.go @@ -34,7 +34,6 @@ import ( "path/filepath" "strconv" "strings" - "sync" "sync/atomic" "syscall" "time" @@ -42,6 +41,7 @@ import ( "github.com/cenkalti/backoff" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" ) -- cgit v1.2.3 From d6fb1ec6c7c76040dd20e915b32f9ed795ae7077 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 15 Jan 2020 16:31:24 -0800 Subject: Add timestamps to VFS2 tmpfs, and implement some of SetStat. PiperOrigin-RevId: 289962040 --- pkg/abi/linux/time.go | 13 ++ pkg/sentry/fsimpl/tmpfs/BUILD | 2 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 16 +- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 129 ++++++++++++--- pkg/sentry/fsimpl/tmpfs/stat_test.go | 232 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 72 +++++++-- 6 files changed, 425 insertions(+), 39 deletions(-) create mode 100644 pkg/sentry/fsimpl/tmpfs/stat_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index 546668bca..5c5a58cd4 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -234,6 +234,19 @@ type StatxTimestamp struct { _ int32 } +// ToNsec returns the nanosecond representation. +func (sxts StatxTimestamp) ToNsec() int64 { + return int64(sxts.Sec)*1e9 + int64(sxts.Nsec) +} + +// ToNsecCapped returns the safe nanosecond representation. +func (sxts StatxTimestamp) ToNsecCapped() int64 { + if sxts.Sec > maxSecInDuration { + return math.MaxInt64 + } + return sxts.ToNsec() +} + // NsecToStatxTimestamp translates nanoseconds to StatxTimestamp. func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { return StatxTimestamp{ diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 82f5c2f41..7601c7c04 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -40,6 +40,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", @@ -77,6 +78,7 @@ go_test( srcs = [ "pipe_test.go", "regular_file_test.go", + "stat_test.go", ], embed = [":tmpfs"], deps = [ diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 26979729e..4cd7e9aea 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -56,7 +56,8 @@ afterSymlink: } next := nextVFSD.Impl().(*dentry) if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO: symlink traversals update access time + // TODO(gvisor.dev/issues/1197): Symlink traversals updates + // access time. if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -501,7 +502,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.inode.decLinksLocked() newParent.inode.incLinksLocked() } - // TODO: update timestamps and parent directory sizes + // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory + // sizes. vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -555,15 +557,11 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM + return d.inode.setStat(opts.Stat) } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -587,7 +585,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issues/1197): Actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 3731c5b6f..7b0a962f0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "io" + "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/abi/linux" @@ -29,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If -// the returned err is not nil, then cleanup should be called when the FD is no -// longer needed. -func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) { +// nextFileID is used to generate unique file names. +var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error +// is not nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { creds := auth.CredentialsFromContext(ctx) vfsObj := vfs.New() @@ -41,36 +44,124 @@ func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func }) mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { - return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) } root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef(vfsObj) + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) // Create the file that will be write/read. fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - FollowFinalSymlink: true, + Root: root, + Start: root, + Path: fspath.Parse(filename), }, &vfs.OpenOptions{ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: 0644, + Mode: linux.ModeRegular | mode, }) if err != nil { - root.DecRef() - mntns.DecRef(vfsObj) + cleanup() return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) } - return fd, func() { - root.DecRef() - mntns.DecRef(vfsObj) - }, nil + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. +func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. +func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the pipe. + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(pipename), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err) + } + + // Open the pipe and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(pipename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err) + } + + return fd, cleanup, nil } // Test that we can write some data to a file and read it back.` func TestSimpleWriteRead(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "simpleReadWrite") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } @@ -116,7 +207,7 @@ func TestSimpleWriteRead(t *testing.T) { func TestPWrite(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "PRead") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } @@ -171,7 +262,7 @@ func TestPWrite(t *testing.T) { func TestPRead(t *testing.T) { ctx := contexttest.Context(t) - fd, cleanup, err := newFileFD(ctx, "PRead") + fd, cleanup, err := newFileFD(ctx, 0644) if err != nil { t.Fatal(err) } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go new file mode 100644 index 000000000..ebe035dee --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -0,0 +1,232 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func TestStatAfterCreate(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + got, err := fd.Stat(ctx, vfs.StatOptions{}) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Atime, Ctime, Mtime should all be current time (non-zero). + atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec() + if atime != ctime || ctime != mtime { + t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime) + } + if atime == 0 { + t.Errorf("got atime=%d, want non-zero", atime) + } + + // Btime should be 0, as it is not set by tmpfs. + if btime := got.Btime.ToNsec(); btime != 0 { + t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) + } + + // Size should be 0. + if got.Size != 0 { + t.Errorf("got size %d, want 0", got.Size) + } + + // Nlink should be 1 for files, 2 for dirs. + wantNlink := uint32(1) + if typ == "dir" { + wantNlink = 2 + } + if got.Nlink != wantNlink { + t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink) + } + + // UID and GID are set from context creds. + creds := auth.CredentialsFromContext(ctx) + if got.UID != uint32(creds.EffectiveKUID) { + t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID)) + } + if got.GID != uint32(creds.EffectiveKGID) { + t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID)) + } + + // Mode. + wantMode := uint16(mode) + switch typ { + case "file": + wantMode |= linux.S_IFREG + case "dir": + wantMode |= linux.S_IFDIR + case "pipe": + wantMode |= linux.S_IFIFO + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + + if got.Mode != wantMode { + t.Errorf("got mode %x, want %x", got.Mode, wantMode) + } + + // Ino. + if got.Ino == 0 { + t.Errorf("got ino %d, want not 0", got.Ino) + } + }) + } +} + +func TestSetStatAtime(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v") + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v") + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } +} + +func TestSetStat(t *testing.T) { + ctx := contexttest.Context(t) + mode := linux.FileMode(0644) + + // Run with different file types. + // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + for _, typ := range []string{"file", "dir", "pipe"} { + t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { + var ( + fd *vfs.FileDescription + cleanup func() + err error + ) + switch typ { + case "file": + fd, cleanup, err = newFileFD(ctx, mode) + case "dir": + fd, cleanup, err = newDirFD(ctx, mode) + case "pipe": + fd, cleanup, err = newPipeFD(ctx, mode) + default: + panic(fmt.Sprintf("unknown typ %q", typ)) + } + if err != nil { + t.Fatal(err) + } + defer cleanup() + + allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL} + + // Get initial stat. + initialStat, err := fd.Stat(ctx, allStatOptions) + if err != nil { + t.Fatalf("Stat failed: %v", err) + } + + // Set atime, but without the mask. + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{ + Mask: 0, + Atime: linux.NsecToStatxTimestamp(100), + }}); err != nil { + t.Errorf("SetStat atime without mask failed: %v") + } + // Atime should be unchanged. + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != initialStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime) + } + + // Set atime, this time included in the mask. + setStat := linux.Statx{ + Mask: linux.STATX_ATIME, + Atime: linux.NsecToStatxTimestamp(100), + } + if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { + t.Errorf("SetStat atime with mask failed: %v") + } + if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { + t.Errorf("Stat got error: %v", err) + } else if gotStat.Atime != setStat.Atime { + t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime) + } + }) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 701826f90..d6960ee47 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -31,10 +31,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" ) // FilesystemType implements vfs.FilesystemType. @@ -47,6 +47,9 @@ type filesystem struct { // memFile is used to allocate pages to for regular files. memFile *pgalloc.MemoryFile + // clock is a realtime clock used to set timestamps in file operations. + clock time.Clock + // mu serializes changes to the Dentry tree. mu sync.RWMutex @@ -59,8 +62,10 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } + clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ memFile: memFileProvider.MemoryFile(), + clock: clock, } fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) @@ -126,26 +131,36 @@ type inode struct { // filesystem.RmdirAt() drops the reference. refs int64 - // Inode metadata; protected by mu and accessed using atomic memory - // operations unless otherwise specified. - mu sync.RWMutex + // Inode metadata. Writing multiple fields atomically requires holding + // mu, othewise atomic operations can be used. + mu sync.Mutex mode uint32 // excluding file type bits, which are based on impl nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... ino uint64 // immutable + // Linux's tmpfs has no concept of btime. + atime int64 // nanoseconds + ctime int64 // nanoseconds + mtime int64 // nanoseconds + impl interface{} // immutable } const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { + now := fs.clock.Now().Nanoseconds() i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // Tmpfs creation sets atime, ctime, and mtime to current time. + i.atime = now + i.ctime = now + i.mtime = now // i.nlink initialized by caller i.impl = impl } @@ -213,15 +228,24 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, i // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. +// +// Note that Linux does not guarantee to return consistent data (in the case of +// a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { - stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | + linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = 1 // usermem.PageSize in tmpfs stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - // TODO: device number + // Linux's tmpfs has no concept of btime, so zero-value is returned. + stat.Atime = linux.NsecToStatxTimestamp(i.atime) + stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) + stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) + // TODO(gvisor.dev/issues/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mode |= linux.S_IFREG @@ -245,6 +269,36 @@ func (i *inode) statTo(stat *linux.Statx) { } } +func (i *inode) setStat(stat linux.Statx) error { + // TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking + // the file. + if stat.Mask == 0 { + return nil + } + i.mu.Lock() + mask := stat.Mask + if mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + } + if mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&i.uid, stat.UID) + } + if mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&i.gid, stat.GID) + } + if mask&linux.STATX_ATIME != 0 { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } + if mask&linux.STATX_CTIME != 0 { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } + if mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } + i.mu.Unlock() + return nil +} + // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block @@ -291,9 +345,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if opts.Stat.Mask == 0 { - return nil - } - // TODO: implement inode.setStat - return syserror.EPERM + return fd.inode().setStat(opts.Stat) } -- cgit v1.2.3 From 7b7c31820b83abcfe43f7170eff1f7953f3f27e2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Jan 2020 12:28:44 -0800 Subject: Add remaining /proc/* and /proc/sys/* files Except for one under /proc/sys/net/ipv4/tcp_sack. /proc/pid/* is still incomplete. Updates #1195 PiperOrigin-RevId: 290120438 --- pkg/cpuid/cpuid.go | 44 ++-- pkg/sentry/fs/proc/cpuinfo.go | 8 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 40 ++++ pkg/sentry/fsimpl/proc/BUILD | 11 +- pkg/sentry/fsimpl/proc/filesystem.go | 11 + pkg/sentry/fsimpl/proc/loadavg.go | 42 ---- pkg/sentry/fsimpl/proc/meminfo.go | 79 ------- pkg/sentry/fsimpl/proc/net.go | 338 ---------------------------- pkg/sentry/fsimpl/proc/net_test.go | 78 ------- pkg/sentry/fsimpl/proc/stat.go | 129 ----------- pkg/sentry/fsimpl/proc/sys.go | 51 ----- pkg/sentry/fsimpl/proc/task.go | 12 +- pkg/sentry/fsimpl/proc/tasks.go | 41 +++- pkg/sentry/fsimpl/proc/tasks_files.go | 245 ++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks_net.go | 337 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks_sys.go | 143 ++++++++++++ pkg/sentry/fsimpl/proc/tasks_sys_test.go | 78 +++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 3 + pkg/sentry/fsimpl/proc/version.go | 70 ------ 19 files changed, 922 insertions(+), 838 deletions(-) delete mode 100644 pkg/sentry/fsimpl/proc/loadavg.go delete mode 100644 pkg/sentry/fsimpl/proc/meminfo.go delete mode 100644 pkg/sentry/fsimpl/proc/net.go delete mode 100644 pkg/sentry/fsimpl/proc/net_test.go delete mode 100644 pkg/sentry/fsimpl/proc/stat.go delete mode 100644 pkg/sentry/fsimpl/proc/sys.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys.go create mode 100644 pkg/sentry/fsimpl/proc/tasks_sys_test.go delete mode 100644 pkg/sentry/fsimpl/proc/version.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/cpuid/cpuid.go b/pkg/cpuid/cpuid.go index d37047368..cf50ee53f 100644 --- a/pkg/cpuid/cpuid.go +++ b/pkg/cpuid/cpuid.go @@ -657,30 +657,28 @@ func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string { return strings.Join(s, " ") } -// CPUInfo is to generate a section of one cpu in /proc/cpuinfo. This is a -// minimal /proc/cpuinfo, it is missing some fields like "microcode" that are +// WriteCPUInfoTo is to generate a section of one cpu in /proc/cpuinfo. This is +// a minimal /proc/cpuinfo, it is missing some fields like "microcode" that are // not always printed in Linux. The bogomips field is simply made up. -func (fs FeatureSet) CPUInfo(cpu uint) string { - var b bytes.Buffer - fmt.Fprintf(&b, "processor\t: %d\n", cpu) - fmt.Fprintf(&b, "vendor_id\t: %s\n", fs.VendorID) - fmt.Fprintf(&b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family) - fmt.Fprintf(&b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model) - fmt.Fprintf(&b, "model name\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(&b, "stepping\t: %s\n", "unknown") // Unknown for now. - fmt.Fprintf(&b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) - fmt.Fprintln(&b, "fpu\t\t: yes") - fmt.Fprintln(&b, "fpu_exception\t: yes") - fmt.Fprintf(&b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. - fmt.Fprintln(&b, "wp\t\t: yes") - fmt.Fprintf(&b, "flags\t\t: %s\n", fs.FlagsString(true)) - fmt.Fprintf(&b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. - fmt.Fprintf(&b, "clflush size\t: %d\n", fs.CacheLine) - fmt.Fprintf(&b, "cache_alignment\t: %d\n", fs.CacheLine) - fmt.Fprintf(&b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) - fmt.Fprintln(&b, "power management:") // This is always here, but can be blank. - fmt.Fprintln(&b, "") // The /proc/cpuinfo file ends with an extra newline. - return b.String() +func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) { + fmt.Fprintf(b, "processor\t: %d\n", cpu) + fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID) + fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family) + fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model) + fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now. + fmt.Fprintf(b, "stepping\t: %s\n", "unknown") // Unknown for now. + fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz) + fmt.Fprintln(b, "fpu\t\t: yes") + fmt.Fprintln(b, "fpu_exception\t: yes") + fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID. + fmt.Fprintln(b, "wp\t\t: yes") + fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true)) + fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway. + fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine) + fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine) + fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48) + fmt.Fprintln(b, "power management:") // This is always here, but can be blank. + fmt.Fprintln(b, "") // The /proc/cpuinfo file ends with an extra newline. } const ( diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index 3edf36780..6330337eb 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -15,6 +15,8 @@ package proc import ( + "bytes" + "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -27,9 +29,9 @@ func newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { // Kernel is always initialized with a FeatureSet. panic("cpuinfo read with nil FeatureSet") } - contents := make([]byte, 0, 1024) + var buf bytes.Buffer for i, max := uint(0), k.ApplicationCores(); i < max; i++ { - contents = append(contents, []byte(features.CPUInfo(i))...) + features.WriteCPUInfoTo(i, &buf) } - return newStaticProcInode(ctx, msrc, contents) + return newStaticProcInode(ctx, msrc, buf.Bytes()) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 1d469a0db..6aff3d39a 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -510,3 +510,43 @@ type InodeSymlink struct { func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } + +// StaticDirectory is a standard implementation of a directory with static +// contents. +// +// +stateify savable +type StaticDirectory struct { + InodeNotSymlink + InodeDirectoryNoNewChildren + InodeAttrs + InodeNoDynamicLookup + OrderedChildren +} + +var _ Inode = (*StaticDirectory)(nil) + +// NewStaticDir creates a new static directory and returns its dentry. +func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + + inode := &StaticDirectory{} + inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + + dentry := &Dentry{} + dentry.Init(inode) + + inode.OrderedChildren.Init(OrderedChildrenOptions{}) + links := inode.OrderedChildren.Populate(dentry, children) + inode.IncLinks(links) + + return dentry +} + +// Open implements kernfs.Inode. +func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 1f44b3217..6cd18cec8 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -7,17 +7,13 @@ go_library( name = "proc", srcs = [ "filesystem.go", - "loadavg.go", - "meminfo.go", "mounts.go", - "net.go", - "stat.go", - "sys.go", "task.go", "task_files.go", "tasks.go", "tasks_files.go", - "version.go", + "tasks_net.go", + "tasks_sys.go", ], importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = [ @@ -30,6 +26,7 @@ go_library( "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", "//pkg/sentry/socket", @@ -47,7 +44,7 @@ go_test( size = "small", srcs = [ "boot_test.go", - "net_test.go", + "tasks_sys_test.go", "tasks_test.go", ], embed = [":proc"], diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index d09182c77..e9cb7895f 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -67,3 +67,14 @@ func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode d d.Init(inode) return d } + +type staticFile struct { + kernfs.DynamicBytesFile + vfs.StaticData +} + +var _ dynamicInode = (*staticFile)(nil) + +func newStaticFile(data string) *staticFile { + return &staticFile{StaticData: vfs.StaticData{Data: data}} +} diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go deleted file mode 100644 index 5351d86e8..000000000 --- a/pkg/sentry/fsimpl/proc/loadavg.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" -) - -// loadavgData backs /proc/loadavg. -// -// +stateify savable -type loadavgData struct { - kernfs.DynamicBytesFile -} - -var _ dynamicInode = (*loadavgData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { - // TODO(b/62345059): Include real data in fields. - // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. - // Column 4-5: currently running processes and the total number of processes. - // Column 6: the last process ID used. - fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) - return nil -} diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go deleted file mode 100644 index cbdd4f3fc..000000000 --- a/pkg/sentry/fsimpl/proc/meminfo.go +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" -) - -// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. -// -// +stateify savable -type meminfoData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*meminfoData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - mf := d.k.MemoryFile() - mf.UpdateUsage() - snapshot, totalUsage := usage.MemoryAccounting.Copy() - totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) - anon := snapshot.Anonymous + snapshot.Tmpfs - file := snapshot.PageCache + snapshot.Mapped - // We don't actually have active/inactive LRUs, so just make up numbers. - activeFile := (file / 2) &^ (usermem.PageSize - 1) - inactiveFile := file - activeFile - - fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 - // We use MemFree as MemAvailable because we don't swap. - // TODO(rahat): When reclaim is implemented the value of MemAvailable - // should change. - fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) - fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices - fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) - // Emulate a system with no swap, which disables inactivation of anon pages. - fmt.Fprintf(buf, "SwapCache: 0 kB\n") - fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) - fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) - fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) - fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") - fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) - fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) - fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) - fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) - fmt.Fprintf(buf, "SwapTotal: 0 kB\n") - fmt.Fprintf(buf, "SwapFree: 0 kB\n") - fmt.Fprintf(buf, "Dirty: 0 kB\n") - fmt.Fprintf(buf, "Writeback: 0 kB\n") - fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) - fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know - fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) - return nil -} diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go deleted file mode 100644 index fd46eebf8..000000000 --- a/pkg/sentry/fsimpl/proc/net.go +++ /dev/null @@ -1,338 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/socket/unix" - "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. -// -// +stateify savable -type ifinet6 struct { - s inet.Stack -} - -var _ vfs.DynamicBytesSource = (*ifinet6)(nil) - -func (n *ifinet6) contents() []string { - var lines []string - nics := n.s.Interfaces() - for id, naddrs := range n.s.InterfaceAddrs() { - nic, ok := nics[id] - if !ok { - // NIC was added after NICNames was called. We'll just - // ignore it. - continue - } - - for _, a := range naddrs { - // IPv6 only. - if a.Family != linux.AF_INET6 { - continue - } - - // Fields: - // IPv6 address displayed in 32 hexadecimal chars without colons - // Netlink device number (interface index) in hexadecimal (use nic id) - // Prefix length in hexadecimal - // Scope value (use 0) - // Interface flags - // Device name - lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) - } - } - return lines -} - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { - for _, l := range n.contents() { - buf.WriteString(l) - } - return nil -} - -// netDev implements vfs.DynamicBytesSource for /proc/net/dev. -// -// +stateify savable -type netDev struct { - s inet.Stack -} - -var _ vfs.DynamicBytesSource = (*netDev)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { - interfaces := n.s.Interfaces() - buf.WriteString("Inter-| Receive | Transmit\n") - buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") - - for _, i := range interfaces { - // Implements the same format as - // net/core/net-procfs.c:dev_seq_printf_stats. - var stats inet.StatDev - if err := n.s.Statistics(&stats, i.Name); err != nil { - log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) - continue - } - fmt.Fprintf( - buf, - "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", - i.Name, - // Received - stats[0], // bytes - stats[1], // packets - stats[2], // errors - stats[3], // dropped - stats[4], // fifo - stats[5], // frame - stats[6], // compressed - stats[7], // multicast - // Transmitted - stats[8], // bytes - stats[9], // packets - stats[10], // errors - stats[11], // dropped - stats[12], // fifo - stats[13], // frame - stats[14], // compressed - stats[15], // multicast - ) - } - - return nil -} - -// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. -// -// +stateify savable -type netUnix struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*netUnix)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { - buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") - for _, se := range n.k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) - continue - } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() - // Not a unix socket. - continue - } - sops := sfile.FileOperations.(*unix.SocketOperations) - - addr, err := sops.Endpoint().GetLocalAddress() - if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) - addr.Addr = "" - } - - sockFlags := 0 - if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { - if ce.Listening() { - // For unix domain sockets, linux reports a single flag - // value if the socket is listening, of __SO_ACCEPTCON. - sockFlags = linux.SO_ACCEPTCON - } - } - - // In the socket entry below, the value for the 'Num' field requires - // some consideration. Linux prints the address to the struct - // unix_sock representing a socket in the kernel, but may redact the - // value for unprivileged users depending on the kptr_restrict - // sysctl. - // - // One use for this field is to allow a privileged user to - // introspect into the kernel memory to determine information about - // a socket not available through procfs, such as the socket's peer. - // - // In gvisor, returning a pointer to our internal structures would - // be pointless, as it wouldn't match the memory layout for struct - // unix_sock, making introspection difficult. We could populate a - // struct unix_sock with the appropriate data, but even that - // requires consideration for which kernel version to emulate, as - // the definition of this struct changes over time. - // - // For now, we always redact this pointer. - fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", - (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. - 0, // Protocol, always 0 for UDS. - sockFlags, // Flags. - sops.Endpoint().Type(), // Type. - sops.State(), // State. - sfile.InodeID(), // Inode. - ) - - // Path - if len(addr.Addr) != 0 { - if addr.Addr[0] == 0 { - // Abstract path. - fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) - } else { - fmt.Fprintf(buf, " %s", string(addr.Addr)) - } - } - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - return nil -} - -// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. -// -// +stateify savable -type netTCP struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*netTCP)(nil) - -func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { - t := kernel.TaskFromContext(ctx) - buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") - for _, se := range n.k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) - continue - } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) - if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) - } - if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { - s.DecRef() - // Not tcp4 sockets. - continue - } - - // Linux's documentation for the fields below can be found at - // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. - // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). - // Note that the header doesn't contain labels for all the fields. - - // Field: sl; entry number. - fmt.Fprintf(buf, "%4d: ", se.ID) - - portBuf := make([]byte, 2) - - // Field: local_adddress. - var localAddr linux.SockAddrInet - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) - } - binary.LittleEndian.PutUint16(portBuf, localAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(localAddr.Addr[:]), - portBuf) - - // Field: rem_address. - var remoteAddr linux.SockAddrInet - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) - } - binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(remoteAddr.Addr[:]), - portBuf) - - // Field: state; socket state. - fmt.Fprintf(buf, "%02X ", sops.State()) - - // Field: tx_queue, rx_queue; number of packets in the transmit and - // receive queue. Unimplemented. - fmt.Fprintf(buf, "%08X:%08X ", 0, 0) - - // Field: tr, tm->when; timer active state and number of jiffies - // until timer expires. Unimplemented. - fmt.Fprintf(buf, "%02X:%08X ", 0, 0) - - // Field: retrnsmt; number of unrecovered RTO timeouts. - // Unimplemented. - fmt.Fprintf(buf, "%08X ", 0) - - // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) - fmt.Fprintf(buf, "%5d ", 0) - } else { - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) - } - - // Field: timeout; number of unanswered 0-window probes. - // Unimplemented. - fmt.Fprintf(buf, "%8d ", 0) - - // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) - - // Field: refcount. Don't count the ref we obtain while deferencing - // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) - - // Field: Socket struct address. Redacted due to the same reason as - // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. - fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) - - // Field: retransmit timeout. Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: predicted tick of soft clock (delayed ACK control data). - // Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: sending congestion window, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. - // Unimplemented, report as large threshold. - fmt.Fprintf(buf, "%d", -1) - - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - - return nil -} diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go deleted file mode 100644 index 20a77a8ca..000000000 --- a/pkg/sentry/fsimpl/proc/net_test.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "reflect" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/inet" -) - -func newIPv6TestStack() *inet.TestStack { - s := inet.NewTestStack() - s.SupportsIPv6Flag = true - return s -} - -func TestIfinet6NoAddresses(t *testing.T) { - n := &ifinet6{s: newIPv6TestStack()} - var buf bytes.Buffer - n.Generate(contexttest.Context(t), &buf) - if buf.Len() > 0 { - t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) - } -} - -func TestIfinet6(t *testing.T) { - s := newIPv6TestStack() - s.InterfacesMap[1] = inet.Interface{Name: "eth0"} - s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ - { - Family: linux.AF_INET6, - PrefixLen: 128, - Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), - }, - } - s.InterfacesMap[2] = inet.Interface{Name: "eth1"} - s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ - { - Family: linux.AF_INET6, - PrefixLen: 128, - Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), - }, - } - want := map[string]struct{}{ - "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, - "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, - } - - n := &ifinet6{s: s} - contents := n.contents() - if len(contents) != len(want) { - t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) - } - got := map[string]struct{}{} - for _, l := range contents { - got[l] = struct{}{} - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("Got n.contents() = %v, want = %v", got, want) - } -} diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go deleted file mode 100644 index 50894a534..000000000 --- a/pkg/sentry/fsimpl/proc/stat.go +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// cpuStats contains the breakdown of CPU time for /proc/stat. -type cpuStats struct { - // user is time spent in userspace tasks with non-positive niceness. - user uint64 - - // nice is time spent in userspace tasks with positive niceness. - nice uint64 - - // system is time spent in non-interrupt kernel context. - system uint64 - - // idle is time spent idle. - idle uint64 - - // ioWait is time spent waiting for IO. - ioWait uint64 - - // irq is time spent in interrupt context. - irq uint64 - - // softirq is time spent in software interrupt context. - softirq uint64 - - // steal is involuntary wait time. - steal uint64 - - // guest is time spent in guests with non-positive niceness. - guest uint64 - - // guestNice is time spent in guests with positive niceness. - guestNice uint64 -} - -// String implements fmt.Stringer. -func (c cpuStats) String() string { - return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) -} - -// statData implements vfs.DynamicBytesSource for /proc/stat. -// -// +stateify savable -type statData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*statData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { - // TODO(b/37226836): We currently export only zero CPU stats. We could - // at least provide some aggregate stats. - var cpu cpuStats - fmt.Fprintf(buf, "cpu %s\n", cpu) - - for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { - fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) - } - - // The total number of interrupts is dependent on the CPUs and PCI - // devices on the system. See arch_probe_nr_irqs. - // - // Since we don't report real interrupt stats, just choose an arbitrary - // value from a representative VM. - const numInterrupts = 256 - - // The Kernel doesn't handle real interrupts, so report all zeroes. - // TODO(b/37226836): We could count page faults as #PF. - fmt.Fprintf(buf, "intr 0") // total - for i := 0; i < numInterrupts; i++ { - fmt.Fprintf(buf, " 0") - } - fmt.Fprintf(buf, "\n") - - // Total number of context switches. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "ctxt 0\n") - - // CLOCK_REALTIME timestamp from boot, in seconds. - fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) - - // Total number of clones. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "processes 0\n") - - // Number of runnable tasks. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "procs_running 0\n") - - // Number of tasks waiting on IO. - // TODO(b/37226836): Count this. - fmt.Fprintf(buf, "procs_blocked 0\n") - - // Number of each softirq handled. - fmt.Fprintf(buf, "softirq 0") // total - for i := 0; i < linux.NumSoftIRQ; i++ { - fmt.Fprintf(buf, " 0") - } - fmt.Fprintf(buf, "\n") - return nil -} diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go deleted file mode 100644 index b88256e12..000000000 --- a/pkg/sentry/fsimpl/proc/sys.go +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// mmapMinAddrData implements vfs.DynamicBytesSource for -// /proc/sys/vm/mmap_min_addr. -// -// +stateify savable -type mmapMinAddrData struct { - k *kernel.Kernel -} - -var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) - return nil -} - -// +stateify savable -type overcommitMemory struct{} - -var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "0\n") - return nil -} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 11a64c777..5a384817f 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -50,15 +50,15 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames //"fd": newFdDir(t, msrc), //"fdinfo": newFdInfoDir(t, msrc), //"gid_map": newGIDMap(t, msrc), - "io": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &mapsData{task: task}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), //"ns": newNamespaceDir(t, msrc), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statmData{t: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), defaultPermission, &statusData{t: task, pidns: pidns}), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}), //"uid_map": newUIDMap(t, msrc), } if isThreadGroup { diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index d8f92d52f..72315d25c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "sort" "strconv" @@ -28,9 +29,8 @@ import ( ) const ( - defaultPermission = 0444 - selfName = "self" - threadSelfName = "thread-self" + selfName = "self" + threadSelfName = "thread-self" ) // InoGenerator generates unique inode numbers for a given filesystem. @@ -61,15 +61,15 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - //"cpuinfo": newCPUInfo(ctx, msrc), - //"filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), - "loadavg": newDentry(root, inoGen.NextIno(), defaultPermission, &loadavgData{}), - "meminfo": newDentry(root, inoGen.NextIno(), defaultPermission, &meminfoData{k: k}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), defaultPermission, "self/mounts"), - "stat": newDentry(root, inoGen.NextIno(), defaultPermission, &statData{k: k}), - //"uptime": newUptime(ctx, msrc), - //"version": newVersionData(root, inoGen.NextIno(), k), - "version": newDentry(root, inoGen.NextIno(), defaultPermission, &versionData{k: k}), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), + //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), + "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), + "sys": newSysDir(root, inoGen), + "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ @@ -216,3 +216,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { return stat } + +func cpuInfoData(k *kernel.Kernel) string { + features := k.FeatureSet() + if features == nil { + // Kernel is always initialized with a FeatureSet. + panic("cpuinfo read with nil FeatureSet") + } + var buf bytes.Buffer + for i, max := uint(0), k.ApplicationCores(); i < max; i++ { + features.WriteCPUInfoTo(i, &buf) + } + return buf.String() +} + +func shmData(v uint64) dynamicInode { + return newStaticFile(strconv.FormatUint(v, 10)) +} diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 91f30a798..ad3760e39 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "fmt" "strconv" @@ -23,6 +24,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -90,3 +94,244 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } + +// cpuStats contains the breakdown of CPU time for /proc/stat. +type cpuStats struct { + // user is time spent in userspace tasks with non-positive niceness. + user uint64 + + // nice is time spent in userspace tasks with positive niceness. + nice uint64 + + // system is time spent in non-interrupt kernel context. + system uint64 + + // idle is time spent idle. + idle uint64 + + // ioWait is time spent waiting for IO. + ioWait uint64 + + // irq is time spent in interrupt context. + irq uint64 + + // softirq is time spent in software interrupt context. + softirq uint64 + + // steal is involuntary wait time. + steal uint64 + + // guest is time spent in guests with non-positive niceness. + guest uint64 + + // guestNice is time spent in guests with positive niceness. + guestNice uint64 +} + +// String implements fmt.Stringer. +func (c cpuStats) String() string { + return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) +} + +// statData implements vfs.DynamicBytesSource for /proc/stat. +// +// +stateify savable +type statData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*statData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/37226836): We currently export only zero CPU stats. We could + // at least provide some aggregate stats. + var cpu cpuStats + fmt.Fprintf(buf, "cpu %s\n", cpu) + + for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) + } + + // The total number of interrupts is dependent on the CPUs and PCI + // devices on the system. See arch_probe_nr_irqs. + // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO(b/37226836): We could count page faults as #PF. + fmt.Fprintf(buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + + // Total number of context switches. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "processes 0\n") + + // Number of runnable tasks. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO(b/37226836): Count this. + fmt.Fprintf(buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(buf, " 0") + } + fmt.Fprintf(buf, "\n") + return nil +} + +// loadavgData backs /proc/loadavg. +// +// +stateify savable +type loadavgData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*loadavgData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/62345059): Include real data in fields. + // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. + // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + return nil +} + +// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. +// +// +stateify savable +type meminfoData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*meminfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + mf := d.k.MemoryFile() + mf.UpdateUsage() + snapshot, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + anon := snapshot.Anonymous + snapshot.Tmpfs + file := snapshot.PageCache + snapshot.Mapped + // We don't actually have active/inactive LRUs, so just make up numbers. + activeFile := (file / 2) &^ (usermem.PageSize - 1) + inactiveFile := file - activeFile + + fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) + memFree := (totalSize - totalUsage) / 1024 + // We use MemFree as MemAvailable because we don't swap. + // TODO(rahat): When reclaim is implemented the value of MemAvailable + // should change. + fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) + fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices + fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) + // Emulate a system with no swap, which disables inactivation of anon pages. + fmt.Fprintf(buf, "SwapCache: 0 kB\n") + fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) + fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") + fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) + fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) + fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) + fmt.Fprintf(buf, "SwapTotal: 0 kB\n") + fmt.Fprintf(buf, "SwapFree: 0 kB\n") + fmt.Fprintf(buf, "Dirty: 0 kB\n") + fmt.Fprintf(buf, "Writeback: 0 kB\n") + fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) + fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know + fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) + return nil +} + +// uptimeData implements vfs.DynamicBytesSource for /proc/uptime. +// +// +stateify savable +type uptimeData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*uptimeData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + now := time.NowFromContext(ctx) + + // Pretend that we've spent zero time sleeping (second number). + fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds()) + return nil +} + +// versionData implements vfs.DynamicBytesSource for /proc/version. +// +// +stateify savable +type versionData struct { + kernfs.DynamicBytesFile + + // k is the owning Kernel. + k *kernel.Kernel +} + +var _ dynamicInode = (*versionData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + init := v.k.GlobalInit() + if init == nil { + // Attempted to read before the init Task is created. This can + // only occur during startup, which should never need to read + // this file. + panic("Attempted to read version before initial Task is available") + } + + // /proc/version takes the form: + // + // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) + // (COMPILER_VERSION) VERSION" + // + // where: + // - SYSNAME, RELEASE, and VERSION are the same as returned by + // sys_utsname + // - COMPILE_USER is the user that build the kernel + // - COMPILE_HOST is the hostname of the machine on which the kernel + // was built + // - COMPILER_VERSION is the version reported by the building compiler + // + // Since we don't really want to expose build information to + // applications, those fields are omitted. + // + // FIXME(mpratt): Using Version from the init task SyscallTable + // disregards the different version a task may have (e.g., in a uts + // namespace). + ver := init.Leader().SyscallTable().Version + fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go new file mode 100644 index 000000000..06dc43c26 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -0,0 +1,337 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. +// +// +stateify savable +type ifinet6 struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*ifinet6)(nil) + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.s.Interfaces() + for id, naddrs := range n.s.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { + for _, l := range n.contents() { + buf.WriteString(l) + } + return nil +} + +// netDev implements vfs.DynamicBytesSource for /proc/net/dev. +// +// +stateify savable +type netDev struct { + s inet.Stack +} + +var _ vfs.DynamicBytesSource = (*netDev)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.s.Interfaces() + buf.WriteString("Inter-| Receive | Transmit\n") + buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") + + for _, i := range interfaces { + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. + var stats inet.StatDev + if err := n.s.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + fmt.Fprintf( + buf, + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast + // Transmitted + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15], // multicast + ) + } + + return nil +} + +// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. +// +// +stateify savable +type netUnix struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netUnix)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() + // Not a unix socket. + continue + } + sops := sfile.FileOperations.(*unix.SocketOperations) + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + addr.Addr = "" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. + // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // In gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + sfile.ReadRefs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sops.State(), // State. + sfile.InodeID(), // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. + fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCP struct { + k *kernel.Kernel +} + +var _ vfs.DynamicBytesSource = (*netTCP)(nil) + +func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { + t := kernel.TaskFromContext(ctx) + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + for _, se := range n.k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). + // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(buf, "%4d: ", se.ID) + + portBuf := make([]byte, 2) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, localAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(localAddr.Addr[:]), + portBuf) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) + fmt.Fprintf(buf, "%08X:%04X ", + binary.LittleEndian.Uint32(remoteAddr.Addr[:]), + portBuf) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while deferencing + // the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(buf, "%d", -1) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go new file mode 100644 index 000000000..aabf2bf0c --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// newSysDir returns the dentry corresponding to /proc/sys directory. +func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), + "shmall": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)), + "shmmax": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)), + "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), + }), + "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}), + "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), + }), + "net": newSysNetDir(root, inoGen), + }) +} + +// newSysNetDir returns the dentry corresponding to /proc/sys/net directory. +func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + // Add tcp_sack. + // TODO(gvisor.dev/issue/1195): tcp_sack allows write(2) + // "tcp_sack": newTCPSackInode(ctx, msrc, s), + + // The following files are simple stubs until they are implemented in + // netstack, most of these files are configuration related. We use the + // value closest to the actual netstack behavior or any empty file, all + // of these files will have mode 0444 (read-only for all users). + "ip_local_port_range": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "ipfrag_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")), + "ip_nonlocal_bind": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "ip_no_pmtu_disc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + + // tcp_allowed_congestion_control tell the user what they are able to + // do as an unprivledged process so we leave it empty. + "tcp_allowed_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + "tcp_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + + // Many of the following stub files are features netstack doesn't + // support. The unsupported features return "0" to indicate they are + // disabled. + "tcp_base_mss": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")), + "tcp_dsack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_early_retrans": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen_key": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), + "tcp_invalid_ratelimit": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_intvl": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_probes": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")), + "tcp_mtu_probing": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_no_metrics_save": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_probe_interval": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_probe_threshold": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "tcp_retries1": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), + "tcp_retries2": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")), + "tcp_rfc1337": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_synack_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), + "tcp_syn_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), + "tcp_timestamps": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + }), + "core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")), + "message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")), + "message_cost": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), + "optmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), + "rmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "rmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "somaxconn": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")), + "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + }), + }), + }) +} + +// mmapMinAddrData implements vfs.DynamicBytesSource for +// /proc/sys/vm/mmap_min_addr. +// +// +stateify savable +type mmapMinAddrData struct { + kernfs.DynamicBytesFile + + k *kernel.Kernel +} + +var _ dynamicInode = (*mmapMinAddrData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) + return nil +} + +// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. +// +// +stateify savable +type hostnameData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*hostnameData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { + utsns := kernel.UTSNamespaceFromContext(ctx) + buf.WriteString(utsns.HostName()) + buf.WriteString("\n") + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go new file mode 100644 index 000000000..20a77a8ca --- /dev/null +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -0,0 +1,78 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "reflect" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{s: newIPv6TestStack()} + var buf bytes.Buffer + n.Generate(contexttest.Context(t), &buf) + if buf.Len() > 0 { + t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{}) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{s: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index ca8c87ec2..76eafe593 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -69,12 +69,15 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ + "cpuinfo": {Type: linux.DT_REG}, "loadavg": {Type: linux.DT_REG}, "meminfo": {Type: linux.DT_REG}, "mounts": {Type: linux.DT_LNK}, "self": selfLink, "stat": {Type: linux.DT_REG}, + "sys": {Type: linux.DT_DIR}, "thread-self": threadSelfLink, + "uptime": {Type: linux.DT_REG}, "version": {Type: linux.DT_REG}, } return checkFiles(gots, wants) diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go deleted file mode 100644 index 367f2396b..000000000 --- a/pkg/sentry/fsimpl/proc/version.go +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "bytes" - "fmt" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// versionData implements vfs.DynamicBytesSource for /proc/version. -// -// +stateify savable -type versionData struct { - kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel -} - -var _ dynamicInode = (*versionData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { - init := v.k.GlobalInit() - if init == nil { - // Attempted to read before the init Task is created. This can - // only occur during startup, which should never need to read - // this file. - panic("Attempted to read version before initial Task is available") - } - - // /proc/version takes the form: - // - // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) - // (COMPILER_VERSION) VERSION" - // - // where: - // - SYSNAME, RELEASE, and VERSION are the same as returned by - // sys_utsname - // - COMPILE_USER is the user that build the kernel - // - COMPILE_HOST is the hostname of the machine on which the kernel - // was built - // - COMPILER_VERSION is the version reported by the building compiler - // - // Since we don't really want to expose build information to - // applications, those fields are omitted. - // - // FIXME(mpratt): Using Version from the init task SyscallTable - // disregards the different version a task may have (e.g., in a uts - // namespace). - ver := init.Leader().SyscallTable().Version - fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) - return nil -} -- cgit v1.2.3 From 3dd3275da7b665cf2ca297e4bf566fcc77025af8 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 16 Jan 2020 13:13:22 -0800 Subject: Add more files to /proc/[pid]/* Files not implemented require VFSv2 plumbing into the kernel. Also, cgroup is not implemented yet. Updates #1195 PiperOrigin-RevId: 290129176 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 63 ++---- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 16 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 21 +- pkg/sentry/fsimpl/proc/BUILD | 3 +- pkg/sentry/fsimpl/proc/mounts.go | 33 --- pkg/sentry/fsimpl/proc/subtasks.go | 126 +++++++++++ pkg/sentry/fsimpl/proc/task.go | 69 ++++-- pkg/sentry/fsimpl/proc/task_files.go | 315 +++++++++++++++++++++++++--- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 20 +- pkg/sentry/vfs/permissions.go | 24 ++- 12 files changed, 549 insertions(+), 145 deletions(-) delete mode 100644 pkg/sentry/fsimpl/proc/mounts.go create mode 100644 pkg/sentry/fsimpl/proc/subtasks.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 79759e0fc..a4600ad47 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -40,7 +39,7 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP return nil, syserror.ENOTDIR } // Directory searchable? - if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -182,8 +181,8 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // // Preconditions: Filesystem.mu must be locked for at least reading. parentInode // == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true. -func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { +func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return "", err } pc := rp.Component() @@ -206,7 +205,7 @@ func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInod // checkDeleteLocked checks that the file represented by vfsd may be deleted. // // Preconditions: Filesystem.mu must be locked for at least reading. -func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { +func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { parentVFSD := vfsd.Parent() if parentVFSD == nil { return syserror.EBUSY @@ -214,36 +213,12 @@ func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { if parentVFSD.IsDisowned() { return syserror.ENOENT } - if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil } -// checkRenameLocked checks that a rename operation may be performed on the -// target dentry across the given set of parent directories. The target dentry -// may be nil. -// -// Precondition: isDir(dstInode) == true. -func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error { - srcDir := src.Parent() - if srcDir == nil { - return syserror.EBUSY - } - if srcDir.IsDisowned() { - return syserror.ENOENT - } - if dstDir.IsDisowned() { - return syserror.ENOENT - } - // Check for creation permissions on dst dir. - if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { - return err - } - - return nil -} - // Release implements vfs.FilesystemImpl.Release. func (fs *Filesystem) Release() { } @@ -269,7 +244,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -302,7 +277,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -339,7 +314,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -367,7 +342,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -401,7 +376,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return nil, err } - if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return inode.Open(rp, vfsd, opts.Flags) @@ -420,7 +395,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return inode.Open(rp, vfsd, opts.Flags) @@ -432,7 +407,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -450,7 +425,7 @@ afterTrailingSymlink: } if childVFSD == nil { // Already checked for searchability above; now check for writability. - if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -485,7 +460,7 @@ afterTrailingSymlink: goto afterTrailingSymlink } } - if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil { + if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } return childInode.Open(rp, childVFSD, opts.Flags) @@ -545,13 +520,13 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa srcVFSD := &src.vfsd // Can we remove the src dentry? - if err := checkDeleteLocked(rp, srcVFSD); err != nil { + if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil { return err } // Can we create the dst dentry? var dstVFSD *vfs.Dentry - pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode) + pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) switch err { case nil: // Ok, continue with rename as replacement. @@ -607,7 +582,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } if !vfsd.Impl().(*Dentry).isDir() { @@ -683,7 +658,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ if err != nil { return err } - pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode) if err != nil { return err } @@ -712,7 +687,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } defer rp.Mount().EndWrite() - if err := checkDeleteLocked(rp, vfsd); err != nil { + if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } if vfsd.Impl().(*Dentry).isDir() { diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 6aff3d39a..1700fffd9 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -262,7 +262,7 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { } // CheckPermissions implements Inode.CheckPermissions. -func (a *InodeAttrs) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { +func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := a.Mode() return vfs.GenericCheckPermissions( creds, @@ -527,12 +527,8 @@ var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { - if perm&^linux.PermissionsMask != 0 { - panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) - } - inode := &StaticDirectory{} - inode.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + inode.Init(creds, ino, perm) dentry := &Dentry{} dentry.Init(inode) @@ -544,6 +540,14 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil return dentry } +// Init initializes StaticDirectory. +func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) +} + // Open implements kernfs.Inode. func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &GenericDirectoryFD{} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index bb12f39a2..85bcdcc57 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -320,7 +320,7 @@ type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). - CheckPermissions(creds *auth.Credentials, atx vfs.AccessTypes) error + CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 068063f4e..f19f12854 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -20,7 +20,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) -type staticSymlink struct { +// StaticSymlink provides an Inode implementation for symlinks that point to +// a immutable target. +type StaticSymlink struct { InodeAttrs InodeNoopRefCount InodeSymlink @@ -28,18 +30,25 @@ type staticSymlink struct { target string } -var _ Inode = (*staticSymlink)(nil) +var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, target string) *Dentry { - inode := &staticSymlink{target: target} - inode.Init(creds, ino, linux.ModeSymlink|perm) +func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry { + inode := &StaticSymlink{} + inode.Init(creds, ino, target) d := &Dentry{} d.Init(inode) return d } -func (s *staticSymlink) Readlink(_ context.Context) (string, error) { +// Init initializes the instance. +func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) { + s.target = target + s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777) +} + +// Readlink implements Inode. +func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 6cd18cec8..e92564b5d 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -7,7 +7,7 @@ go_library( name = "proc", srcs = [ "filesystem.go", - "mounts.go", + "subtasks.go", "task.go", "task_files.go", "tasks.go", @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", + "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go deleted file mode 100644 index 8683cf677..000000000 --- a/pkg/sentry/fsimpl/proc/mounts.go +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import "gvisor.dev/gvisor/pkg/sentry/kernel" - -// TODO(gvisor.dev/issue/1195): Implement mountInfoFile and mountsFile. - -// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo. -// -// +stateify savable -type mountInfoFile struct { - t *kernel.Task -} - -// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts. -// -// +stateify savable -type mountsFile struct { - t *kernel.Task -} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go new file mode 100644 index 000000000..8892c5a11 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -0,0 +1,126 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// subtasksInode represents the inode for /proc/[pid]/task/ directory. +// +// +stateify savable +type subtasksInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + + task *kernel.Task + pidns *kernel.PIDNamespace + inoGen InoGenerator +} + +var _ kernfs.Inode = (*subtasksInode)(nil) + +func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry { + subInode := &subtasksInode{ + task: task, + pidns: pidns, + inoGen: inoGen, + } + // Note: credentials are overridden by taskOwnedInode. + subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + inode := &taskOwnedInode{Inode: subInode, owner: task} + dentry := &kernfs.Dentry{} + dentry.Init(inode) + + return dentry +} + +// Valid implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) Valid(ctx context.Context) bool { + return true +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + tid, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) + if subTask == nil { + return nil, syserror.ENOENT + } + if subTask.ThreadGroup() != i.task.ThreadGroup() { + return nil, syserror.ENOENT + } + + subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false) + return subTaskDentry.VFSDentry(), nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + tasks := i.task.ThreadGroup().MemberIDs(i.pidns) + if len(tasks) == 0 { + return offset, syserror.ENOENT + } + + tids := make([]int, 0, len(tasks)) + for _, tid := range tasks { + tids = append(tids, int(tid)) + } + + sort.Ints(tids) + for _, tid := range tids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(tid), 10), + Type: linux.DT_DIR, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if !cb.Handle(dirent) { + return offset, nil + } + offset++ + } + return offset, nil +} + +// Open implements kernfs.Inode. +func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +// Stat implements kernfs.Inode. +func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { + stat := i.InodeAttrs.Stat(vsfs) + stat.Nlink += uint32(i.task.ThreadGroup().Count()) + return stat +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 5a384817f..621c17cfe 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,6 +15,8 @@ package proc import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -42,27 +44,31 @@ var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ - //"auxv": newAuxvec(t, msrc), - //"cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - //"comm": newComm(t, msrc), - //"environ": newExecArgInode(t, msrc, environExecArg), + "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), + "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": newComm(task, inoGen.NextIno(), 0444), + "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), //"exe": newExe(t, msrc), //"fd": newFdDir(t, msrc), //"fdinfo": newFdInfoDir(t, msrc), - //"gid_map": newGIDMap(t, msrc), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), + "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - //"ns": newNamespaceDir(t, msrc), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{t: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{t: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{t: task, pidns: pidns}), - //"uid_map": newUIDMap(t, msrc), + "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), + "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), + "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), + }), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - //contents["task"] = p.newSubtasks(t, msrc) + contents["task"] = newSubtasks(task, pidns, inoGen) } //if len(p.cgroupControllers) > 0 { // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) @@ -127,6 +133,23 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode return d } +func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { + dir := &kernfs.StaticDirectory{} + + // Note: credentials are overridden by taskOwnedInode. + dir.Init(task.Credentials(), ino, perm) + + inode := &taskOwnedInode{Inode: dir, owner: task} + d := &kernfs.Dentry{} + d.Init(inode) + + dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + links := dir.OrderedChildren.Populate(d, children) + dir.IncLinks(links) + + return d +} + // Stat implements kernfs.Inode. func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { stat := i.Inode.Stat(fs) @@ -137,7 +160,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { } // CheckPermissions implements kernfs.Inode. -func (i *taskOwnedInode) CheckPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { +func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) return vfs.GenericCheckPermissions( @@ -188,3 +211,19 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData { } return &ioData{ioUsage: t} } + +func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. + target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &kernfs.StaticSymlink{} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 93f0e1aa8..7bc352ae9 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -17,15 +17,20 @@ package proc import ( "bytes" "fmt" + "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // mm gets the kernel task's MemoryManager. No additional reference is taken on @@ -41,6 +46,256 @@ func getMM(task *kernel.Task) *mm.MemoryManager { return tmm } +// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the +// MemoryManager's users count is incremented, and must be decremented by the +// caller when it is no longer in use. +func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { + if task.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + var m *mm.MemoryManager + task.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + if m == nil || !m.IncUsers() { + return nil, io.EOF + } + return m, nil +} + +type bufferWriter struct { + buf *bytes.Buffer +} + +// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns +// the number of bytes written. It may return a partial write without an +// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not +// return a full write with an error (i.e. srcs.NumBytes(), err) where err +// != nil). +func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + written := srcs.NumBytes() + for !srcs.IsEmpty() { + w.buf.Write(srcs.Head().ToSlice()) + srcs = srcs.Tail() + } + return written, nil +} + +// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv. +// +// +stateify savable +type auxvData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*auxvData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + // Space for buffer with AT_NULL (0) terminator at the end. + auxv := m.Auxv() + buf.Grow((len(auxv) + 1) * 16) + for _, e := range auxv { + var tmp [8]byte + usermem.ByteOrder.PutUint64(tmp[:], e.Key) + buf.Write(tmp[:]) + + usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value)) + buf.Write(tmp[:]) + } + return nil +} + +// execArgType enumerates the types of exec arguments that are exposed through +// proc. +type execArgType int + +const ( + cmdlineDataArg execArgType = iota + environDataArg +) + +// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline. +// +// +stateify savable +type cmdlineData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + + // arg is the type of exec argument this file contains. + arg execArgType +} + +var _ dynamicInode = (*cmdlineData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { + m, err := getMMIncRef(d.task) + if err != nil { + return err + } + defer m.DecUsers(ctx) + + // Figure out the bounds of the exec arg we are trying to read. + var ar usermem.AddrRange + switch d.arg { + case cmdlineDataArg: + ar = usermem.AddrRange{ + Start: m.ArgvStart(), + End: m.ArgvEnd(), + } + case environDataArg: + ar = usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + default: + panic(fmt.Sprintf("unknown exec arg type %v", d.arg)) + } + if ar.Start == 0 || ar.End == 0 { + // Don't attempt to read before the start/end are set up. + return io.EOF + } + + // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true + // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading + // cmdline and environment"). + writer := &bufferWriter{buf: buf} + if n, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil { + // Nothing to copy or something went wrong. + return err + } + + // On Linux, if the NULL byte at the end of the argument vector has been + // overwritten, it continues reading the environment vector as part of + // the argument vector. + if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 { + if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 { + // If we found a NULL character somewhere else in argv, truncate the + // return up to the NULL terminator (including it). + buf.Truncate(end) + return nil + } + + // There is no NULL terminator in the string, return into envp. + arEnvv := usermem.AddrRange{ + Start: m.EnvvStart(), + End: m.EnvvEnd(), + } + + // Upstream limits the returned amount to one page of slop. + // https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208 + // we'll return one page total between argv and envp because of the + // above page restrictions. + if buf.Len() >= usermem.PageSize { + // Returned at least one page already, nothing else to add. + return nil + } + remaining := usermem.PageSize - buf.Len() + if int(arEnvv.Length()) > remaining { + end, ok := arEnvv.Start.AddLength(uint64(remaining)) + if !ok { + return syserror.EFAULT + } + arEnvv.End = end + } + if _, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil { + return err + } + + // Linux will return envp up to and including the first NULL character, + // so find it. + if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 { + buf.Truncate(end) + } + } + + return nil +} + +// +stateify savable +type commInode struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { + inode := &commInode{task: task} + inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + // This file can always be read or written by members of the same thread + // group. See fs/proc/base.c:proc_tid_comm_permission. + // + // N.B. This check is currently a no-op as we don't yet support writing and + // this file is world-readable anyways. + t := kernel.TaskFromContext(ctx) + if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() { + return nil + } + + return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats) +} + +// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm. +// +// +stateify savable +type commData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*commData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(d.task.Name()) + buf.WriteString("\n") + return nil +} + +// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}. +// +// +stateify savable +type idMapData struct { + kernfs.DynamicBytesFile + + task *kernel.Task + gids bool +} + +var _ dynamicInode = (*idMapData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { + var entries []auth.IDMapEntry + if d.gids { + entries = d.task.UserNamespace().GIDMap() + } else { + entries = d.task.UserNamespace().UIDMap() + } + for _, e := range entries { + fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + } + return nil +} + // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable @@ -83,7 +338,7 @@ func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error { type taskStatData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task // If tgstats is true, accumulate fault stats (not implemented) and CPU // time across all tasks in t's thread group. @@ -98,40 +353,40 @@ var _ dynamicInode = (*taskStatData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t)) - fmt.Fprintf(buf, "(%s) ", s.t.Name()) - fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0]) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task)) + fmt.Fprintf(buf, "(%s) ", s.task.Name()) + fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0]) ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { + if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "%d ", ppid) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) - fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup())) + fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session())) fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */) fmt.Fprintf(buf, "0 " /* flags */) fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) var cputime usage.CPUStats if s.tgstats { - cputime = s.t.ThreadGroup().CPUStats() + cputime = s.task.ThreadGroup().CPUStats() } else { - cputime = s.t.CPUStats() + cputime = s.task.CPUStats() } fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - cputime = s.t.ThreadGroup().JoinedChildCPUStats() + cputime = s.task.ThreadGroup().JoinedChildCPUStats() fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) - fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness()) - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count()) + fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness()) + fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count()) // itrealvalue. Since kernel 2.6.17, this field is no longer // maintained, and is hard coded as 0. fmt.Fprintf(buf, "0 ") // Start time is relative to boot time, expressed in clock ticks. - fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime()))) + fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime()))) var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() @@ -140,14 +395,14 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize) // rsslim. - fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur) + fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur) fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */) fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) fmt.Fprintf(buf, "0 0 " /* nswap cnswap */) terminationSignal := linux.Signal(0) - if s.t == s.t.ThreadGroup().Leader() { - terminationSignal = s.t.ThreadGroup().TerminationSignal() + if s.task == s.task.ThreadGroup().Leader() { + terminationSignal = s.task.ThreadGroup().TerminationSignal() } fmt.Fprintf(buf, "%d ", terminationSignal) fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */) @@ -164,7 +419,7 @@ func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { type statmData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task } var _ dynamicInode = (*statmData)(nil) @@ -172,7 +427,7 @@ var _ dynamicInode = (*statmData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { var vss, rss uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if mm := t.MemoryManager(); mm != nil { vss = mm.VirtualMemorySize() rss = mm.ResidentSetSize() @@ -189,7 +444,7 @@ func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error { type statusData struct { kernfs.DynamicBytesFile - t *kernel.Task + task *kernel.Task pidns *kernel.PIDNamespace } @@ -197,23 +452,23 @@ var _ dynamicInode = (*statusData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name()) - fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus()) - fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) - fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name()) + fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus()) + fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup())) + fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task)) ppid := kernel.ThreadID(0) - if parent := s.t.Parent(); parent != nil { + if parent := s.task.Parent(); parent != nil { ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) } fmt.Fprintf(buf, "PPid:\t%d\n", ppid) tpid := kernel.ThreadID(0) - if tracer := s.t.Tracer(); tracer != nil { + if tracer := s.task.Tracer(); tracer != nil { tpid = s.pidns.IDOfTask(tracer) } fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid) var fds int var vss, rss, data uint64 - s.t.WithMuLocked(func(t *kernel.Task) { + s.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { fds = fdTable.Size() } @@ -227,13 +482,13 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10) fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10) fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10) - fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) - creds := s.t.Credentials() + fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count()) + creds := s.task.Credentials() fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps) fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps) fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps) fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps) - fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode()) // We unconditionally report a single NUMA node. See // pkg/sentry/syscalls/linux/sys_mempolicy.go. fmt.Fprintf(buf, "Mems_allowed:\t1\n") diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 72315d25c..a97b1753a 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -66,7 +66,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), "sys": newSysDir(root, inoGen), "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), 0777, "self/mounts"), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 76eafe593..6b58c16b9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -85,12 +85,20 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ - "io": {Type: linux.DT_REG}, - "maps": {Type: linux.DT_REG}, - "smaps": {Type: linux.DT_REG}, - "stat": {Type: linux.DT_REG}, - "statm": {Type: linux.DT_REG}, - "status": {Type: linux.DT_REG}, + "auxv": {Type: linux.DT_REG}, + "cmdline": {Type: linux.DT_REG}, + "comm": {Type: linux.DT_REG}, + "environ": {Type: linux.DT_REG}, + "gid_map": {Type: linux.DT_REG}, + "io": {Type: linux.DT_REG}, + "maps": {Type: linux.DT_REG}, + "ns": {Type: linux.DT_DIR}, + "smaps": {Type: linux.DT_REG}, + "stat": {Type: linux.DT_REG}, + "statm": {Type: linux.DT_REG}, + "status": {Type: linux.DT_REG}, + "task": {Type: linux.DT_DIR}, + "uid_map": {Type: linux.DT_REG}, } return checkFiles(gots, wants) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f1edb0680..d279d05ca 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -30,6 +30,26 @@ const ( MayExec = 1 ) +// OnlyRead returns true if access _only_ allows read. +func (a AccessTypes) OnlyRead() bool { + return a == MayRead +} + +// MayRead returns true if access allows read. +func (a AccessTypes) MayRead() bool { + return a&MayRead != 0 +} + +// MayWrite returns true if access allows write. +func (a AccessTypes) MayWrite() bool { + return a&MayWrite != 0 +} + +// MayExec returns true if access allows exec. +func (a AccessTypes) MayExec() bool { + return a&MayExec != 0 +} + // GenericCheckPermissions checks that creds has the given access rights on a // file with the given permissions, UID, and GID, subject to the rules of // fs/namei.c:generic_permission(). isDir is true if the file is a directory. @@ -53,7 +73,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. - if (isDir && (ats&MayWrite == 0)) || ats == MayRead { + if (isDir && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } @@ -61,7 +81,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. - if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { + if isDir || !ats.MayExec() || (mode&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } -- cgit v1.2.3 From 70d7c52bd7583393d39177a7935cca57372d67f1 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 16 Jan 2020 13:58:25 -0800 Subject: Implement tmpfs.SetStat with a size argument. This is similar to 'Truncate' in vfs1. Updates https://github.com/google/gvisor/issues/1197 PiperOrigin-RevId: 290139140 --- pkg/sentry/fsimpl/tmpfs/regular_file.go | 35 ++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 121 +++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 54 ++++++++++-- 3 files changed, 205 insertions(+), 5 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index f200e767d..5fa70cc6d 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -63,6 +63,41 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod return &file.inode } +// truncate grows or shrinks the file to the given size. It returns true if the +// file size was updated. +func (rf *regularFile) truncate(size uint64) (bool, error) { + rf.mu.Lock() + defer rf.mu.Unlock() + + if size == rf.size { + // Nothing to do. + return false, nil + } + + if size > rf.size { + // Growing the file. + if rf.seals&linux.F_SEAL_GROW != 0 { + // Seal does not allow growth. + return false, syserror.EPERM + } + rf.size = size + return true, nil + } + + // Shrinking the file + if rf.seals&linux.F_SEAL_SHRINK != 0 { + // Seal does not allow shrink. + return false, syserror.EPERM + } + + // TODO(gvisor.dev/issues/1197): Invalidate mappings once we have + // mappings. + + rf.data.Truncate(size, rf.memFile) + rf.size = size + return true, nil +} + type regularFileFD struct { fileDescription diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 7b0a962f0..034a29fdb 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -313,3 +313,124 @@ func TestPRead(t *testing.T) { } } } + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + // Fill the file with some data. + data := bytes.Repeat([]byte("gVisor is awsome"), 100) + written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatalf("fd.Write failed: %v", err) + } + + // Size should be same as written. + sizeStatOpts := vfs.StatOptions{Mask: linux.STATX_SIZE} + stat, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := int64(stat.Size), written; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + + // Truncate down. + newSize := uint64(10) + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. + statAfterTruncateDown, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateDown.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should only read newSize worth of data. + buf := make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateDown.Mtime.ToNsec(); got <= stat.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, stat.Mtime) + } + if got := statAfterTruncateDown.Ctime.ToNsec(); got <= stat.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate up. + newSize = 100 + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + // Size should be updated. + statAfterTruncateUp, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + if got, want := statAfterTruncateUp.Size, newSize; got != want { + t.Errorf("fd.Stat got size %d, want %d", got, want) + } + // We should read newSize worth of data. + buf = make([]byte, 1000) + if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF { + t.Fatalf("fd.PRead failed: %v", err) + } else if uint64(n) != newSize { + t.Errorf("fd.PRead got size %d, want %d", n, newSize) + } + // Bytes should be null after 10, since we previously truncated to 10. + for i := uint64(10); i < newSize; i++ { + if buf[i] != 0 { + t.Errorf("fd.PRead got byte %d=%x, want 0", i, buf[i]) + break + } + } + // Mtime and Ctime should be bumped. + if got := statAfterTruncateUp.Mtime.ToNsec(); got <= statAfterTruncateDown.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want > %v", got, statAfterTruncateDown.Mtime) + } + if got := statAfterTruncateUp.Ctime.ToNsec(); got <= statAfterTruncateDown.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime) + } + + // Truncate to the current size. + newSize = statAfterTruncateUp.Size + if err := fd.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: newSize, + }, + }); err != nil { + t.Errorf("fd.Truncate failed: %v", err) + } + statAfterTruncateNoop, err := fd.Stat(ctx, sizeStatOpts) + if err != nil { + t.Fatalf("fd.Stat failed: %v", err) + } + // Mtime and Ctime should not be bumped, since operation is a noop. + if got := statAfterTruncateNoop.Mtime.ToNsec(); got != statAfterTruncateUp.Mtime.ToNsec() { + t.Errorf("fd.Stat got Mtime %v, want %v", got, statAfterTruncateUp.Mtime) + } + if got := statAfterTruncateNoop.Ctime.ToNsec(); got != statAfterTruncateUp.Ctime.ToNsec() { + t.Errorf("fd.Stat got Ctime %v, want %v", got, statAfterTruncateUp.Ctime) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index d6960ee47..1d4889c89 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -35,6 +35,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FilesystemType implements vfs.FilesystemType. @@ -121,6 +122,9 @@ func (d *dentry) DecRef() { // inode represents a filesystem object. type inode struct { + // clock is a realtime clock used to set timestamps in file operations. + clock time.Clock + // refs is a reference count. refs is accessed using atomic memory // operations. // @@ -151,13 +155,14 @@ type inode struct { const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { - now := fs.clock.Now().Nanoseconds() + i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. + now := i.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now @@ -270,30 +275,69 @@ func (i *inode) statTo(stat *linux.Statx) { } func (i *inode) setStat(stat linux.Statx) error { - // TODO(gvisor.dev/issues/1197): Handle stat.Size by growing/shrinking - // the file. if stat.Mask == 0 { return nil } i.mu.Lock() + var ( + needsMtimeBump bool + needsCtimeBump bool + ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + needsCtimeBump = true } if mask&linux.STATX_UID != 0 { atomic.StoreUint32(&i.uid, stat.UID) + needsCtimeBump = true } if mask&linux.STATX_GID != 0 { atomic.StoreUint32(&i.gid, stat.GID) + needsCtimeBump = true + } + if mask&linux.STATX_SIZE != 0 { + switch impl := i.impl.(type) { + case *regularFile: + updated, err := impl.truncate(stat.Size) + if err != nil { + return err + } + if updated { + needsMtimeBump = true + needsCtimeBump = true + } + case *directory: + return syserror.EISDIR + case *symlink: + return syserror.EINVAL + case *namedPipe: + // Nothing. + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } } if mask&linux.STATX_ATIME != 0 { atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + needsCtimeBump = true + } + if mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + needsCtimeBump = true + // Ignore the mtime bump, since we just set it ourselves. + needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + // Ignore the ctime bump, since we just set it ourselves. + needsCtimeBump = false } - if mask&linux.STATX_MTIME != 0 { - atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + now := i.clock.Now().Nanoseconds() + if needsMtimeBump { + atomic.StoreInt64(&i.mtime, now) + } + if needsCtimeBump { + atomic.StoreInt64(&i.ctime, now) } i.mu.Unlock() return nil -- cgit v1.2.3 From ff9960985848a48863c01f91acd5b34d3e83a9c5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 17 Jan 2020 09:33:14 -0800 Subject: Add /proc/net/* files Updates #1195 PiperOrigin-RevId: 290285420 --- pkg/sentry/fsimpl/proc/BUILD | 2 +- pkg/sentry/fsimpl/proc/tasks.go | 1 + pkg/sentry/fsimpl/proc/tasks_net.go | 541 ++++++++++++++++++++++++++++--- pkg/sentry/fsimpl/proc/tasks_sys_test.go | 4 +- pkg/sentry/fsimpl/proc/tasks_test.go | 1 + 5 files changed, 499 insertions(+), 50 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index e92564b5d..f69aa19c4 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -18,7 +18,6 @@ go_library( importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = [ "//pkg/abi/linux", - "//pkg/binary", "//pkg/log", "//pkg/sentry/context", "//pkg/sentry/fs", @@ -37,6 +36,7 @@ go_library( "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/tcpip/header", ], ) diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index a97b1753a..5646c602a 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -67,6 +67,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "sys": newSysDir(root, inoGen), "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), + "net": newNetDir(root, inoGen, k), "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 06dc43c26..3dbf3ba41 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -17,33 +17,88 @@ package proc import ( "bytes" "fmt" + "io" + "reflect" + "time" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/header" ) +func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { + var contents map[string]*kernfs.Dentry + if stack := k.NetworkStack(); stack != nil { + const ( + arp = "IP address HW type Flags HW address Mask Device" + netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode" + packet = "sk RefCnt Type Proto Iface R Rmem User Inode" + protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em" + ptype = "Type Device Function" + upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode" + ) + psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + + contents = map[string]*kernfs.Dentry{ + "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + + // The following files are simple stubs until they are implemented in + // netstack, if the file contains a header the stub is just the header + // otherwise it is an empty file. + "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), + "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), + "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), + "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), + "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + + // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, + // high res timer ticks per sec (ClockGetres returns 1ns resolution). + "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), + "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), + "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + } + + if stack.SupportsIPv6() { + contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + } + } + + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) +} + // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. // // +stateify savable type ifinet6 struct { - s inet.Stack + kernfs.DynamicBytesFile + + stack inet.Stack } -var _ vfs.DynamicBytesSource = (*ifinet6)(nil) +var _ dynamicInode = (*ifinet6)(nil) func (n *ifinet6) contents() []string { var lines []string - nics := n.s.Interfaces() - for id, naddrs := range n.s.InterfaceAddrs() { + nics := n.stack.Interfaces() + for id, naddrs := range n.stack.InterfaceAddrs() { nic, ok := nics[id] if !ok { // NIC was added after NICNames was called. We'll just ignore it. @@ -77,18 +132,20 @@ func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netDev implements vfs.DynamicBytesSource for /proc/net/dev. +// netDevData implements vfs.DynamicBytesSource for /proc/net/dev. // // +stateify savable -type netDev struct { - s inet.Stack +type netDevData struct { + kernfs.DynamicBytesFile + + stack inet.Stack } -var _ vfs.DynamicBytesSource = (*netDev)(nil) +var _ dynamicInode = (*netDevData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { - interfaces := n.s.Interfaces() +func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.stack.Interfaces() buf.WriteString("Inter-| Receive | Transmit\n") buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") @@ -96,7 +153,7 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. var stats inet.StatDev - if err := n.s.Statistics(&stats, i.Name); err != nil { + if err := n.stack.Statistics(&stats, i.Name); err != nil { log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) continue } @@ -128,19 +185,21 @@ func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netUnix implements vfs.DynamicBytesSource for /proc/net/unix. +// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. // // +stateify savable -type netUnix struct { - k *kernel.Kernel +type netUnixData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel } -var _ vfs.DynamicBytesSource = (*netUnix)(nil) +var _ dynamicInode = (*netUnixData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") - for _, se := range n.k.ListSockets() { + for _, se := range n.kernel.ListSockets() { s := se.Sock.Get() if s == nil { log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) @@ -213,22 +272,72 @@ func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp. -// -// +stateify savable -type netTCP struct { - k *kernel.Kernel +func networkToHost16(n uint16) uint16 { + // n is in network byte order, so is big-endian. The most-significant byte + // should be stored in the lower address. + // + // We manually inline binary.BigEndian.Uint16() because Go does not support + // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to + // binary.BigEndian.Uint16() require a read of binary.BigEndian and an + // interface method call, defeating inlining. + buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} + return usermem.ByteOrder.Uint16(buf[:]) } -var _ vfs.DynamicBytesSource = (*netTCP)(nil) +func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { + switch family { + case linux.AF_INET: + var a linux.SockAddrInet + if i != nil { + a = *i.(*linux.SockAddrInet) + } + + // linux.SockAddrInet.Port is stored in the network byte order and is + // printed like a number in host byte order. Note that all numbers in host + // byte order are printed with the most-significant byte first when + // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. + port := networkToHost16(a.Port) + + // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order + // (i.e. most-significant byte in index 0). Linux represents this as a + // __be32 which is a typedef for an unsigned int, and is printed with + // %X. This means that for a little-endian machine, Linux prints the + // least-significant byte of the address first. To emulate this, we first + // invert the byte order for the address using usermem.ByteOrder.Uint32, + // which makes it have the equivalent encoding to a __be32 on a little + // endian machine. Note that this operation is a no-op on a big endian + // machine. Then similar to Linux, we format it with %X, which will print + // the most-significant byte of the __be32 address first, which is now + // actually the least-significant byte of the original address in + // linux.SockAddrInet.Addr on little endian machines, due to the conversion. + addr := usermem.ByteOrder.Uint32(a.Addr[:]) + + fmt.Fprintf(w, "%08X:%04X ", addr, port) + case linux.AF_INET6: + var a linux.SockAddrInet6 + if i != nil { + a = *i.(*linux.SockAddrInet6) + } -func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { + port := networkToHost16(a.Port) + addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4]) + addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8]) + addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12]) + addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16]) + fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) + } +} + +func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) - buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") - for _, se := range n.k.ListSockets() { + + for _, se := range k.ListSockets() { s := se.Sock.Get() if s == nil { - log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock) + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) continue } sfile := s.(*fs.File) @@ -236,7 +345,7 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } - if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) { + if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() // Not tcp4 sockets. continue @@ -250,27 +359,23 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: sl; entry number. fmt.Fprintf(buf, "%4d: ", se.ID) - portBuf := make([]byte, 2) - // Field: local_adddress. - var localAddr linux.SockAddrInet - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) + var localAddr linux.SockAddr + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = local + } } - binary.LittleEndian.PutUint16(portBuf, localAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(localAddr.Addr[:]), - portBuf) + writeInetAddr(buf, family, localAddr) // Field: rem_address. - var remoteAddr linux.SockAddrInet - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) + var remoteAddr linux.SockAddr + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = remote + } } - binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port) - fmt.Fprintf(buf, "%08X:%04X ", - binary.LittleEndian.Uint32(remoteAddr.Addr[:]), - portBuf) + writeInetAddr(buf, family, remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) @@ -293,7 +398,8 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) fmt.Fprintf(buf, "%5d ", 0) } else { - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow())) + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -335,3 +441,344 @@ func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } + +// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCPData)(nil) + +func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) +} + +// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. +// +// +stateify savable +type netTCP6Data struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCP6Data)(nil) + +func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) +} + +// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. +// +// +stateify savable +type netUDPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netUDPData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. + t := kernel.TaskFromContext(ctx) + + for _, se := range d.kernel.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { + s.DecRef() + // Not udp4 socket. + continue + } + + // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). + + // Field: sl; entry number. + fmt.Fprintf(buf, "%5d: ", se.ID) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &localAddr) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &remoteAddr) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when. Always 0 for UDP. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt. Always 0 for UDP. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + } + + // Field: timeout. Always 0 for UDP. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: ref; reference count on the socket inode. Don't count the ref + // we obtain while deferencing the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: drops; number of dropped packets. Unimplemented. + fmt.Fprintf(buf, "%d", 0) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. +// +// +stateify savable +type netSnmpData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netSnmpData)(nil) + +type snmpLine struct { + prefix string + header string +} + +var snmp = []snmpLine{ + { + prefix: "Ip", + header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", + }, + { + prefix: "Icmp", + header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", + }, + { + prefix: "IcmpMsg", + }, + { + prefix: "Tcp", + header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", + }, + { + prefix: "Udp", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, + { + prefix: "UdpLite", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, +} + +func toSlice(a interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(a)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + +func sprintSlice(s []uint64) string { + if len(s) == 0 { + return "" + } + r := fmt.Sprint(s) + return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. +} + +// Generate implements vfs.DynamicBytesSource. +func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { + types := []interface{}{ + &inet.StatSNMPIP{}, + &inet.StatSNMPICMP{}, + nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. + &inet.StatSNMPTCP{}, + &inet.StatSNMPUDP{}, + &inet.StatSNMPUDPLite{}, + } + for i, stat := range types { + line := snmp[i] + if stat == nil { + fmt.Fprintf(buf, "%s:\n", line.prefix) + fmt.Fprintf(buf, "%s:\n", line.prefix) + continue + } + if err := d.stack.Statistics(stat, line.prefix); err != nil { + if err == syserror.EOPNOTSUPP { + log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } else { + log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } + } + + fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) + + if line.prefix == "Tcp" { + tcp := stat.(*inet.StatSNMPTCP) + // "Tcp" needs special processing because MaxConn is signed. RFC 2012. + fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + } else { + fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + } + } + return nil +} + +// netRouteData implements vfs.DynamicBytesSource for /proc/net/route. +// +// +stateify savable +type netRouteData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netRouteData)(nil) + +// Generate implements vfs.DynamicBytesSource. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. +func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") + + interfaces := d.stack.Interfaces() + for _, rt := range d.stack.RouteTable() { + // /proc/net/route only includes ipv4 routes. + if rt.Family != linux.AF_INET { + continue + } + + // /proc/net/route does not include broadcast or multicast routes. + if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { + continue + } + + iface, ok := interfaces[rt.OutputInterface] + if !ok || iface.Name == "lo" { + continue + } + + var ( + gw uint32 + prefix uint32 + flags = linux.RTF_UP + ) + if len(rt.GatewayAddr) == header.IPv4AddressSize { + flags |= linux.RTF_GATEWAY + gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) + } + if len(rt.DstAddr) == header.IPv4AddressSize { + prefix = usermem.ByteOrder.Uint32(rt.DstAddr) + } + l := fmt.Sprintf( + "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", + iface.Name, + prefix, + gw, + flags, + 0, // RefCnt. + 0, // Use. + 0, // Metric. + (uint32(1)< 0 { @@ -62,7 +62,7 @@ func TestIfinet6(t *testing.T) { "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, } - n := &ifinet6{s: s} + n := &ifinet6{stack: s} contents := n.contents() if len(contents) != len(want) { t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6b58c16b9..8eddf95e0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -73,6 +73,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { "loadavg": {Type: linux.DT_REG}, "meminfo": {Type: linux.DT_REG}, "mounts": {Type: linux.DT_LNK}, + "net": {Type: linux.DT_DIR}, "self": selfLink, "stat": {Type: linux.DT_REG}, "sys": {Type: linux.DT_DIR}, -- cgit v1.2.3 From 8e8d0f96f651ce161dfe6003d738dbda28f7cb0e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 17 Jan 2020 10:39:24 -0800 Subject: Add /proc/[pid]/cgroups file Updates #1195 PiperOrigin-RevId: 290298266 --- pkg/sentry/fsimpl/proc/filesystem.go | 13 ++++++++++++- pkg/sentry/fsimpl/proc/subtasks.go | 18 ++++++++++-------- pkg/sentry/fsimpl/proc/task.go | 30 +++++++++++++++++++++++++----- pkg/sentry/fsimpl/proc/tasks.go | 10 ++++++++-- pkg/sentry/fsimpl/proc/tasks_test.go | 11 ++++++++++- 5 files changed, 65 insertions(+), 17 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index e9cb7895f..f49819187 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -47,7 +47,12 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile procfs := &kernfs.Filesystem{} procfs.VFSFilesystem().Init(vfsObj, procfs) - _, dentry := newTasksInode(procfs, k, pidns) + var data *InternalData + if opts.InternalData != nil { + data = opts.InternalData.(*InternalData) + } + + _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } @@ -78,3 +83,9 @@ var _ dynamicInode = (*staticFile)(nil) func newStaticFile(data string) *staticFile { return &staticFile{StaticData: vfs.StaticData{Data: data}} } + +// InternalData contains internal data passed in to the procfs mount via +// vfs.GetFilesystemOptions.InternalData. +type InternalData struct { + Cgroups map[string]string +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 8892c5a11..91eded415 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -35,18 +35,20 @@ type subtasksInode struct { kernfs.InodeAttrs kernfs.OrderedChildren - task *kernel.Task - pidns *kernel.PIDNamespace - inoGen InoGenerator + task *kernel.Task + pidns *kernel.PIDNamespace + inoGen InoGenerator + cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) -func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator) *kernfs.Dentry { +func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry { subInode := &subtasksInode{ - task: task, - pidns: pidns, - inoGen: inoGen, + task: task, + pidns: pidns, + inoGen: inoGen, + cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) @@ -79,7 +81,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return nil, syserror.ENOENT } - subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false) + subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers) return subTaskDentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 621c17cfe..a0580f20d 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -15,6 +15,7 @@ package proc import ( + "bytes" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" @@ -42,7 +43,7 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool) *kernfs.Dentry { +func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), @@ -68,11 +69,11 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = newSubtasks(task, pidns, inoGen) + contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) + } + if len(cgroupControllers) > 0 { + contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers)) } - //if len(p.cgroupControllers) > 0 { - // contents["cgroup"] = newCGroupInode(t, msrc, p.cgroupControllers) - //} taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. @@ -227,3 +228,22 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr d.Init(taskInode) return d } + +// newCgroupData creates inode that shows cgroup information. +// From man 7 cgroups: "For each cgroup hierarchy of which the process is a +// member, there is one entry containing three colon-separated fields: +// hierarchy-ID:controller-list:cgroup-path" +func newCgroupData(controllers map[string]string) dynamicInode { + buf := bytes.Buffer{} + + // The hierarchy ids must be positive integers (for cgroup v1), but the + // exact number does not matter, so long as they are unique. We can + // just use a counter, but since linux sorts this file in descending + // order, we must count down to preserve this behavior. + i := len(controllers) + for name, dir := range controllers { + fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir) + i-- + } + return newStaticFile(buf.String()) +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 5646c602a..51f634716 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -54,11 +54,16 @@ type tasksInode struct { // Linux. So handle them outside of OrderedChildren. selfSymlink *vfs.Dentry threadSelfSymlink *vfs.Dentry + + // cgroupControllers is a map of controller name to directory in the + // cgroup hierarchy. These controllers are immutable and will be listed + // in /proc/pid/cgroup if not nil. + cgroupControllers map[string]string } var _ kernfs.Inode = (*tasksInode)(nil) -func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace) (*tasksInode, *kernfs.Dentry) { +func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), @@ -78,6 +83,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames inoGen: inoGen, selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -111,7 +117,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := newTaskInode(i.inoGen, task, i.pidns, true) + taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers) return taskDentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 8eddf95e0..002d2f73b 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -87,6 +87,7 @@ func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { wants := map[string]vfs.Dirent{ "auxv": {Type: linux.DT_REG}, + "cgroup": {Type: linux.DT_REG}, "cmdline": {Type: linux.DT_REG}, "comm": {Type: linux.DT_REG}, "environ": {Type: linux.DT_REG}, @@ -145,7 +146,15 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{}) + fsOpts := vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, + }, + } + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) if err != nil { return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) } -- cgit v1.2.3 From d46c397a1cd38f1e2aa5c864c1bb8594fb87bb63 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Jan 2020 13:26:50 -0800 Subject: Add line break to /proc/net files Some files were missing the last line break. PiperOrigin-RevId: 290808898 --- pkg/sentry/fs/proc/net.go | 14 +++++++------- pkg/sentry/fsimpl/proc/tasks_net.go | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 3f17e98ea..bad445f3f 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -52,17 +52,17 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo // implemented in netstack, if the file contains a // header the stub is just the header otherwise it is // an empty file. - "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device")), + "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), - "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode")), - "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess")), - "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode")), - "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em")), + "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), + "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), + "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), + "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), // Linux sets psched values to: nsec per usec, psched // tick in ns, 1000000, high res timer ticks per sec // (ClockGetres returns 1ns resolution). "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), - "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function")), + "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function\n")), "route": seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc), "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), "udp": seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc), @@ -73,7 +73,7 @@ func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSo contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc) - contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode")) + contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) } } d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 3dbf3ba41..4aaf23e97 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -41,12 +41,12 @@ func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k var contents map[string]*kernfs.Dentry if stack := k.NetworkStack(); stack != nil { const ( - arp = "IP address HW type Flags HW address Mask Device" - netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode" - packet = "sk RefCnt Type Proto Iface R Rmem User Inode" - protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em" - ptype = "Type Device Function" - upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode" + arp = "IP address HW type Flags HW address Mask Device\n" + netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" + packet = "sk RefCnt Type Proto Iface R Rmem User Inode\n" + protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n" + ptype = "Type Device Function\n" + upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n" ) psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) @@ -779,6 +779,6 @@ func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { "TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " + "TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " + "TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " + - "TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess") + "TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n") return nil } -- cgit v1.2.3 From ad1968ed5665c7541d6920edbd7c7492b7db3046 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Tue, 21 Jan 2020 14:25:14 -0800 Subject: Implement sysfs. PiperOrigin-RevId: 290822487 --- pkg/abi/linux/file.go | 13 +++ pkg/sentry/fsimpl/kernfs/BUILD | 3 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 175 ++++++------------------------- pkg/sentry/fsimpl/proc/BUILD | 13 +-- pkg/sentry/fsimpl/proc/boot_test.go | 149 -------------------------- pkg/sentry/fsimpl/proc/tasks_test.go | 13 +-- pkg/sentry/fsimpl/sys/BUILD | 35 +++++++ pkg/sentry/fsimpl/sys/sys.go | 124 ++++++++++++++++++++++ pkg/sentry/fsimpl/sys/sys_test.go | 90 ++++++++++++++++ pkg/sentry/fsimpl/testutil/BUILD | 36 +++++++ pkg/sentry/fsimpl/testutil/kernel.go | 149 ++++++++++++++++++++++++++ pkg/sentry/fsimpl/testutil/testutil.go | 178 ++++++++++++++++++++++++++++++++ 12 files changed, 663 insertions(+), 315 deletions(-) delete mode 100644 pkg/sentry/fsimpl/proc/boot_test.go create mode 100644 pkg/sentry/fsimpl/sys/BUILD create mode 100644 pkg/sentry/fsimpl/sys/sys.go create mode 100644 pkg/sentry/fsimpl/sys/sys_test.go create mode 100644 pkg/sentry/fsimpl/testutil/BUILD create mode 100644 pkg/sentry/fsimpl/testutil/kernel.go create mode 100644 pkg/sentry/fsimpl/testutil/testutil.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 16791d03e..6fbdd668d 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -180,6 +180,19 @@ const ( DT_WHT = 14 ) +// DirentType are the friendly strings for linux_dirent64.d_type. +var DirentType = abi.ValueSet{ + DT_UNKNOWN: "DT_UNKNOWN", + DT_FIFO: "DT_FIFO", + DT_CHR: "DT_CHR", + DT_DIR: "DT_DIR", + DT_BLK: "DT_BLK", + DT_REG: "DT_REG", + DT_LNK: "DT_LNK", + DT_SOCK: "DT_SOCK", + DT_WHT: "DT_WHT", +} + // Values for preadv2/pwritev2. const ( // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 809178250..66d409785 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -51,13 +51,12 @@ go_test( deps = [ ":kernfs", "//pkg/abi/linux", - "//pkg/fspath", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", + "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/usermem", "//pkg/sentry/vfs", - "//pkg/sync", "//pkg/syserror", "@com_github_google_go-cmp//cmp:go_default_library", ], diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 5c9d580e1..a5fdfbde5 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -17,20 +17,17 @@ package kernfs_test import ( "bytes" "fmt" - "io" - "runtime" "testing" "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/context/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -41,21 +38,11 @@ const staticFileContent = "This is sample content for a static test file." // filesystem. See newTestSystem. type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry -// TestSystem represents the context for a single test. -type TestSystem struct { - t *testing.T - ctx context.Context - creds *auth.Credentials - vfs *vfs.VirtualFilesystem - mns *vfs.MountNamespace - root vfs.VirtualDentry -} - // newTestSystem sets up a minimal environment for running a test, including an // instance of a test filesystem. Tests can control the contents of the // filesystem by providing an appropriate rootFn, which should return a // pre-populated root dentry. -func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { +func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := vfs.New() @@ -66,57 +53,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem { if err != nil { t.Fatalf("Failed to create testfs root mount: %v", err) } - - s := &TestSystem{ - t: t, - ctx: ctx, - creds: creds, - vfs: v, - mns: mns, - root: mns.Root(), - } - runtime.SetFinalizer(s, func(s *TestSystem) { s.root.DecRef() }) - return s -} - -// PathOpAtRoot constructs a vfs.PathOperation for a path from the -// root of the test filesystem. -// -// Precondition: path should be relative path. -func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation { - return vfs.PathOperation{ - Root: s.root, - Start: s.root, - Path: fspath.Parse(path), - } -} - -// GetDentryOrDie attempts to resolve a dentry referred to by the -// provided path operation. If unsuccessful, the test fails. -func (s *TestSystem) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { - vd, err := s.vfs.GetDentryAt(s.ctx, s.creds, &pop, &vfs.GetDentryOptions{}) - if err != nil { - s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) - } - return vd -} - -func (s *TestSystem) ReadToEnd(fd *vfs.FileDescription) (string, error) { - buf := make([]byte, usermem.PageSize) - bufIOSeq := usermem.BytesIOSequence(buf) - opts := vfs.ReadOptions{} - - var content bytes.Buffer - for { - n, err := fd.Impl().Read(s.ctx, bufIOSeq, opts) - if n == 0 || err != nil { - if err == io.EOF { - err = nil - } - return content.String(), err - } - content.Write(buf[:n]) - } + return testutil.NewSystem(ctx, t, v, mns) } type fsType struct { @@ -260,6 +197,7 @@ func TestBasic(t *testing.T) { "file1": fs.newFile(creds, staticFileContent), }) }) + defer sys.Destroy() sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() } @@ -269,9 +207,10 @@ func TestMkdirGetDentry(t *testing.T) { "dir1": fs.newDir(creds, 0755, nil), }) }) + defer sys.Destroy() pop := sys.PathOpAtRoot("dir1/a new directory") - if err := sys.vfs.MkdirAt(sys.ctx, sys.creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { + if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } sys.GetDentryOrDie(pop).DecRef() @@ -283,20 +222,21 @@ func TestReadStaticFile(t *testing.T) { "file1": fs.newFile(creds, staticFileContent), }) }) + defer sys.Destroy() pop := sys.PathOpAtRoot("file1") - fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) if err != nil { - sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } defer fd.DecRef() content, err := sys.ReadToEnd(fd) if err != nil { - sys.t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %v", err) } if diff := cmp.Diff(staticFileContent, content); diff != "" { - sys.t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) } } @@ -306,83 +246,44 @@ func TestCreateNewFileInStaticDir(t *testing.T) { "dir1": fs.newDir(creds, 0755, nil), }) }) + defer sys.Destroy() pop := sys.PathOpAtRoot("dir1/newfile") opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} - fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, opts) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, opts) if err != nil { - sys.t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) + t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) } // Close the file. The file should persist. fd.DecRef() - fd, err = sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) if err != nil { - sys.t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } fd.DecRef() } -// direntCollector provides an implementation for vfs.IterDirentsCallback for -// testing. It simply iterates to the end of a given directory FD and collects -// all dirents emitted by the callback. -type direntCollector struct { - mu sync.Mutex - dirents map[string]vfs.Dirent -} - -// Handle implements vfs.IterDirentsCallback.Handle. -func (d *direntCollector) Handle(dirent vfs.Dirent) bool { - d.mu.Lock() - if d.dirents == nil { - d.dirents = make(map[string]vfs.Dirent) - } - d.dirents[dirent.Name] = dirent - d.mu.Unlock() - return true -} - -// count returns the number of dirents currently in the collector. -func (d *direntCollector) count() int { - d.mu.Lock() - defer d.mu.Unlock() - return len(d.dirents) -} - -// contains checks whether the collector has a dirent with the given name and -// type. -func (d *direntCollector) contains(name string, typ uint8) error { - d.mu.Lock() - defer d.mu.Unlock() - dirent, ok := d.dirents[name] - if !ok { - return fmt.Errorf("No dirent named %q found", name) - } - if dirent.Type != typ { - return fmt.Errorf("Dirent named %q found, but was expecting type %d, got: %+v", name, typ, dirent) - } - return nil -} - func TestDirFDReadWrite(t *testing.T) { sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry { return fs.newReadonlyDir(creds, 0755, nil) }) + defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) if err != nil { - sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } defer fd.DecRef() // Read/Write should fail for directory FDs. - if _, err := fd.Read(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { - sys.t.Fatalf("Read for directory FD failed with unexpected error: %v", err) + if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { + t.Fatalf("Read for directory FD failed with unexpected error: %v", err) } - if _, err := fd.Write(sys.ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { - sys.t.Fatalf("Wrire for directory FD failed with unexpected error: %v", err) + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + t.Fatalf("Write for directory FD failed with unexpected error: %v", err) } } @@ -397,30 +298,12 @@ func TestDirFDIterDirents(t *testing.T) { "file1": fs.newFile(creds, staticFileContent), }) }) + defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.vfs.OpenAt(sys.ctx, sys.creds, &pop, &vfs.OpenOptions{}) - if err != nil { - sys.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) - } - defer fd.DecRef() - - collector := &direntCollector{} - if err := fd.IterDirents(sys.ctx, collector); err != nil { - sys.t.Fatalf("IterDirent failed: %v", err) - } - - // Root directory should contain ".", ".." and 3 children: - if collector.count() != 5 { - sys.t.Fatalf("IterDirent returned too many dirents") - } - for _, dirName := range []string{".", "..", "dir1", "dir2"} { - if err := collector.contains(dirName, linux.DT_DIR); err != nil { - sys.t.Fatalf("IterDirent had unexpected results: %v", err) - } - } - if err := collector.contains("file1", linux.DT_REG); err != nil { - sys.t.Fatalf("IterDirent had unexpected results: %v", err) - } - + sys.AssertDirectoryContains(&pop, map[string]testutil.DirentType{ + "dir1": linux.DT_DIR, + "dir2": linux.DT_DIR, + "file1": linux.DT_REG, + }) } diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index f69aa19c4..c5b79fb38 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -44,30 +44,19 @@ go_test( name = "proc_test", size = "small", srcs = [ - "boot_test.go", "tasks_sys_test.go", "tasks_test.go", ], embed = [":proc"], deps = [ "//pkg/abi/linux", - "//pkg/cpuid", "//pkg/fspath", - "//pkg/memutil", "//pkg/sentry/context", "//pkg/sentry/context/contexttest", - "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/sched", - "//pkg/sentry/limits", - "//pkg/sentry/loader", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/platform/kvm", - "//pkg/sentry/platform/ptrace", - "//pkg/sentry/time", "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/proc/boot_test.go b/pkg/sentry/fsimpl/proc/boot_test.go deleted file mode 100644 index 84a93ee56..000000000 --- a/pkg/sentry/fsimpl/proc/boot_test.go +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package proc - -import ( - "flag" - "fmt" - "os" - "runtime" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/cpuid" - "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/sched" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/time" - - // Platforms are plugable. - _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" - _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" -) - -var ( - platformFlag = flag.String("platform", "ptrace", "specify which platform to use") -) - -// boot initializes a new bare bones kernel for test. -func boot() (*kernel.Kernel, error) { - platformCtr, err := platform.Lookup(*platformFlag) - if err != nil { - return nil, fmt.Errorf("platform not found: %v", err) - } - deviceFile, err := platformCtr.OpenDevice() - if err != nil { - return nil, fmt.Errorf("creating platform: %v", err) - } - plat, err := platformCtr.New(deviceFile) - if err != nil { - return nil, fmt.Errorf("creating platform: %v", err) - } - - k := &kernel.Kernel{ - Platform: plat, - } - - mf, err := createMemoryFile() - if err != nil { - return nil, err - } - k.SetMemoryFile(mf) - - // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(nil, k) - if err != nil { - return nil, fmt.Errorf("creating vdso: %v", err) - } - - // Create timekeeper. - tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) - if err != nil { - return nil, fmt.Errorf("creating timekeeper: %v", err) - } - tk.SetClocks(time.NewCalibratedClocks()) - - creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) - - // Initiate the Kernel object, which is required by the Context passed - // to createVFS in order to mount (among other things) procfs. - if err = k.Init(kernel.InitKernelArgs{ - ApplicationCores: uint(runtime.GOMAXPROCS(-1)), - FeatureSet: cpuid.HostFeatureSet(), - Timekeeper: tk, - RootUserNamespace: creds.UserNamespace, - Vdso: vdso, - RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), - RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), - RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), - PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), - }); err != nil { - return nil, fmt.Errorf("initializing kernel: %v", err) - } - - ctx := k.SupervisorContext() - - // Create mount namespace without root as it's the minimum required to create - // the global thread group. - mntns, err := fs.NewMountNamespace(ctx, nil) - if err != nil { - return nil, err - } - ls, err := limits.NewLinuxLimitSet() - if err != nil { - return nil, err - } - tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) - k.TestOnly_SetGlobalInit(tg) - - return k, nil -} - -// createTask creates a new bare bones task for tests. -func createTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { - k := kernel.KernelFromContext(ctx) - config := &kernel.TaskConfig{ - Kernel: k, - ThreadGroup: tc, - TaskContext: &kernel.TaskContext{Name: name}, - Credentials: auth.CredentialsFromContext(ctx), - AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), - UTSNamespace: kernel.UTSNamespaceFromContext(ctx), - IPCNamespace: kernel.IPCNamespaceFromContext(ctx), - AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), - } - return k.TaskSet().NewTask(config) -} - -func createMemoryFile() (*pgalloc.MemoryFile, error) { - const memfileName = "test-memory" - memfd, err := memutil.CreateMemFD(memfileName, 0) - if err != nil { - return nil, fmt.Errorf("error creating memfd: %v", err) - } - memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) - if err != nil { - memfile.Close() - return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) - } - return mf, nil -} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 002d2f73b..41977d816 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usermem" @@ -134,7 +135,7 @@ func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, e } func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) { - k, err := boot() + k, err := testutil.Boot() if err != nil { return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err) } @@ -206,7 +207,7 @@ func TestTasks(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -298,7 +299,7 @@ func TestTasksOffset(t *testing.T) { k := kernel.KernelFromContext(ctx) for i := 0; i < 3; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - if _, err := createTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + if _, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil { t.Fatalf("CreateTask(): %v", err) } } @@ -417,7 +418,7 @@ func TestTask(t *testing.T) { k := kernel.KernelFromContext(ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - _, err = createTask(ctx, "name", tc) + _, err = testutil.CreateTask(ctx, "name", tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -458,7 +459,7 @@ func TestProcSelf(t *testing.T) { k := kernel.KernelFromContext(ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := createTask(ctx, "name", tc) + task, err := testutil.CreateTask(ctx, "name", tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -555,7 +556,7 @@ func TestTree(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := createTask(uberCtx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(uberCtx, fmt.Sprintf("name-%d", i), tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD new file mode 100644 index 000000000..ee3c842bd --- /dev/null +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -0,0 +1,35 @@ +load("//tools/go_stateify:defs.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "sys", + srcs = [ + "sys.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "sys_test", + srcs = ["sys_test.go"], + deps = [ + ":sys", + "//pkg/abi/linux", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go new file mode 100644 index 000000000..1305ad01d --- /dev/null +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -0,0 +1,124 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sys implements sysfs. +package sys + +import ( + "bytes" + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Filesystem.Init(vfsObj) + k := kernel.KernelFromContext(ctx) + maxCPUCores := k.ApplicationCores() + defaultSysDirMode := linux.FileMode(0755) + + root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "block": fs.newDir(creds, defaultSysDirMode, nil), + "bus": fs.newDir(creds, defaultSysDirMode, nil), + "class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "power_supply": fs.newDir(creds, defaultSysDirMode, nil), + }), + "dev": fs.newDir(creds, defaultSysDirMode, nil), + "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ + "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + }), + }), + }), + "firmware": fs.newDir(creds, defaultSysDirMode, nil), + "fs": fs.newDir(creds, defaultSysDirMode, nil), + "kernel": fs.newDir(creds, defaultSysDirMode, nil), + "module": fs.newDir(creds, defaultSysDirMode, nil), + "power": fs.newDir(creds, defaultSysDirMode, nil), + }) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// dir implements kernfs.Inode. +type dir struct { + kernfs.InodeAttrs + kernfs.InodeNoDynamicLookup + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + + kernfs.OrderedChildren + dentry kernfs.Dentry +} + +func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { + d := &dir{} + d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755) + d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + d.dentry.Init(d) + + d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents)) + + return &d.dentry +} + +// SetStat implements kernfs.Inode.SetStat. +func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + return fd.VFSFileDescription(), nil +} + +// cpuFile implements kernfs.Inode. +type cpuFile struct { + kernfs.DynamicBytesFile + maxCores uint +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "0-%d", c.maxCores-1) + return nil +} + +func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { + c := &cpuFile{maxCores: maxCores} + c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode) + d := &kernfs.Dentry{} + d.Init(c) + return d +} diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go new file mode 100644 index 000000000..60a1634a9 --- /dev/null +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -0,0 +1,90 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sys_test + +import ( + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func newTestSystem(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Failed to create test kernel: %v", err) + } + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + v := vfs.New() + v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("Failed to create new mount namespace: %v", err) + } + return testutil.NewSystem(ctx, t, v, mns) +} + +func TestReadCPUFile(t *testing.T) { + s := newTestSystem(t) + defer s.Destroy() + k := kernel.KernelFromContext(s.Ctx) + maxCPUCores := k.ApplicationCores() + + expected := fmt.Sprintf("0-%d", maxCPUCores-1) + + for _, fname := range []string{"online", "possible", "present"} { + pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) + fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, &pop, &vfs.OpenOptions{}) + if err != nil { + t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) + } + defer fd.DecRef() + content, err := s.ReadToEnd(fd) + if err != nil { + t.Fatalf("Read failed: %v", err) + } + if diff := cmp.Diff(expected, content); diff != "" { + t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff) + } + } +} + +func TestSysRootContainsExpectedEntries(t *testing.T) { + s := newTestSystem(t) + defer s.Destroy() + pop := s.PathOpAtRoot("/") + s.AssertDirectoryContains(&pop, map[string]testutil.DirentType{ + "block": linux.DT_DIR, + "bus": linux.DT_DIR, + "class": linux.DT_DIR, + "dev": linux.DT_DIR, + "devices": linux.DT_DIR, + "firmware": linux.DT_DIR, + "fs": linux.DT_DIR, + "kernel": linux.DT_DIR, + "module": linux.DT_DIR, + "power": linux.DT_DIR, + }) +} diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD new file mode 100644 index 000000000..4e70d84a7 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -0,0 +1,36 @@ +load("//tools/go_stateify:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "kernel.go", + "testutil.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/cpuid", + "//pkg/fspath", + "//pkg/memutil", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/sched", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/kvm", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/time", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/sync", + "@com_github_google_go-cmp//cmp:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go new file mode 100644 index 000000000..295da2d52 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -0,0 +1,149 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "flag" + "fmt" + "os" + "runtime" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/sched" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/time" + + // Platforms are plugable. + _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" + _ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" +) + +var ( + platformFlag = flag.String("platform", "ptrace", "specify which platform to use") +) + +// Boot initializes a new bare bones kernel for test. +func Boot() (*kernel.Kernel, error) { + platformCtr, err := platform.Lookup(*platformFlag) + if err != nil { + return nil, fmt.Errorf("platform not found: %v", err) + } + deviceFile, err := platformCtr.OpenDevice() + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + plat, err := platformCtr.New(deviceFile) + if err != nil { + return nil, fmt.Errorf("creating platform: %v", err) + } + + k := &kernel.Kernel{ + Platform: plat, + } + + mf, err := createMemoryFile() + if err != nil { + return nil, err + } + k.SetMemoryFile(mf) + + // Pass k as the platform since it is savable, unlike the actual platform. + vdso, err := loader.PrepareVDSO(nil, k) + if err != nil { + return nil, fmt.Errorf("creating vdso: %v", err) + } + + // Create timekeeper. + tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange()) + if err != nil { + return nil, fmt.Errorf("creating timekeeper: %v", err) + } + tk.SetClocks(time.NewCalibratedClocks()) + + creds := auth.NewRootCredentials(auth.NewRootUserNamespace()) + + // Initiate the Kernel object, which is required by the Context passed + // to createVFS in order to mount (among other things) procfs. + if err = k.Init(kernel.InitKernelArgs{ + ApplicationCores: uint(runtime.GOMAXPROCS(-1)), + FeatureSet: cpuid.HostFeatureSet(), + Timekeeper: tk, + RootUserNamespace: creds.UserNamespace, + Vdso: vdso, + RootUTSNamespace: kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace), + RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace), + RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace), + }); err != nil { + return nil, fmt.Errorf("initializing kernel: %v", err) + } + + ctx := k.SupervisorContext() + + // Create mount namespace without root as it's the minimum required to create + // the global thread group. + mntns, err := fs.NewMountNamespace(ctx, nil) + if err != nil { + return nil, err + } + ls, err := limits.NewLinuxLimitSet() + if err != nil { + return nil, err + } + tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + k.TestOnly_SetGlobalInit(tg) + + return k, nil +} + +// CreateTask creates a new bare bones task for tests. +func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { + k := kernel.KernelFromContext(ctx) + config := &kernel.TaskConfig{ + Kernel: k, + ThreadGroup: tc, + TaskContext: &kernel.TaskContext{Name: name}, + Credentials: auth.CredentialsFromContext(ctx), + AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), + UTSNamespace: kernel.UTSNamespaceFromContext(ctx), + IPCNamespace: kernel.IPCNamespaceFromContext(ctx), + AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + } + return k.TaskSet().NewTask(config) +} + +func createMemoryFile() (*pgalloc.MemoryFile, error) { + const memfileName = "test-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + return nil, fmt.Errorf("error creating memfd: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err) + } + return mf, nil +} diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go new file mode 100644 index 000000000..eada31d94 --- /dev/null +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -0,0 +1,178 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil provides common test utilities for kernfs-based +// filesystems. +package testutil + +import ( + "fmt" + "io" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// System represents the context for a single test. +// +// Test systems must be explicitly destroyed with System.Destroy. +type System struct { + t *testing.T + Ctx context.Context + Creds *auth.Credentials + VFS *vfs.VirtualFilesystem + mns *vfs.MountNamespace + root vfs.VirtualDentry +} + +// NewSystem constructs a System. +// +// Precondition: Caller must hold a reference on mns, whose ownership +// is transferred to the new System. +func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { + s := &System{ + t: t, + Ctx: ctx, + Creds: auth.CredentialsFromContext(ctx), + VFS: v, + mns: mns, + root: mns.Root(), + } + return s +} + +// Destroy release resources associated with a test system. +func (s *System) Destroy() { + s.root.DecRef() + s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem. +} + +// ReadToEnd reads the contents of fd until EOF to a string. +func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) { + buf := make([]byte, usermem.PageSize) + bufIOSeq := usermem.BytesIOSequence(buf) + opts := vfs.ReadOptions{} + + var content strings.Builder + for { + n, err := fd.Read(s.Ctx, bufIOSeq, opts) + if n == 0 || err != nil { + if err == io.EOF { + err = nil + } + return content.String(), err + } + content.Write(buf[:n]) + } +} + +// PathOpAtRoot constructs a PathOperation with the given path from +// the root of the filesystem. +func (s *System) PathOpAtRoot(path string) vfs.PathOperation { + return vfs.PathOperation{ + Root: s.root, + Start: s.root, + Path: fspath.Parse(path), + } +} + +// GetDentryOrDie attempts to resolve a dentry referred to by the +// provided path operation. If unsuccessful, the test fails. +func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) + } + return vd +} + +// DirentType is an alias for values for linux_dirent64.d_type. +type DirentType = uint8 + +// AssertDirectoryContains verifies that a directory at pop contains the entries +// specified. AssertDirectoryContains implicitly checks for "." and "..", these +// need not be included in entries. +func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[string]DirentType) { + // Also implicitly check for "." and "..". + entries["."] = linux.DT_DIR + entries[".."] = linux.DT_DIR + + fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY}) + if err != nil { + s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) + } + defer fd.DecRef() + + collector := &DirentCollector{} + if err := fd.IterDirents(s.Ctx, collector); err != nil { + s.t.Fatalf("IterDirent failed: %v", err) + } + + collectedEntries := make(map[string]DirentType) + for _, dirent := range collector.dirents { + collectedEntries[dirent.Name] = dirent.Type + } + if diff := cmp.Diff(entries, collectedEntries); diff != "" { + s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff) + } +} + +// DirentCollector provides an implementation for vfs.IterDirentsCallback for +// testing. It simply iterates to the end of a given directory FD and collects +// all dirents emitted by the callback. +type DirentCollector struct { + mu sync.Mutex + dirents map[string]vfs.Dirent +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { + d.mu.Lock() + if d.dirents == nil { + d.dirents = make(map[string]vfs.Dirent) + } + d.dirents[dirent.Name] = dirent + d.mu.Unlock() + return true +} + +// Count returns the number of dirents currently in the collector. +func (d *DirentCollector) Count() int { + d.mu.Lock() + defer d.mu.Unlock() + return len(d.dirents) +} + +// Contains checks whether the collector has a dirent with the given name and +// type. +func (d *DirentCollector) Contains(name string, typ uint8) error { + d.mu.Lock() + defer d.mu.Unlock() + dirent, ok := d.dirents[name] + if !ok { + return fmt.Errorf("No dirent named %q found", name) + } + if dirent.Type != typ { + return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent) + } + return nil +} -- cgit v1.2.3 From 5ab1213a6c405071546c783d6d93b4e9af52842e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 22 Jan 2020 12:27:16 -0800 Subject: Move VFS2 handling of FD readability/writability to vfs.FileDescription. PiperOrigin-RevId: 291006713 --- pkg/sentry/fsimpl/ext/inode.go | 8 +++- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 11 +++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 11 ++++- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 18 +++++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 15 ++---- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 5 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 14 +----- pkg/sentry/kernel/pipe/vfs.go | 12 ++--- pkg/sentry/vfs/file_description.go | 66 ++++++++++++++++++++++++-- pkg/sentry/vfs/permissions.go | 5 +- 10 files changed, 111 insertions(+), 54 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 8608805bf..191b39970 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -157,7 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *directory: // Can't open directories writably. This check is not necessary for a read @@ -166,7 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: if flags&linux.O_PATH == 0 { diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 606ca692d..75624e0b1 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -55,7 +55,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy // Open implements Inode.Open. func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - fd.Init(rp.Mount(), vfsd, f.data, flags) + if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + return nil, err + } return &fd.vfsfd, nil } @@ -80,10 +82,13 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.inode = d.Impl().(*Dentry).inode fd.SetDataSource(data) - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bcf069b5f..5fa1fa67b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,9 +43,16 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { + if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { + // Can't open directories for writing. + return syserror.EISDIR + } + if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return err + } fd.children = children - fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}) + return nil } // VFSFileDescription returns a pointer to the vfs.FileDescription representing diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index a5fdfbde5..aa3fe76ee 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -115,7 +115,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -225,7 +227,9 @@ func TestReadStaticFile(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("file1") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ -258,7 +262,9 @@ func TestCreateNewFileInStaticDir(t *testing.T) { // Close the file. The file should persist. fd.DecRef() - fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } @@ -272,7 +278,9 @@ func TestDirFDReadWrite(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{}) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } @@ -282,7 +290,7 @@ func TestDirFDReadWrite(t *testing.T) { if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { t.Fatalf("Read for directory FD failed with unexpected error: %v", err) } - if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EISDIR { + if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF { t.Fatalf("Write for directory FD failed with unexpected error: %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 4cd7e9aea..a9f66a42a 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -337,19 +337,12 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, err } } - mnt := rp.Mount() switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - if fd.writable { - if err := mnt.CheckBeginWrite(); err != nil { - return nil, err - } - // mnt.EndWrite() is called by regularFileFD.Release(). + if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err } - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) if flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data.Truncate(0, impl.memFile) @@ -363,7 +356,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}) + if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return &fd.vfsfd, nil case *symlink: // Can't open symlinks without O_PATH (which is unimplemented). diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 40bde54de..482aabd52 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -50,11 +50,10 @@ type namedPipeFD struct { func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { var err error var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags) + fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) if err != nil { return nil, err } - mnt := rp.Mount() - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 5fa70cc6d..7c633c1b0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -101,10 +101,6 @@ func (rf *regularFile) truncate(size uint64) (bool, error) { type regularFileFD struct { fileDescription - // These are immutable. - readable bool - writable bool - // off is the file offset. off is accessed using atomic memory operations. // offMu serializes operations that may mutate off. off int64 @@ -113,16 +109,11 @@ type regularFileFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release() { - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } + // noop } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } @@ -147,9 +138,6 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } if offset < 0 { return 0, syserror.EINVAL } diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index bf7461cbb..6f83e3cee 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -66,7 +66,7 @@ func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { // for read and write will succeed both in blocking and nonblocking mode. POSIX // leaves this behavior undefined. This can be used to open a FIFO for writing // while there are no readers available." - fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -76,7 +76,7 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd return nil, syserror.EINVAL } - vfd, err := vp.open(rp, vfsd, vfsfd, flags) + vfd, err := vp.open(vfsd, vfsfd, flags) if err != nil { return nil, err } @@ -118,19 +118,13 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, rp *vfs.ResolvingPath, vfsd } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { var fd VFSPipeFD fd.flags = flags fd.readable = vfs.MayReadFileWithOpenFlags(flags) fd.writable = vfs.MayWriteFileWithOpenFlags(flags) fd.vfsfd = vfsfd fd.pipe = &vp.pipe - if fd.writable { - // The corresponding Mount.EndWrite() is in VFSPipe.Release(). - if err := rp.Mount().CheckBeginWrite(); err != nil { - return nil, err - } - } switch { case fd.readable && fd.writable: diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 6afe280bc..51c95c2d9 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -49,8 +49,23 @@ type FileDescription struct { // A reference is held on vd. vd is immutable. vd VirtualDentry + // opts contains options passed to FileDescription.Init(). opts is + // immutable. opts FileDescriptionOptions + // readable is MayReadFileWithOpenFlags(statusFlags). readable is + // immutable. + // + // readable is analogous to Linux's FMODE_READ. + readable bool + + // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true, + // the FileDescription holds a write count on vd.mount. writable is + // immutable. + // + // writable is analogous to Linux's FMODE_WRITE. + writable bool + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -77,10 +92,17 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } -// Init must be called before first use of fd. It takes references on mnt and -// d. statusFlags is the initial file description status flags, which is -// usually the full set of flags passed to open(2). -func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) { +// Init must be called before first use of fd. If it succeeds, it takes +// references on mnt and d. statusFlags is the initial file description status +// flags, which is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { + writable := MayWriteFileWithOpenFlags(statusFlags) + if writable { + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + } + fd.refs = 1 fd.statusFlags = statusFlags | linux.O_LARGEFILE fd.vd = VirtualDentry{ @@ -89,7 +111,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn } fd.vd.IncRef() fd.opts = *opts + fd.readable = MayReadFileWithOpenFlags(statusFlags) + fd.writable = writable fd.impl = impl + return nil } // IncRef increments fd's reference count. @@ -117,6 +142,9 @@ func (fd *FileDescription) TryIncRef() bool { func (fd *FileDescription) DecRef() { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { fd.impl.Release() + if fd.writable { + fd.vd.mount.EndWrite() + } fd.vd.DecRef() } else if refs < 0 { panic("FileDescription.DecRef() called without holding a reference") @@ -194,6 +222,16 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede return nil } +// IsReadable returns true if fd was opened for reading. +func (fd *FileDescription) IsReadable() bool { + return fd.readable +} + +// IsWritable returns true if fd was opened for writing. +func (fd *FileDescription) IsWritable() bool { + return fd.writable +} + // Impl returns the FileDescriptionImpl associated with fd. func (fd *FileDescription) Impl() FileDescriptionImpl { return fd.impl @@ -241,6 +279,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) // Read is similar to PRead, but does not specify an offset. @@ -254,6 +294,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for reading. Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) // PWrite writes src to the file, starting at the given offset, and returns @@ -268,6 +310,8 @@ type FileDescriptionImpl interface { // // - If opts.Flags specifies unsupported options, PWrite returns // EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is @@ -281,6 +325,8 @@ type FileDescriptionImpl interface { // Errors: // // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. + // + // Preconditions: The FileDescription was opened for writing. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the @@ -411,11 +457,17 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.PRead(ctx, dst, offset, opts) } // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EBADF + } return fd.impl.Read(ctx, dst, opts) } @@ -423,11 +475,17 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt // offset, and returns the number of bytes written. PWrite is permitted to // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.PWrite(ctx, src, offset, opts) } // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EBADF + } return fd.impl.Write(ctx, src, opts) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index d279d05ca..f664581f4 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -94,14 +94,13 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // the set of accesses permitted for the opened file: // // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it -// mutates the file), but does not permit the opened to write to the file +// mutates the file), but does not permit writing to the open file description // thereafter. // // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in // flags to mean: check for read and write permission on the file and return a // file descriptor that can't be used for reading or writing." - open(2). Thus -// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but -// filesystems are responsible for ensuring that access is denied. +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. func AccessTypesForOpenFlags(flags uint32) AccessTypes { -- cgit v1.2.3 From 896bd654b6622d20cbaf8e82b4554a5375addf81 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Wed, 22 Jan 2020 15:14:43 -0800 Subject: De-duplicate common test functionality for VFS2 filesystems. PiperOrigin-RevId: 291041576 --- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 12 +- pkg/sentry/fsimpl/proc/tasks_test.go | 411 +++++++++++--------------------- pkg/sentry/fsimpl/sys/sys_test.go | 4 +- pkg/sentry/fsimpl/testutil/testutil.go | 149 ++++++++++-- 4 files changed, 278 insertions(+), 298 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index aa3fe76ee..fade59491 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -212,7 +212,7 @@ func TestMkdirGetDentry(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("dir1/a new directory") - if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, &pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { + if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } sys.GetDentryOrDie(pop).DecRef() @@ -227,7 +227,7 @@ func TestReadStaticFile(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("file1") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { @@ -254,7 +254,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { pop := sys.PathOpAtRoot("dir1/newfile") opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode} - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, opts) + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, opts) if err != nil { t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err) } @@ -262,7 +262,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { // Close the file. The file should persist. fd.DecRef() - fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { @@ -278,7 +278,7 @@ func TestDirFDReadWrite(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("/") - fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, &pop, &vfs.OpenOptions{ + fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { @@ -309,7 +309,7 @@ func TestDirFDIterDirents(t *testing.T) { defer sys.Destroy() pop := sys.PathOpAtRoot("/") - sys.AssertDirectoryContains(&pop, map[string]testutil.DirentType{ + sys.AssertAllDirentTypes(sys.ListDirents(pop), map[string]testutil.DirentType{ "dir1": linux.DT_DIR, "dir2": linux.DT_DIR, "file1": linux.DT_REG, diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 41977d816..2c1635f33 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -44,100 +44,47 @@ var ( proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1} ) -type testIterDirentsCallback struct { - dirents []vfs.Dirent -} - -func (t *testIterDirentsCallback) Handle(d vfs.Dirent) bool { - t.dirents = append(t.dirents, d) - return true -} - -func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) { - if got := len(dirs); got < 2 { - return dirs, fmt.Errorf("wrong number of dirents, want at least: 2, got: %d: %v", got, dirs) - } - for i, want := range []string{".", ".."} { - if got := dirs[i].Name; got != want { - return dirs, fmt.Errorf("wrong name, want: %s, got: %s", want, got) - } - if got := dirs[i].Type; got != linux.DT_DIR { - return dirs, fmt.Errorf("wrong type, want: %d, got: %d", linux.DT_DIR, got) - } - } - return dirs[2:], nil -} - -func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { - wants := map[string]vfs.Dirent{ - "cpuinfo": {Type: linux.DT_REG}, - "loadavg": {Type: linux.DT_REG}, - "meminfo": {Type: linux.DT_REG}, - "mounts": {Type: linux.DT_LNK}, - "net": {Type: linux.DT_DIR}, - "self": selfLink, - "stat": {Type: linux.DT_REG}, - "sys": {Type: linux.DT_DIR}, - "thread-self": threadSelfLink, - "uptime": {Type: linux.DT_REG}, - "version": {Type: linux.DT_REG}, - } - return checkFiles(gots, wants) -} - -func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) { - wants := map[string]vfs.Dirent{ - "auxv": {Type: linux.DT_REG}, - "cgroup": {Type: linux.DT_REG}, - "cmdline": {Type: linux.DT_REG}, - "comm": {Type: linux.DT_REG}, - "environ": {Type: linux.DT_REG}, - "gid_map": {Type: linux.DT_REG}, - "io": {Type: linux.DT_REG}, - "maps": {Type: linux.DT_REG}, - "ns": {Type: linux.DT_DIR}, - "smaps": {Type: linux.DT_REG}, - "stat": {Type: linux.DT_REG}, - "statm": {Type: linux.DT_REG}, - "status": {Type: linux.DT_REG}, - "task": {Type: linux.DT_DIR}, - "uid_map": {Type: linux.DT_REG}, - } - return checkFiles(gots, wants) -} - -func checkFiles(gots []vfs.Dirent, wants map[string]vfs.Dirent) ([]vfs.Dirent, error) { - // Go over all files, when there is a match, the file is removed from both - // 'gots' and 'wants'. wants is expected to reach 0, as all files must - // be present. Remaining files in 'gots', is returned to caller to decide - // whether this is valid or not. - for i := 0; i < len(gots); i++ { - got := gots[i] - want, ok := wants[got.Name] - if !ok { - continue - } - if want.Type != got.Type { - return gots, fmt.Errorf("wrong file type, want: %v, got: %v: %+v", want.Type, got.Type, got) - } - if want.NextOff != 0 && want.NextOff != got.NextOff { - return gots, fmt.Errorf("wrong dirent offset, want: %v, got: %v: %+v", want.NextOff, got.NextOff, got) - } - - delete(wants, got.Name) - gots = append(gots[0:i], gots[i+1:]...) - i-- - } - if len(wants) != 0 { - return gots, fmt.Errorf("not all files were found, missing: %+v", wants) +var ( + tasksStaticFiles = map[string]testutil.DirentType{ + "cpuinfo": linux.DT_REG, + "loadavg": linux.DT_REG, + "meminfo": linux.DT_REG, + "mounts": linux.DT_LNK, + "net": linux.DT_DIR, + "self": linux.DT_LNK, + "stat": linux.DT_REG, + "sys": linux.DT_DIR, + "thread-self": linux.DT_LNK, + "uptime": linux.DT_REG, + "version": linux.DT_REG, + } + tasksStaticFilesNextOffs = map[string]int64{ + "self": selfLink.NextOff, + "thread-self": threadSelfLink.NextOff, + } + taskStaticFiles = map[string]testutil.DirentType{ + "auxv": linux.DT_REG, + "cgroup": linux.DT_REG, + "cmdline": linux.DT_REG, + "comm": linux.DT_REG, + "environ": linux.DT_REG, + "gid_map": linux.DT_REG, + "io": linux.DT_REG, + "maps": linux.DT_REG, + "ns": linux.DT_DIR, + "smaps": linux.DT_REG, + "stat": linux.DT_REG, + "statm": linux.DT_REG, + "status": linux.DT_REG, + "task": linux.DT_DIR, + "uid_map": linux.DT_REG, } - return gots, nil -} +) -func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) { +func setup(t *testing.T) *testutil.System { k, err := testutil.Boot() if err != nil { - return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("creating kernel: %v", err) + t.Fatalf("Error creating kernel: %v", err) } ctx := k.SupervisorContext() @@ -157,93 +104,60 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error) } mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) if err != nil { - return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err) + t.Fatalf("NewMountNamespace(): %v", err) } - return ctx, vfsObj, mntns.Root(), nil + return testutil.NewSystem(ctx, t, vfsObj, mntns) } func TestTasksEmpty(t *testing.T) { - ctx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) - } - defer root.DecRef() - - fd, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt failed: %v", err) - } + s := setup(t) + defer s.Destroy() - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { - t.Fatalf("IterDirents(): %v", err) - } - cb.dirents, err = checkDots(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - cb.dirents, err = checkTasksStaticFiles(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - if len(cb.dirents) != 0 { - t.Errorf("found more files than expected: %+v", cb.dirents) - } + collector := s.ListDirents(s.PathOpAtRoot("/")) + s.AssertAllDirentTypes(collector, tasksStaticFiles) + s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) } func TestTasks(t *testing.T) { - ctx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) + s := setup(t) + defer s.Destroy() + + expectedDirents := make(map[string]testutil.DirentType) + for n, d := range tasksStaticFiles { + expectedDirents[n] = d } - defer root.DecRef() - k := kernel.KernelFromContext(ctx) + k := kernel.KernelFromContext(s.Ctx) var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } tasks = append(tasks, task) + expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR } - fd, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) - } + collector := s.ListDirents(s.PathOpAtRoot("/")) + s.AssertAllDirentTypes(collector, expectedDirents) + s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { - t.Fatalf("IterDirents(): %v", err) - } - cb.dirents, err = checkDots(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - cb.dirents, err = checkTasksStaticFiles(cb.dirents) - if err != nil { - t.Error(err.Error()) - } lastPid := 0 - for _, d := range cb.dirents { + dirents := collector.OrderedDirents() + doneSkippingNonTaskDirs := false + for _, d := range dirents { pid, err := strconv.Atoi(d.Name) if err != nil { + if !doneSkippingNonTaskDirs { + // We haven't gotten to the task dirs yet. + continue + } t.Fatalf("Invalid process directory %q", d.Name) } + doneSkippingNonTaskDirs = true if lastPid > pid { - t.Errorf("pids not in order: %v", cb.dirents) + t.Errorf("pids not in order: %v", dirents) } found := false for _, t := range tasks { @@ -260,13 +174,16 @@ func TestTasks(t *testing.T) { t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d) } } + if !doneSkippingNonTaskDirs { + t.Fatalf("Never found any process directories.") + } // Test lookup. for _, path := range []string{"/1", "/2"} { - fd, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(path)}, + fd, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot(path), &vfs.OpenOptions{}, ) if err != nil { @@ -274,15 +191,15 @@ func TestTasks(t *testing.T) { } buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) - if _, err := fd.Read(ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { + if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { t.Errorf("wrong error reading directory: %v", err) } } - if _, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/9999")}, + if _, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot("/9999"), &vfs.OpenOptions{}, ); err != syserror.ENOENT { t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) @@ -290,16 +207,13 @@ func TestTasks(t *testing.T) { } func TestTasksOffset(t *testing.T) { - ctx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) - } - defer root.DecRef() + s := setup(t) + defer s.Destroy() - k := kernel.KernelFromContext(ctx) + k := kernel.KernelFromContext(s.Ctx) for i := 0; i < 3; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - if _, err := testutil.CreateTask(ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil { t.Fatalf("CreateTask(): %v", err) } } @@ -382,134 +296,100 @@ func TestTasksOffset(t *testing.T) { }, } { t.Run(tc.name, func(t *testing.T) { - fd, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + s := s.WithSubtest(t) + fd, err := s.VFS.OpenAt( + s.Ctx, + s.Creds, + s.PathOpAtRoot("/"), &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - if _, err := fd.Impl().Seek(ctx, tc.offset, linux.SEEK_SET); err != nil { + if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { - t.Fatalf("IterDirents(): %v", err) + var collector testutil.DirentCollector + if err := fd.IterDirents(s.Ctx, &collector); err != nil { + t.Fatalf("IterDirent(): %v", err) } - if cb.dirents, err = checkFiles(cb.dirents, tc.wants); err != nil { - t.Error(err.Error()) - } - if len(cb.dirents) != 0 { - t.Errorf("found more files than expected: %+v", cb.dirents) + + expectedTypes := make(map[string]testutil.DirentType) + expectedOffsets := make(map[string]int64) + for name, want := range tc.wants { + expectedTypes[name] = want.Type + if want.NextOff != 0 { + expectedOffsets[name] = want.NextOff + } } + + collector.SkipDotsChecks(true) // We seek()ed past the dots. + s.AssertAllDirentTypes(&collector, expectedTypes) + s.AssertDirentOffsets(&collector, expectedOffsets) }) } } func TestTask(t *testing.T) { - ctx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) - } - defer root.DecRef() + s := setup(t) + defer s.Destroy() - k := kernel.KernelFromContext(ctx) + k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - _, err = testutil.CreateTask(ctx, "name", tc) + _, err := testutil.CreateTask(s.Ctx, "name", tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } - fd, err := vfsObj.OpenAt( - ctx, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/1")}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt(/1) failed: %v", err) - } - - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { - t.Fatalf("IterDirents(): %v", err) - } - cb.dirents, err = checkDots(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - cb.dirents, err = checkTaskStaticFiles(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - if len(cb.dirents) != 0 { - t.Errorf("found more files than expected: %+v", cb.dirents) - } + collector := s.ListDirents(s.PathOpAtRoot("/1")) + s.AssertAllDirentTypes(collector, taskStaticFiles) } func TestProcSelf(t *testing.T) { - ctx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) - } - defer root.DecRef() + s := setup(t) + defer s.Destroy() - k := kernel.KernelFromContext(ctx) + k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(ctx, "name", tc) + task, err := testutil.CreateTask(s.Ctx, "name", tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } - fd, err := vfsObj.OpenAt( - task, - auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/self/"), FollowFinalSymlink: true}, - &vfs.OpenOptions{}, - ) - if err != nil { - t.Fatalf("vfsfs.OpenAt(/self/) failed: %v", err) - } - - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { - t.Fatalf("IterDirents(): %v", err) - } - cb.dirents, err = checkDots(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - cb.dirents, err = checkTaskStaticFiles(cb.dirents) - if err != nil { - t.Error(err.Error()) - } - if len(cb.dirents) != 0 { - t.Errorf("found more files than expected: %+v", cb.dirents) - } + collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("/self/"), + FollowFinalSymlink: true, + }) + s.AssertAllDirentTypes(collector, taskStaticFiles) } -func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, fd *vfs.FileDescription) { +func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) { t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) - cb := testIterDirentsCallback{} - if err := fd.Impl().IterDirents(ctx, &cb); err != nil { + var collector testutil.DirentCollector + if err := fd.IterDirents(ctx, &collector); err != nil { t.Fatalf("IterDirents(): %v", err) } - var err error - cb.dirents, err = checkDots(cb.dirents) - if err != nil { + if err := collector.Contains(".", linux.DT_DIR); err != nil { t.Error(err.Error()) } - for _, d := range cb.dirents { + if err := collector.Contains("..", linux.DT_DIR); err != nil { + t.Error(err.Error()) + } + + for _, d := range collector.Dirents() { + if d.Name == "." || d.Name == ".." { + continue + } childPath := path.Join(fd.MappedName(ctx), d.Name) if d.Type == linux.DT_LNK { - link, err := vfsObj.ReadlinkAt( + link, err := s.VFS.ReadlinkAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, ) if err != nil { t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) @@ -520,10 +400,10 @@ func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem } t.Logf("Opening: /proc%s", childPath) - child, err := vfsObj.OpenAt( + child, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, &vfs.OpenOptions{}, ) if err != nil { @@ -539,24 +419,21 @@ func iterateDir(ctx context.Context, t *testing.T, vfsObj *vfs.VirtualFilesystem } if d.Type == linux.DT_DIR { // Found another dir, let's do it again! - iterateDir(ctx, t, vfsObj, root, child) + iterateDir(ctx, t, s, child) } } } // TestTree iterates all directories and stats every file. func TestTree(t *testing.T) { - uberCtx, vfsObj, root, err := setup() - if err != nil { - t.Fatalf("Setup failed: %v", err) - } - defer root.DecRef() + s := setup(t) + defer s.Destroy() - k := kernel.KernelFromContext(uberCtx) + k := kernel.KernelFromContext(s.Ctx) var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(uberCtx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -564,14 +441,14 @@ func TestTree(t *testing.T) { } ctx := tasks[0] - fd, err := vfsObj.OpenAt( + fd, err := s.VFS.OpenAt( ctx, - auth.CredentialsFromContext(uberCtx), - &vfs.PathOperation{Root: root, Start: root, Path: fspath.Parse("/")}, + auth.CredentialsFromContext(s.Ctx), + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")}, &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - iterateDir(ctx, t, vfsObj, root, fd) + iterateDir(ctx, t, s, fd) } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 60a1634a9..8b1cf0bd0 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -56,7 +56,7 @@ func TestReadCPUFile(t *testing.T) { for _, fname := range []string{"online", "possible", "present"} { pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) - fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, &pop, &vfs.OpenOptions{}) + fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{}) if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } @@ -75,7 +75,7 @@ func TestSysRootContainsExpectedEntries(t *testing.T) { s := newTestSystem(t) defer s.Destroy() pop := s.PathOpAtRoot("/") - s.AssertDirectoryContains(&pop, map[string]testutil.DirentType{ + s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{ "block": linux.DT_DIR, "bus": linux.DT_DIR, "class": linux.DT_DIR, diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index eada31d94..2a723a89f 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -40,8 +40,8 @@ type System struct { Ctx context.Context Creds *auth.Credentials VFS *vfs.VirtualFilesystem + Root vfs.VirtualDentry mns *vfs.MountNamespace - root vfs.VirtualDentry } // NewSystem constructs a System. @@ -55,14 +55,49 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns Creds: auth.CredentialsFromContext(ctx), VFS: v, mns: mns, - root: mns.Root(), + Root: mns.Root(), } return s } +// WithSubtest creates a temporary test system with a new test harness, +// referencing all other resources from the original system. This is useful when +// a system is reused for multiple subtests, and the T needs to change for each +// case. Note that this is safe when test cases run in parallel, as all +// resources referenced by the system are immutable, or handle interior +// mutations in a thread-safe manner. +// +// The returned system must not outlive the original and should not be destroyed +// via System.Destroy. +func (s *System) WithSubtest(t *testing.T) *System { + return &System{ + t: t, + Ctx: s.Ctx, + Creds: s.Creds, + VFS: s.VFS, + mns: s.mns, + Root: s.Root, + } +} + +// WithTemporaryContext constructs a temporary test system with a new context +// ctx. The temporary system borrows all resources and references from the +// original system. The returned temporary system must not outlive the original +// system, and should not be destroyed via System.Destroy. +func (s *System) WithTemporaryContext(ctx context.Context) *System { + return &System{ + t: s.t, + Ctx: ctx, + Creds: s.Creds, + VFS: s.VFS, + mns: s.mns, + Root: s.Root, + } +} + // Destroy release resources associated with a test system. func (s *System) Destroy() { - s.root.DecRef() + s.Root.DecRef() s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem. } @@ -87,18 +122,18 @@ func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) { // PathOpAtRoot constructs a PathOperation with the given path from // the root of the filesystem. -func (s *System) PathOpAtRoot(path string) vfs.PathOperation { - return vfs.PathOperation{ - Root: s.root, - Start: s.root, +func (s *System) PathOpAtRoot(path string) *vfs.PathOperation { + return &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, Path: fspath.Parse(path), } } // GetDentryOrDie attempts to resolve a dentry referred to by the // provided path operation. If unsuccessful, the test fails. -func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { - vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, &pop, &vfs.GetDentryOptions{}) +func (s *System) GetDentryOrDie(pop *vfs.PathOperation) vfs.VirtualDentry { + vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, pop, &vfs.GetDentryOptions{}) if err != nil { s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err) } @@ -108,14 +143,8 @@ func (s *System) GetDentryOrDie(pop vfs.PathOperation) vfs.VirtualDentry { // DirentType is an alias for values for linux_dirent64.d_type. type DirentType = uint8 -// AssertDirectoryContains verifies that a directory at pop contains the entries -// specified. AssertDirectoryContains implicitly checks for "." and "..", these -// need not be included in entries. -func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[string]DirentType) { - // Also implicitly check for "." and "..". - entries["."] = linux.DT_DIR - entries[".."] = linux.DT_DIR - +// ListDirents lists the Dirents for a directory at pop. +func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY}) if err != nil { s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) @@ -126,12 +155,52 @@ func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[str if err := fd.IterDirents(s.Ctx, collector); err != nil { s.t.Fatalf("IterDirent failed: %v", err) } + return collector +} + +// AssertAllDirentTypes verifies that the set of dirents in collector contains +// exactly the specified set of expected entries. AssertAllDirentTypes respects +// collector.skipDots, and implicitly checks for "." and ".." accordingly. +func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) { + // Also implicitly check for "." and "..", if enabled. + if !collector.skipDots { + expected["."] = linux.DT_DIR + expected[".."] = linux.DT_DIR + } - collectedEntries := make(map[string]DirentType) + dentryTypes := make(map[string]DirentType) + collector.mu.Lock() for _, dirent := range collector.dirents { - collectedEntries[dirent.Name] = dirent.Type + dentryTypes[dirent.Name] = dirent.Type } - if diff := cmp.Diff(entries, collectedEntries); diff != "" { + collector.mu.Unlock() + if diff := cmp.Diff(expected, dentryTypes); diff != "" { + s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff) + } +} + +// AssertDirentOffsets verifies that collector contains at least the entries +// specified in expected, with the given NextOff field. Entries specified in +// expected but missing from collector result in failure. Extra entries in +// collector are ignored. AssertDirentOffsets respects collector.skipDots, and +// implicitly checks for "." and ".." accordingly. +func (s *System) AssertDirentOffsets(collector *DirentCollector, expected map[string]int64) { + // Also implicitly check for "." and "..", if enabled. + if !collector.skipDots { + expected["."] = 1 + expected[".."] = 2 + } + + dentryNextOffs := make(map[string]int64) + collector.mu.Lock() + for _, dirent := range collector.dirents { + // Ignore extra entries in dentries that are not in expected. + if _, ok := expected[dirent.Name]; ok { + dentryNextOffs[dirent.Name] = dirent.NextOff + } + } + collector.mu.Unlock() + if diff := cmp.Diff(expected, dentryNextOffs); diff != "" { s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff) } } @@ -141,16 +210,29 @@ func (s *System) AssertDirectoryContains(pop *vfs.PathOperation, entries map[str // all dirents emitted by the callback. type DirentCollector struct { mu sync.Mutex - dirents map[string]vfs.Dirent + order []*vfs.Dirent + dirents map[string]*vfs.Dirent + // When the collector is used in various Assert* functions, should "." and + // ".." be implicitly checked? + skipDots bool +} + +// SkipDotsChecks enables or disables the implicit checks on "." and ".." when +// the collector is used in various Assert* functions. Note that "." and ".." +// are still collected if passed to d.Handle, so the caller should only disable +// the checks when they aren't expected. +func (d *DirentCollector) SkipDotsChecks(value bool) { + d.skipDots = value } // Handle implements vfs.IterDirentsCallback.Handle. func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { d.mu.Lock() if d.dirents == nil { - d.dirents = make(map[string]vfs.Dirent) + d.dirents = make(map[string]*vfs.Dirent) } - d.dirents[dirent.Name] = dirent + d.order = append(d.order, &dirent) + d.dirents[dirent.Name] = &dirent d.mu.Unlock() return true } @@ -176,3 +258,24 @@ func (d *DirentCollector) Contains(name string, typ uint8) error { } return nil } + +// Dirents returns all dirents discovered by this collector. +func (d *DirentCollector) Dirents() map[string]*vfs.Dirent { + d.mu.Lock() + dirents := make(map[string]*vfs.Dirent) + for n, d := range d.dirents { + dirents[n] = d + } + d.mu.Unlock() + return dirents +} + +// OrderedDirents returns an ordered list of dirents as discovered by this +// collector. +func (d *DirentCollector) OrderedDirents() []*vfs.Dirent { + d.mu.Lock() + dirents := make([]*vfs.Dirent, len(d.order)) + copy(dirents, d.order) + d.mu.Unlock() + return dirents +} -- cgit v1.2.3 From 18a7e1309decb9bc09879e337adbc00f81d420c5 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 24 Jan 2020 17:06:30 -0800 Subject: Add support for device special files to VFS2 tmpfs. PiperOrigin-RevId: 291471892 --- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/device_file.go | 39 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 43 +++++++++++++++++++--------------- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 30 +++++++++++++++++++----- 4 files changed, 88 insertions(+), 25 deletions(-) create mode 100644 pkg/sentry/fsimpl/tmpfs/device_file.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 7601c7c04..691476b4f 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -20,6 +20,7 @@ go_library( name = "tmpfs", srcs = [ "dentry_list.go", + "device_file.go", "directory.go", "filesystem.go", "named_pipe.go", diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go new file mode 100644 index 000000000..84b181b90 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -0,0 +1,39 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type deviceFile struct { + inode inode + kind vfs.DeviceKind + major uint32 + minor uint32 +} + +func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { + file := &deviceFile{ + kind: kind, + major: major, + minor: minor, + } + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index a9f66a42a..d726f03c5 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -228,23 +228,26 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + var childInode *inode switch opts.Mode.FileType() { case 0, linux.S_IFREG: - child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil + childInode = fs.newRegularFile(rp.Credentials(), opts.Mode) case linux.S_IFIFO: - child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) - return nil - case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK: + childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode) + case linux.S_IFBLK: + childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) + case linux.S_IFCHR: + childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) + case linux.S_IFSOCK: // Not yet supported. return syserror.EPERM default: return syserror.EINVAL } + child := fs.newDentry(childInode) + parent.vfsd.InsertChild(&child.vfsd, name) + parent.inode.impl.(*directory).childList.PushBack(child) + return nil }) } @@ -264,7 +267,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return nil, err } - return d.open(ctx, rp, opts.Flags, false /* afterCreate */) + return d.open(ctx, rp, &opts, false /* afterCreate */) } mustCreate := opts.Flags&linux.O_EXCL != 0 @@ -279,7 +282,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - return start.open(ctx, rp, opts.Flags, false /* afterCreate */) + return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: parent, err := walkParentDirLocked(rp, start) @@ -313,7 +316,7 @@ afterTrailingSymlink: child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) parent.vfsd.InsertChild(&child.vfsd, name) parent.inode.impl.(*directory).childList.PushBack(child) - return child.open(ctx, rp, opts.Flags, true) + return child.open(ctx, rp, &opts, true) } if err != nil { return nil, err @@ -327,11 +330,11 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - return child.open(ctx, rp, opts.Flags, false) + return child.open(ctx, rp, &opts, false) } -func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts.Flags) if !afterCreate { if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { return nil, err @@ -340,10 +343,10 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD - if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } - if flags&linux.O_TRUNC != 0 { + if opts.Flags&linux.O_TRUNC != 0 { impl.mu.Lock() impl.data.Truncate(0, impl.memFile) atomic.StoreUint64(&impl.size, 0) @@ -356,7 +359,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, return nil, syserror.EISDIR } var fd directoryFD - if err := fd.vfsfd.Init(&fd, flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -364,7 +367,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags) + return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) + case *deviceFile: + return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 1d4889c89..515f033f2 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -149,6 +149,10 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds + // Only meaningful for device special files. + rdevMajor uint32 + rdevMinor uint32 + impl interface{} // immutable } @@ -269,6 +273,15 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Blocks = allocatedBlocksForSize(stat.Size) case *namedPipe: stat.Mode |= linux.S_IFIFO + case *deviceFile: + switch impl.kind { + case vfs.BlockDevice: + stat.Mode |= linux.S_IFBLK + case vfs.CharDevice: + stat.Mode |= linux.S_IFCHR + } + stat.RdevMajor = impl.major + stat.RdevMinor = impl.minor default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } @@ -309,12 +322,8 @@ func (i *inode) setStat(stat linux.Statx) error { } case *directory: return syserror.EISDIR - case *symlink: - return syserror.EINVAL - case *namedPipe: - // Nothing. default: - panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + return syserror.EINVAL } } if mask&linux.STATX_ATIME != 0 { @@ -353,13 +362,22 @@ func allocatedBlocksForSize(size uint64) uint64 { } func (i *inode) direntType() uint8 { - switch i.impl.(type) { + switch impl := i.impl.(type) { case *regularFile: return linux.DT_REG case *directory: return linux.DT_DIR case *symlink: return linux.DT_LNK + case *deviceFile: + switch impl.kind { + case vfs.BlockDevice: + return linux.DT_BLK + case vfs.CharDevice: + return linux.DT_CHR + default: + panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) + } default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } -- cgit v1.2.3 From d29e59af9fbd420e34378bcbf7ae543134070217 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Jan 2020 10:04:07 -0800 Subject: Standardize on tools directory. PiperOrigin-RevId: 291745021 --- .bazelrc | 8 +- BUILD | 49 ++++++- benchmarks/defs.bzl | 18 --- benchmarks/harness/BUILD | 74 +++++----- benchmarks/harness/machine_producers/BUILD | 4 +- benchmarks/runner/BUILD | 24 ++-- benchmarks/tcp/BUILD | 3 +- benchmarks/workloads/ab/BUILD | 19 ++- benchmarks/workloads/absl/BUILD | 19 ++- benchmarks/workloads/curl/BUILD | 2 +- benchmarks/workloads/ffmpeg/BUILD | 2 +- benchmarks/workloads/fio/BUILD | 19 ++- benchmarks/workloads/httpd/BUILD | 2 +- benchmarks/workloads/iperf/BUILD | 19 ++- benchmarks/workloads/netcat/BUILD | 2 +- benchmarks/workloads/nginx/BUILD | 2 +- benchmarks/workloads/node/BUILD | 2 +- benchmarks/workloads/node_template/BUILD | 2 +- benchmarks/workloads/redis/BUILD | 2 +- benchmarks/workloads/redisbenchmark/BUILD | 19 ++- benchmarks/workloads/ruby/BUILD | 2 +- benchmarks/workloads/ruby_template/BUILD | 2 +- benchmarks/workloads/sleep/BUILD | 2 +- benchmarks/workloads/sysbench/BUILD | 19 ++- benchmarks/workloads/syscall/BUILD | 19 ++- benchmarks/workloads/tensorflow/BUILD | 2 +- benchmarks/workloads/true/BUILD | 2 +- pkg/abi/BUILD | 3 +- pkg/abi/linux/BUILD | 6 +- pkg/amutex/BUILD | 6 +- pkg/atomicbitops/BUILD | 6 +- pkg/binary/BUILD | 6 +- pkg/bits/BUILD | 6 +- pkg/bpf/BUILD | 6 +- pkg/compressio/BUILD | 6 +- pkg/control/client/BUILD | 3 +- pkg/control/server/BUILD | 3 +- pkg/cpuid/BUILD | 8 +- pkg/eventchannel/BUILD | 16 +-- pkg/fd/BUILD | 6 +- pkg/fdchannel/BUILD | 8 +- pkg/fdnotifier/BUILD | 3 +- pkg/flipcall/BUILD | 8 +- pkg/fspath/BUILD | 13 +- pkg/gate/BUILD | 4 +- pkg/goid/BUILD | 6 +- pkg/ilist/BUILD | 6 +- pkg/linewriter/BUILD | 6 +- pkg/log/BUILD | 6 +- pkg/memutil/BUILD | 3 +- pkg/metric/BUILD | 23 +-- pkg/p9/BUILD | 6 +- pkg/p9/p9test/BUILD | 6 +- pkg/procid/BUILD | 8 +- pkg/rand/BUILD | 3 +- pkg/refs/BUILD | 6 +- pkg/seccomp/BUILD | 6 +- pkg/secio/BUILD | 6 +- pkg/segment/test/BUILD | 6 +- pkg/sentry/BUILD | 2 + pkg/sentry/arch/BUILD | 20 +-- pkg/sentry/context/BUILD | 3 +- pkg/sentry/context/contexttest/BUILD | 3 +- pkg/sentry/control/BUILD | 8 +- pkg/sentry/device/BUILD | 6 +- pkg/sentry/fs/BUILD | 6 +- pkg/sentry/fs/anon/BUILD | 3 +- pkg/sentry/fs/dev/BUILD | 3 +- pkg/sentry/fs/fdpipe/BUILD | 6 +- pkg/sentry/fs/filetest/BUILD | 3 +- pkg/sentry/fs/fsutil/BUILD | 6 +- pkg/sentry/fs/gofer/BUILD | 6 +- pkg/sentry/fs/host/BUILD | 6 +- pkg/sentry/fs/lock/BUILD | 6 +- pkg/sentry/fs/proc/BUILD | 6 +- pkg/sentry/fs/proc/device/BUILD | 3 +- pkg/sentry/fs/proc/seqfile/BUILD | 6 +- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/sys/BUILD | 3 +- pkg/sentry/fs/timerfd/BUILD | 3 +- pkg/sentry/fs/tmpfs/BUILD | 6 +- pkg/sentry/fs/tty/BUILD | 6 +- pkg/sentry/fsimpl/ext/BUILD | 6 +- pkg/sentry/fsimpl/ext/benchmark/BUILD | 2 +- pkg/sentry/fsimpl/ext/disklayout/BUILD | 6 +- pkg/sentry/fsimpl/kernfs/BUILD | 6 +- pkg/sentry/fsimpl/proc/BUILD | 8 +- pkg/sentry/fsimpl/sys/BUILD | 6 +- pkg/sentry/fsimpl/testutil/BUILD | 5 +- pkg/sentry/fsimpl/tmpfs/BUILD | 8 +- pkg/sentry/hostcpu/BUILD | 6 +- pkg/sentry/hostmm/BUILD | 3 +- pkg/sentry/inet/BUILD | 3 +- pkg/sentry/kernel/BUILD | 24 +--- pkg/sentry/kernel/auth/BUILD | 3 +- pkg/sentry/kernel/contexttest/BUILD | 3 +- pkg/sentry/kernel/epoll/BUILD | 6 +- pkg/sentry/kernel/eventfd/BUILD | 6 +- pkg/sentry/kernel/fasync/BUILD | 3 +- pkg/sentry/kernel/futex/BUILD | 6 +- pkg/sentry/kernel/memevent/BUILD | 20 +-- pkg/sentry/kernel/pipe/BUILD | 6 +- pkg/sentry/kernel/sched/BUILD | 6 +- pkg/sentry/kernel/semaphore/BUILD | 6 +- pkg/sentry/kernel/shm/BUILD | 3 +- pkg/sentry/kernel/signalfd/BUILD | 5 +- pkg/sentry/kernel/time/BUILD | 3 +- pkg/sentry/limits/BUILD | 6 +- pkg/sentry/loader/BUILD | 4 +- pkg/sentry/memmap/BUILD | 6 +- pkg/sentry/mm/BUILD | 6 +- pkg/sentry/pgalloc/BUILD | 6 +- pkg/sentry/platform/BUILD | 3 +- pkg/sentry/platform/interrupt/BUILD | 6 +- pkg/sentry/platform/kvm/BUILD | 6 +- pkg/sentry/platform/kvm/testutil/BUILD | 3 +- pkg/sentry/platform/ptrace/BUILD | 3 +- pkg/sentry/platform/ring0/BUILD | 3 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 16 +-- pkg/sentry/platform/safecopy/BUILD | 6 +- pkg/sentry/safemem/BUILD | 6 +- pkg/sentry/sighandling/BUILD | 3 +- pkg/sentry/socket/BUILD | 3 +- pkg/sentry/socket/control/BUILD | 3 +- pkg/sentry/socket/hostinet/BUILD | 3 +- pkg/sentry/socket/netfilter/BUILD | 3 +- pkg/sentry/socket/netlink/BUILD | 3 +- pkg/sentry/socket/netlink/port/BUILD | 6 +- pkg/sentry/socket/netlink/route/BUILD | 3 +- pkg/sentry/socket/netlink/uevent/BUILD | 3 +- pkg/sentry/socket/netstack/BUILD | 3 +- pkg/sentry/socket/unix/BUILD | 3 +- pkg/sentry/socket/unix/transport/BUILD | 3 +- pkg/sentry/state/BUILD | 3 +- pkg/sentry/strace/BUILD | 20 +-- pkg/sentry/syscalls/BUILD | 3 +- pkg/sentry/syscalls/linux/BUILD | 3 +- pkg/sentry/time/BUILD | 6 +- pkg/sentry/unimpl/BUILD | 21 +-- pkg/sentry/uniqueid/BUILD | 3 +- pkg/sentry/usage/BUILD | 5 +- pkg/sentry/usermem/BUILD | 7 +- pkg/sentry/vfs/BUILD | 8 +- pkg/sentry/watchdog/BUILD | 3 +- pkg/sleep/BUILD | 6 +- pkg/state/BUILD | 17 +-- pkg/state/statefile/BUILD | 6 +- pkg/sync/BUILD | 6 +- pkg/sync/atomicptrtest/BUILD | 6 +- pkg/sync/seqatomictest/BUILD | 6 +- pkg/syserr/BUILD | 3 +- pkg/syserror/BUILD | 4 +- pkg/tcpip/BUILD | 6 +- pkg/tcpip/adapters/gonet/BUILD | 6 +- pkg/tcpip/buffer/BUILD | 6 +- pkg/tcpip/checker/BUILD | 3 +- pkg/tcpip/hash/jenkins/BUILD | 6 +- pkg/tcpip/header/BUILD | 6 +- pkg/tcpip/iptables/BUILD | 3 +- pkg/tcpip/link/channel/BUILD | 3 +- pkg/tcpip/link/fdbased/BUILD | 6 +- pkg/tcpip/link/loopback/BUILD | 3 +- pkg/tcpip/link/muxed/BUILD | 6 +- pkg/tcpip/link/rawfile/BUILD | 3 +- pkg/tcpip/link/sharedmem/BUILD | 6 +- pkg/tcpip/link/sharedmem/pipe/BUILD | 6 +- pkg/tcpip/link/sharedmem/queue/BUILD | 6 +- pkg/tcpip/link/sniffer/BUILD | 3 +- pkg/tcpip/link/tun/BUILD | 3 +- pkg/tcpip/link/waitable/BUILD | 6 +- pkg/tcpip/network/BUILD | 2 +- pkg/tcpip/network/arp/BUILD | 4 +- pkg/tcpip/network/fragmentation/BUILD | 6 +- pkg/tcpip/network/hash/BUILD | 3 +- pkg/tcpip/network/ipv4/BUILD | 4 +- pkg/tcpip/network/ipv6/BUILD | 6 +- pkg/tcpip/ports/BUILD | 6 +- pkg/tcpip/sample/tun_tcp_connect/BUILD | 2 +- pkg/tcpip/sample/tun_tcp_echo/BUILD | 2 +- pkg/tcpip/seqnum/BUILD | 3 +- pkg/tcpip/stack/BUILD | 6 +- pkg/tcpip/transport/icmp/BUILD | 3 +- pkg/tcpip/transport/packet/BUILD | 3 +- pkg/tcpip/transport/raw/BUILD | 3 +- pkg/tcpip/transport/tcp/BUILD | 4 +- pkg/tcpip/transport/tcp/testing/context/BUILD | 3 +- pkg/tcpip/transport/tcpconntrack/BUILD | 4 +- pkg/tcpip/transport/udp/BUILD | 4 +- pkg/tmutex/BUILD | 6 +- pkg/unet/BUILD | 6 +- pkg/urpc/BUILD | 6 +- pkg/waiter/BUILD | 6 +- runsc/BUILD | 27 ++-- runsc/boot/BUILD | 5 +- runsc/boot/filter/BUILD | 3 +- runsc/boot/platforms/BUILD | 3 +- runsc/cgroup/BUILD | 5 +- runsc/cmd/BUILD | 5 +- runsc/console/BUILD | 3 +- runsc/container/BUILD | 5 +- runsc/container/test_app/BUILD | 4 +- runsc/criutil/BUILD | 3 +- runsc/dockerutil/BUILD | 3 +- runsc/fsgofer/BUILD | 9 +- runsc/fsgofer/filter/BUILD | 3 +- runsc/sandbox/BUILD | 3 +- runsc/specutils/BUILD | 5 +- runsc/testutil/BUILD | 3 +- runsc/version_test.sh | 2 +- scripts/common.sh | 6 +- scripts/common_bazel.sh | 99 ------------- scripts/common_build.sh | 99 +++++++++++++ test/BUILD | 45 +----- test/e2e/BUILD | 5 +- test/image/BUILD | 5 +- test/iptables/BUILD | 5 +- test/iptables/runner/BUILD | 12 +- test/root/BUILD | 5 +- test/root/testdata/BUILD | 3 +- test/runtimes/BUILD | 4 +- test/runtimes/build_defs.bzl | 5 +- test/runtimes/images/proctor/BUILD | 4 +- test/syscalls/BUILD | 2 +- test/syscalls/build_defs.bzl | 6 +- test/syscalls/gtest/BUILD | 7 +- test/syscalls/linux/BUILD | 23 ++- test/syscalls/linux/arch_prctl.cc | 2 + test/syscalls/linux/rseq/BUILD | 5 +- .../linux/udp_socket_errqueue_test_case.cc | 4 + test/uds/BUILD | 3 +- test/util/BUILD | 27 ++-- test/util/save_util_linux.cc | 4 + test/util/save_util_other.cc | 4 + test/util/test_util_runfiles.cc | 4 + tools/BUILD | 3 + tools/build/BUILD | 10 ++ tools/build/defs.bzl | 91 ++++++++++++ tools/checkunsafe/BUILD | 3 +- tools/defs.bzl | 154 +++++++++++++++++++++ tools/go_generics/BUILD | 2 +- tools/go_generics/globals/BUILD | 4 +- tools/go_generics/go_merge/BUILD | 2 +- tools/go_generics/rules_tests/BUILD | 2 +- tools/go_marshal/BUILD | 4 +- tools/go_marshal/README.md | 52 +------ tools/go_marshal/analysis/BUILD | 5 +- tools/go_marshal/defs.bzl | 112 ++------------- tools/go_marshal/gomarshal/BUILD | 6 +- tools/go_marshal/gomarshal/generator.go | 20 ++- tools/go_marshal/gomarshal/generator_tests.go | 6 +- tools/go_marshal/main.go | 11 +- tools/go_marshal/marshal/BUILD | 5 +- tools/go_marshal/test/BUILD | 7 +- tools/go_marshal/test/external/BUILD | 6 +- tools/go_stateify/BUILD | 2 +- tools/go_stateify/defs.bzl | 79 +---------- tools/images/BUILD | 2 +- tools/images/defs.bzl | 6 +- tools/issue_reviver/BUILD | 2 +- tools/issue_reviver/github/BUILD | 3 +- tools/issue_reviver/reviver/BUILD | 5 +- tools/workspace_status.sh | 2 +- vdso/BUILD | 33 ++--- 264 files changed, 1012 insertions(+), 1380 deletions(-) delete mode 100644 benchmarks/defs.bzl delete mode 100755 scripts/common_bazel.sh create mode 100755 scripts/common_build.sh create mode 100644 tools/BUILD create mode 100644 tools/build/BUILD create mode 100644 tools/build/defs.bzl create mode 100644 tools/defs.bzl (limited to 'pkg/sentry/fsimpl') diff --git a/.bazelrc b/.bazelrc index 9c35c5e7b..ef214bcfa 100644 --- a/.bazelrc +++ b/.bazelrc @@ -30,10 +30,10 @@ build:remote --auth_scope="https://www.googleapis.com/auth/cloud-source-tools" # Add a custom platform and toolchain that builds in a privileged docker # container, which is required by our syscall tests. -build:remote --host_platform=//test:rbe_ubuntu1604 -build:remote --extra_toolchains=//test:cc-toolchain-clang-x86_64-default -build:remote --extra_execution_platforms=//test:rbe_ubuntu1604 -build:remote --platforms=//test:rbe_ubuntu1604 +build:remote --host_platform=//:rbe_ubuntu1604 +build:remote --extra_toolchains=//:cc-toolchain-clang-x86_64-default +build:remote --extra_execution_platforms=//:rbe_ubuntu1604 +build:remote --platforms=//:rbe_ubuntu1604 build:remote --crosstool_top=@rbe_default//cc:toolchain build:remote --jobs=50 build:remote --remote_timeout=3600 diff --git a/BUILD b/BUILD index 76286174f..5fd929378 100644 --- a/BUILD +++ b/BUILD @@ -1,8 +1,8 @@ -package(licenses = ["notice"]) # Apache 2.0 - load("@io_bazel_rules_go//go:def.bzl", "go_path", "nogo") load("@bazel_gazelle//:def.bzl", "gazelle") +package(licenses = ["notice"]) + # The sandbox filegroup is used for sandbox-internal dependencies. package_group( name = "sandbox", @@ -49,9 +49,52 @@ gazelle(name = "gazelle") # live in the tools subdirectory (unless they are standard). nogo( name = "nogo", - config = "tools/nogo.js", + config = "//tools:nogo.js", visibility = ["//visibility:public"], deps = [ "//tools/checkunsafe", ], ) + +# We need to define a bazel platform and toolchain to specify dockerPrivileged +# and dockerRunAsRoot options, they are required to run tests on the RBE +# cluster in Kokoro. +alias( + name = "rbe_ubuntu1604", + actual = ":rbe_ubuntu1604_r346485", +) + +platform( + name = "rbe_ubuntu1604_r346485", + constraint_values = [ + "@bazel_tools//platforms:x86_64", + "@bazel_tools//platforms:linux", + "@bazel_tools//tools/cpp:clang", + "@bazel_toolchains//constraints:xenial", + "@bazel_toolchains//constraints/sanitizers:support_msan", + ], + remote_execution_properties = """ + properties: { + name: "container-image" + value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50" + } + properties: { + name: "dockerAddCapabilities" + value: "SYS_ADMIN" + } + properties: { + name: "dockerPrivileged" + value: "true" + } + """, +) + +toolchain( + name = "cc-toolchain-clang-x86_64-default", + exec_compatible_with = [ + ], + target_compatible_with = [ + ], + toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) diff --git a/benchmarks/defs.bzl b/benchmarks/defs.bzl deleted file mode 100644 index 79e6cdbc8..000000000 --- a/benchmarks/defs.bzl +++ /dev/null @@ -1,18 +0,0 @@ -"""Provides python helper functions.""" - -load("@pydeps//:requirements.bzl", _requirement = "requirement") - -def filter_deps(deps = None): - if deps == None: - deps = [] - return [dep for dep in deps if dep] - -def py_library(deps = None, **kwargs): - return native.py_library(deps = filter_deps(deps), **kwargs) - -def py_test(deps = None, **kwargs): - return native.py_test(deps = filter_deps(deps), **kwargs) - -def requirement(name, direct = True): - """ requirement returns the required dependency. """ - return _requirement(name) diff --git a/benchmarks/harness/BUILD b/benchmarks/harness/BUILD index 081a74243..52d4e42f8 100644 --- a/benchmarks/harness/BUILD +++ b/benchmarks/harness/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -25,16 +25,16 @@ py_library( srcs = ["container.py"], deps = [ "//benchmarks/workloads", - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("ptyprocess", False), + py_requirement("requests", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) @@ -47,17 +47,17 @@ py_library( "//benchmarks/harness:ssh_connection", "//benchmarks/harness:tunnel_dispatcher", "//benchmarks/harness/machine_mocks", - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("six", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("ptyprocess", False), + py_requirement("requests", False), + py_requirement("six", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) @@ -66,10 +66,10 @@ py_library( srcs = ["ssh_connection.py"], deps = [ "//benchmarks/harness", - requirement("bcrypt", False), - requirement("cffi", True), - requirement("paramiko", True), - requirement("cryptography", False), + py_requirement("bcrypt", False), + py_requirement("cffi", True), + py_requirement("paramiko", True), + py_requirement("cryptography", False), ], ) @@ -77,16 +77,16 @@ py_library( name = "tunnel_dispatcher", srcs = ["tunnel_dispatcher.py"], deps = [ - requirement("asn1crypto", False), - requirement("chardet", False), - requirement("certifi", False), - requirement("docker", True), - requirement("docker-pycreds", False), - requirement("idna", False), - requirement("pexpect", True), - requirement("ptyprocess", False), - requirement("requests", False), - requirement("urllib3", False), - requirement("websocket-client", False), + py_requirement("asn1crypto", False), + py_requirement("chardet", False), + py_requirement("certifi", False), + py_requirement("docker", True), + py_requirement("docker-pycreds", False), + py_requirement("idna", False), + py_requirement("pexpect", True), + py_requirement("ptyprocess", False), + py_requirement("requests", False), + py_requirement("urllib3", False), + py_requirement("websocket-client", False), ], ) diff --git a/benchmarks/harness/machine_producers/BUILD b/benchmarks/harness/machine_producers/BUILD index c4e943882..48ea0ef39 100644 --- a/benchmarks/harness/machine_producers/BUILD +++ b/benchmarks/harness/machine_producers/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -31,7 +31,7 @@ py_library( deps = [ "//benchmarks/harness:machine", "//benchmarks/harness/machine_producers:machine_producer", - requirement("PyYAML", False), + py_requirement("PyYAML", False), ], ) diff --git a/benchmarks/runner/BUILD b/benchmarks/runner/BUILD index e1b2ea550..fae0ca800 100644 --- a/benchmarks/runner/BUILD +++ b/benchmarks/runner/BUILD @@ -1,4 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") +load("//tools:defs.bzl", "py_library", "py_requirement", "py_test") package(licenses = ["notice"]) @@ -28,7 +28,7 @@ py_library( "//benchmarks/suites:startup", "//benchmarks/suites:sysbench", "//benchmarks/suites:syscall", - requirement("click", True), + py_requirement("click", True), ], ) @@ -36,7 +36,7 @@ py_library( name = "commands", srcs = ["commands.py"], deps = [ - requirement("click", True), + py_requirement("click", True), ], ) @@ -50,14 +50,14 @@ py_test( ], deps = [ ":runner", - requirement("click", True), - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("click", True), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/tcp/BUILD b/benchmarks/tcp/BUILD index 735d7127f..d5e401acc 100644 --- a/benchmarks/tcp/BUILD +++ b/benchmarks/tcp/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("@rules_cc//cc:defs.bzl", "cc_binary") +load("//tools:defs.bzl", "cc_binary", "go_binary") package(licenses = ["notice"]) diff --git a/benchmarks/workloads/ab/BUILD b/benchmarks/workloads/ab/BUILD index 4fc0ab735..4dd91ceb3 100644 --- a/benchmarks/workloads/ab/BUILD +++ b/benchmarks/workloads/ab/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":ab", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/absl/BUILD b/benchmarks/workloads/absl/BUILD index 61e010096..55dae3baa 100644 --- a/benchmarks/workloads/absl/BUILD +++ b/benchmarks/workloads/absl/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":absl", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/curl/BUILD b/benchmarks/workloads/curl/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/curl/BUILD +++ b/benchmarks/workloads/curl/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/ffmpeg/BUILD b/benchmarks/workloads/ffmpeg/BUILD index be472dfb2..7c41ba631 100644 --- a/benchmarks/workloads/ffmpeg/BUILD +++ b/benchmarks/workloads/ffmpeg/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/fio/BUILD b/benchmarks/workloads/fio/BUILD index de257adad..7b78e8e75 100644 --- a/benchmarks/workloads/fio/BUILD +++ b/benchmarks/workloads/fio/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":fio", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/httpd/BUILD b/benchmarks/workloads/httpd/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/httpd/BUILD +++ b/benchmarks/workloads/httpd/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/iperf/BUILD b/benchmarks/workloads/iperf/BUILD index 8832a996c..570f40148 100644 --- a/benchmarks/workloads/iperf/BUILD +++ b/benchmarks/workloads/iperf/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":iperf", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/netcat/BUILD b/benchmarks/workloads/netcat/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/netcat/BUILD +++ b/benchmarks/workloads/netcat/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/nginx/BUILD b/benchmarks/workloads/nginx/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/nginx/BUILD +++ b/benchmarks/workloads/nginx/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/node/BUILD b/benchmarks/workloads/node/BUILD index 71cd9f519..bfcf78cf9 100644 --- a/benchmarks/workloads/node/BUILD +++ b/benchmarks/workloads/node/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/node_template/BUILD b/benchmarks/workloads/node_template/BUILD index ca996f068..e142f082a 100644 --- a/benchmarks/workloads/node_template/BUILD +++ b/benchmarks/workloads/node_template/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/redis/BUILD b/benchmarks/workloads/redis/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/redis/BUILD +++ b/benchmarks/workloads/redis/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/redisbenchmark/BUILD b/benchmarks/workloads/redisbenchmark/BUILD index f5994a815..f472a4443 100644 --- a/benchmarks/workloads/redisbenchmark/BUILD +++ b/benchmarks/workloads/redisbenchmark/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":redisbenchmark", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/ruby/BUILD b/benchmarks/workloads/ruby/BUILD index e37d77804..a3be4fe92 100644 --- a/benchmarks/workloads/ruby/BUILD +++ b/benchmarks/workloads/ruby/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/ruby_template/BUILD b/benchmarks/workloads/ruby_template/BUILD index 27f7c0c46..59443b14a 100644 --- a/benchmarks/workloads/ruby_template/BUILD +++ b/benchmarks/workloads/ruby_template/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/sleep/BUILD b/benchmarks/workloads/sleep/BUILD index eb0fb6165..a70873065 100644 --- a/benchmarks/workloads/sleep/BUILD +++ b/benchmarks/workloads/sleep/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/sysbench/BUILD b/benchmarks/workloads/sysbench/BUILD index fd2f8f03d..3834af7ed 100644 --- a/benchmarks/workloads/sysbench/BUILD +++ b/benchmarks/workloads/sysbench/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":sysbench", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/syscall/BUILD b/benchmarks/workloads/syscall/BUILD index 5100cbb21..dba4bb1e7 100644 --- a/benchmarks/workloads/syscall/BUILD +++ b/benchmarks/workloads/syscall/BUILD @@ -1,5 +1,4 @@ -load("//benchmarks:defs.bzl", "py_library", "py_test", "requirement") -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar", "py_library", "py_requirement", "py_test") package( default_visibility = ["//benchmarks:__subpackages__"], @@ -17,14 +16,14 @@ py_test( python_version = "PY3", deps = [ ":syscall", - requirement("attrs", False), - requirement("atomicwrites", False), - requirement("more-itertools", False), - requirement("pathlib2", False), - requirement("pluggy", False), - requirement("py", False), - requirement("pytest", True), - requirement("six", False), + py_requirement("attrs", False), + py_requirement("atomicwrites", False), + py_requirement("more-itertools", False), + py_requirement("pathlib2", False), + py_requirement("pluggy", False), + py_requirement("py", False), + py_requirement("pytest", True), + py_requirement("six", False), ], ) diff --git a/benchmarks/workloads/tensorflow/BUILD b/benchmarks/workloads/tensorflow/BUILD index 026c3b316..a7b7742f4 100644 --- a/benchmarks/workloads/tensorflow/BUILD +++ b/benchmarks/workloads/tensorflow/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/benchmarks/workloads/true/BUILD b/benchmarks/workloads/true/BUILD index 221c4b9a7..eba23d325 100644 --- a/benchmarks/workloads/true/BUILD +++ b/benchmarks/workloads/true/BUILD @@ -1,4 +1,4 @@ -load("@rules_pkg//:pkg.bzl", "pkg_tar") +load("//tools:defs.bzl", "pkg_tar") package( default_visibility = ["//benchmarks:__subpackages__"], diff --git a/pkg/abi/BUILD b/pkg/abi/BUILD index f5c08ea06..839f822eb 100644 --- a/pkg/abi/BUILD +++ b/pkg/abi/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,6 +9,5 @@ go_library( "abi_linux.go", "flag.go", ], - importpath = "gvisor.dev/gvisor/pkg/abi", visibility = ["//:sandbox"], ) diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index 716ff22d2..1f3c0c687 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") # Package linux contains the constants and types needed to interface with a # Linux kernel. It should be used instead of syscall or golang.org/x/sys/unix @@ -60,7 +59,6 @@ go_library( "wait.go", "xattr.go", ], - importpath = "gvisor.dev/gvisor/pkg/abi/linux", visibility = ["//visibility:public"], deps = [ "//pkg/abi", @@ -73,7 +71,7 @@ go_test( name = "linux_test", size = "small", srcs = ["netfilter_test.go"], - embed = [":linux"], + library = ":linux", deps = [ "//pkg/binary", ], diff --git a/pkg/amutex/BUILD b/pkg/amutex/BUILD index d99e37b40..9612f072e 100644 --- a/pkg/amutex/BUILD +++ b/pkg/amutex/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "amutex", srcs = ["amutex.go"], - importpath = "gvisor.dev/gvisor/pkg/amutex", visibility = ["//:sandbox"], ) @@ -14,6 +12,6 @@ go_test( name = "amutex_test", size = "small", srcs = ["amutex_test.go"], - embed = [":amutex"], + library = ":amutex", deps = ["//pkg/sync"], ) diff --git a/pkg/atomicbitops/BUILD b/pkg/atomicbitops/BUILD index 6403c60c2..3948074ba 100644 --- a/pkg/atomicbitops/BUILD +++ b/pkg/atomicbitops/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "atomic_bitops_arm64.s", "atomic_bitops_common.go", ], - importpath = "gvisor.dev/gvisor/pkg/atomicbitops", visibility = ["//:sandbox"], ) @@ -19,6 +17,6 @@ go_test( name = "atomicbitops_test", size = "small", srcs = ["atomic_bitops_test.go"], - embed = [":atomicbitops"], + library = ":atomicbitops", deps = ["//pkg/sync"], ) diff --git a/pkg/binary/BUILD b/pkg/binary/BUILD index 543fb54bf..7ca2fda90 100644 --- a/pkg/binary/BUILD +++ b/pkg/binary/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "binary", srcs = ["binary.go"], - importpath = "gvisor.dev/gvisor/pkg/binary", visibility = ["//:sandbox"], ) @@ -14,5 +12,5 @@ go_test( name = "binary_test", size = "small", srcs = ["binary_test.go"], - embed = [":binary"], + library = ":binary", ) diff --git a/pkg/bits/BUILD b/pkg/bits/BUILD index 93b88a29a..63f4670d7 100644 --- a/pkg/bits/BUILD +++ b/pkg/bits/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) @@ -15,7 +14,6 @@ go_library( "uint64_arch_arm64_asm.s", "uint64_arch_generic.go", ], - importpath = "gvisor.dev/gvisor/pkg/bits", visibility = ["//:sandbox"], ) @@ -53,5 +51,5 @@ go_test( name = "bits_test", size = "small", srcs = ["uint64_test.go"], - embed = [":bits"], + library = ":bits", ) diff --git a/pkg/bpf/BUILD b/pkg/bpf/BUILD index fba5643e8..2a6977f85 100644 --- a/pkg/bpf/BUILD +++ b/pkg/bpf/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "interpreter.go", "program_builder.go", ], - importpath = "gvisor.dev/gvisor/pkg/bpf", visibility = ["//visibility:public"], deps = ["//pkg/abi/linux"], ) @@ -25,7 +23,7 @@ go_test( "interpreter_test.go", "program_builder_test.go", ], - embed = [":bpf"], + library = ":bpf", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/compressio/BUILD b/pkg/compressio/BUILD index 2bb581b18..1f75319a7 100644 --- a/pkg/compressio/BUILD +++ b/pkg/compressio/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "compressio", srcs = ["compressio.go"], - importpath = "gvisor.dev/gvisor/pkg/compressio", visibility = ["//:sandbox"], deps = [ "//pkg/binary", @@ -18,5 +16,5 @@ go_test( name = "compressio_test", size = "medium", srcs = ["compressio_test.go"], - embed = [":compressio"], + library = ":compressio", ) diff --git a/pkg/control/client/BUILD b/pkg/control/client/BUILD index 066d7b1a1..1b9e10ee7 100644 --- a/pkg/control/client/BUILD +++ b/pkg/control/client/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "client.go", ], - importpath = "gvisor.dev/gvisor/pkg/control/client", visibility = ["//:sandbox"], deps = [ "//pkg/unet", diff --git a/pkg/control/server/BUILD b/pkg/control/server/BUILD index adbd1e3f8..002d2ef44 100644 --- a/pkg/control/server/BUILD +++ b/pkg/control/server/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "server", srcs = ["server.go"], - importpath = "gvisor.dev/gvisor/pkg/control/server", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/pkg/cpuid/BUILD b/pkg/cpuid/BUILD index ed111fd2a..43a432190 100644 --- a/pkg/cpuid/BUILD +++ b/pkg/cpuid/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "cpu_amd64.s", "cpuid.go", ], - importpath = "gvisor.dev/gvisor/pkg/cpuid", visibility = ["//:sandbox"], deps = ["//pkg/log"], ) @@ -18,7 +16,7 @@ go_test( name = "cpuid_test", size = "small", srcs = ["cpuid_test.go"], - embed = [":cpuid"], + library = ":cpuid", ) go_test( @@ -27,6 +25,6 @@ go_test( srcs = [ "cpuid_parse_test.go", ], - embed = [":cpuid"], + library = ":cpuid", tags = ["manual"], ) diff --git a/pkg/eventchannel/BUILD b/pkg/eventchannel/BUILD index 9d68682c7..bee28b68d 100644 --- a/pkg/eventchannel/BUILD +++ b/pkg/eventchannel/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) @@ -10,7 +8,6 @@ go_library( "event.go", "rate.go", ], - importpath = "gvisor.dev/gvisor/pkg/eventchannel", visibility = ["//:sandbox"], deps = [ ":eventchannel_go_proto", @@ -24,22 +21,15 @@ go_library( ) proto_library( - name = "eventchannel_proto", + name = "eventchannel", srcs = ["event.proto"], visibility = ["//:sandbox"], ) -go_proto_library( - name = "eventchannel_go_proto", - importpath = "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto", - proto = ":eventchannel_proto", - visibility = ["//:sandbox"], -) - go_test( name = "eventchannel_test", srcs = ["event_test.go"], - embed = [":eventchannel"], + library = ":eventchannel", deps = [ "//pkg/sync", "@com_github_golang_protobuf//proto:go_default_library", diff --git a/pkg/fd/BUILD b/pkg/fd/BUILD index afa8f7659..872361546 100644 --- a/pkg/fd/BUILD +++ b/pkg/fd/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "fd", srcs = ["fd.go"], - importpath = "gvisor.dev/gvisor/pkg/fd", visibility = ["//visibility:public"], ) @@ -14,5 +12,5 @@ go_test( name = "fd_test", size = "small", srcs = ["fd_test.go"], - embed = [":fd"], + library = ":fd", ) diff --git a/pkg/fdchannel/BUILD b/pkg/fdchannel/BUILD index b0478c672..d9104ef02 100644 --- a/pkg/fdchannel/BUILD +++ b/pkg/fdchannel/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "fdchannel", srcs = ["fdchannel_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/fdchannel", visibility = ["//visibility:public"], ) @@ -14,6 +12,6 @@ go_test( name = "fdchannel_test", size = "small", srcs = ["fdchannel_test.go"], - embed = [":fdchannel"], + library = ":fdchannel", deps = ["//pkg/sync"], ) diff --git a/pkg/fdnotifier/BUILD b/pkg/fdnotifier/BUILD index 91a202a30..235dcc490 100644 --- a/pkg/fdnotifier/BUILD +++ b/pkg/fdnotifier/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "fdnotifier.go", "poll_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/fdnotifier", visibility = ["//:sandbox"], deps = [ "//pkg/sync", diff --git a/pkg/flipcall/BUILD b/pkg/flipcall/BUILD index 85bd83af1..9c5ad500b 100644 --- a/pkg/flipcall/BUILD +++ b/pkg/flipcall/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "flipcall", @@ -13,7 +12,6 @@ go_library( "io.go", "packet_window_allocator.go", ], - importpath = "gvisor.dev/gvisor/pkg/flipcall", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", @@ -30,6 +28,6 @@ go_test( "flipcall_example_test.go", "flipcall_test.go", ], - embed = [":flipcall"], + library = ":flipcall", deps = ["//pkg/sync"], ) diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index ca540363c..ee84471b2 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -1,10 +1,8 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package( - default_visibility = ["//visibility:public"], - licenses = ["notice"], -) +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) go_library( name = "fspath", @@ -13,7 +11,6 @@ go_library( "builder_unsafe.go", "fspath.go", ], - importpath = "gvisor.dev/gvisor/pkg/fspath", ) go_test( @@ -23,5 +20,5 @@ go_test( "builder_test.go", "fspath_test.go", ], - embed = [":fspath"], + library = ":fspath", ) diff --git a/pkg/gate/BUILD b/pkg/gate/BUILD index f22bd070d..dd3141143 100644 --- a/pkg/gate/BUILD +++ b/pkg/gate/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "gate.go", ], - importpath = "gvisor.dev/gvisor/pkg/gate", visibility = ["//visibility:public"], ) diff --git a/pkg/goid/BUILD b/pkg/goid/BUILD index 5d31e5366..ea8d2422c 100644 --- a/pkg/goid/BUILD +++ b/pkg/goid/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "goid_race.go", "goid_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/goid", visibility = ["//visibility:public"], ) @@ -22,5 +20,5 @@ go_test( "empty_test.go", "goid_test.go", ], - embed = [":goid"], + library = ":goid", ) diff --git a/pkg/ilist/BUILD b/pkg/ilist/BUILD index 34d2673ef..3f6eb07df 100644 --- a/pkg/ilist/BUILD +++ b/pkg/ilist/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( srcs = [ "interface_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/ilist", visibility = ["//visibility:public"], ) @@ -41,7 +39,7 @@ go_test( "list_test.go", "test_list.go", ], - embed = [":ilist"], + library = ":ilist", ) go_template( diff --git a/pkg/linewriter/BUILD b/pkg/linewriter/BUILD index bcde6d308..41bf104d0 100644 --- a/pkg/linewriter/BUILD +++ b/pkg/linewriter/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "linewriter", srcs = ["linewriter.go"], - importpath = "gvisor.dev/gvisor/pkg/linewriter", visibility = ["//visibility:public"], deps = ["//pkg/sync"], ) @@ -14,5 +12,5 @@ go_library( go_test( name = "linewriter_test", srcs = ["linewriter_test.go"], - embed = [":linewriter"], + library = ":linewriter", ) diff --git a/pkg/log/BUILD b/pkg/log/BUILD index 0df0f2849..935d06963 100644 --- a/pkg/log/BUILD +++ b/pkg/log/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "json_k8s.go", "log.go", ], - importpath = "gvisor.dev/gvisor/pkg/log", visibility = [ "//visibility:public", ], @@ -29,5 +27,5 @@ go_test( "json_test.go", "log_test.go", ], - embed = [":log"], + library = ":log", ) diff --git a/pkg/memutil/BUILD b/pkg/memutil/BUILD index 7b50e2b28..9d07d98b4 100644 --- a/pkg/memutil/BUILD +++ b/pkg/memutil/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "memutil", srcs = ["memutil_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/memutil", visibility = ["//visibility:public"], deps = ["@org_golang_x_sys//unix:go_default_library"], ) diff --git a/pkg/metric/BUILD b/pkg/metric/BUILD index 9145f3233..58305009d 100644 --- a/pkg/metric/BUILD +++ b/pkg/metric/BUILD @@ -1,14 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") package(licenses = ["notice"]) go_library( name = "metric", srcs = ["metric.go"], - importpath = "gvisor.dev/gvisor/pkg/metric", visibility = ["//:sandbox"], deps = [ ":metric_go_proto", @@ -19,28 +15,15 @@ go_library( ) proto_library( - name = "metric_proto", + name = "metric", srcs = ["metric.proto"], visibility = ["//:sandbox"], ) -cc_proto_library( - name = "metric_cc_proto", - visibility = ["//:sandbox"], - deps = [":metric_proto"], -) - -go_proto_library( - name = "metric_go_proto", - importpath = "gvisor.dev/gvisor/pkg/metric/metric_go_proto", - proto = ":metric_proto", - visibility = ["//:sandbox"], -) - go_test( name = "metric_test", srcs = ["metric_test.go"], - embed = [":metric"], + library = ":metric", deps = [ ":metric_go_proto", "//pkg/eventchannel", diff --git a/pkg/p9/BUILD b/pkg/p9/BUILD index a3e05c96d..4ccc1de86 100644 --- a/pkg/p9/BUILD +++ b/pkg/p9/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package( default_visibility = ["//visibility:public"], @@ -23,7 +22,6 @@ go_library( "transport_flipcall.go", "version.go", ], - importpath = "gvisor.dev/gvisor/pkg/p9", deps = [ "//pkg/fd", "//pkg/fdchannel", @@ -47,7 +45,7 @@ go_test( "transport_test.go", "version_test.go", ], - embed = [":p9"], + library = ":p9", deps = [ "//pkg/fd", "//pkg/unet", diff --git a/pkg/p9/p9test/BUILD b/pkg/p9/p9test/BUILD index f4edd68b2..7ca67cb19 100644 --- a/pkg/p9/p9test/BUILD +++ b/pkg/p9/p9test/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_library", "go_test") package(licenses = ["notice"]) @@ -64,7 +63,6 @@ go_library( "mocks.go", "p9test.go", ], - importpath = "gvisor.dev/gvisor/pkg/p9/p9test", visibility = ["//:sandbox"], deps = [ "//pkg/fd", @@ -80,7 +78,7 @@ go_test( name = "client_test", size = "medium", srcs = ["client_test.go"], - embed = [":p9test"], + library = ":p9test", deps = [ "//pkg/fd", "//pkg/p9", diff --git a/pkg/procid/BUILD b/pkg/procid/BUILD index b506813f0..aa3e3ac0b 100644 --- a/pkg/procid/BUILD +++ b/pkg/procid/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "procid_amd64.s", "procid_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/procid", visibility = ["//visibility:public"], ) @@ -20,7 +18,7 @@ go_test( srcs = [ "procid_test.go", ], - embed = [":procid"], + library = ":procid", deps = ["//pkg/sync"], ) @@ -31,6 +29,6 @@ go_test( "procid_net_test.go", "procid_test.go", ], - embed = [":procid"], + library = ":procid", deps = ["//pkg/sync"], ) diff --git a/pkg/rand/BUILD b/pkg/rand/BUILD index 9d5b4859b..80b8ceb02 100644 --- a/pkg/rand/BUILD +++ b/pkg/rand/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "rand.go", "rand_linux.go", ], - importpath = "gvisor.dev/gvisor/pkg/rand", visibility = ["//:sandbox"], deps = [ "//pkg/sync", diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 974d9af9b..74affc887 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "refcounter_state.go", "weak_ref_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/refs", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -35,6 +33,6 @@ go_test( name = "refs_test", size = "small", srcs = ["refcounter_test.go"], - embed = [":refs"], + library = ":refs", deps = ["//pkg/sync"], ) diff --git a/pkg/seccomp/BUILD b/pkg/seccomp/BUILD index af94e944d..742c8b79b 100644 --- a/pkg/seccomp/BUILD +++ b/pkg/seccomp/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_embed_data", "go_test") +load("//tools:defs.bzl", "go_binary", "go_embed_data", "go_library", "go_test") package(licenses = ["notice"]) @@ -27,7 +26,6 @@ go_library( "seccomp_rules.go", "seccomp_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/seccomp", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", @@ -43,7 +41,7 @@ go_test( "seccomp_test.go", ":victim_data", ], - embed = [":seccomp"], + library = ":seccomp", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/secio/BUILD b/pkg/secio/BUILD index 22abdc69f..60f63c7a6 100644 --- a/pkg/secio/BUILD +++ b/pkg/secio/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "full_reader.go", "secio.go", ], - importpath = "gvisor.dev/gvisor/pkg/secio", visibility = ["//pkg/sentry:internal"], ) @@ -17,5 +15,5 @@ go_test( name = "secio_test", size = "small", srcs = ["secio_test.go"], - embed = [":secio"], + library = ":secio", ) diff --git a/pkg/segment/test/BUILD b/pkg/segment/test/BUILD index a27c35e21..f2d8462d8 100644 --- a/pkg/segment/test/BUILD +++ b/pkg/segment/test/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package( @@ -38,7 +37,6 @@ go_library( "int_set.go", "set_functions.go", ], - importpath = "gvisor.dev/gvisor/pkg/segment/segment", deps = [ "//pkg/state", ], @@ -48,5 +46,5 @@ go_test( name = "segment_test", size = "small", srcs = ["segment_test.go"], - embed = [":segment"], + library = ":segment", ) diff --git a/pkg/sentry/BUILD b/pkg/sentry/BUILD index 2d6379c86..e8b794179 100644 --- a/pkg/sentry/BUILD +++ b/pkg/sentry/BUILD @@ -6,6 +6,8 @@ package(licenses = ["notice"]) package_group( name = "internal", packages = [ + "//cloud/gvisor/gopkg/sentry/...", + "//cloud/gvisor/sentry/...", "//pkg/sentry/...", "//runsc/...", # Code generated by go_marshal relies on go_marshal libraries. diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 65f22af2b..51ca09b24 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -1,6 +1,4 @@ -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) @@ -27,7 +25,6 @@ go_library( "syscalls_amd64.go", "syscalls_arm64.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/arch", visibility = ["//:sandbox"], deps = [ ":registers_go_proto", @@ -44,20 +41,7 @@ go_library( ) proto_library( - name = "registers_proto", + name = "registers", srcs = ["registers.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "registers_cc_proto", - visibility = ["//visibility:public"], - deps = [":registers_proto"], -) - -go_proto_library( - name = "registers_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto", - proto = ":registers_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD index 8dc1a77b1..e13a9ce20 100644 --- a/pkg/sentry/context/BUILD +++ b/pkg/sentry/context/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "context", srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/context", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/amutex", diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD index 581e7aa96..f91a6d4ed 100644 --- a/pkg/sentry/context/contexttest/BUILD +++ b/pkg/sentry/context/contexttest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "contexttest", testonly = 1, srcs = ["contexttest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/context/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/memutil", diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index 2561a6109..e69496477 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,9 +11,8 @@ go_library( "proc.go", "state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/control", visibility = [ - "//pkg/sentry:internal", + "//:sandbox", ], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "control_test", size = "small", srcs = ["proc_test.go"], - embed = [":control"], + library = ":control", deps = [ "//pkg/log", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/device/BUILD b/pkg/sentry/device/BUILD index 97fa1512c..e403cbd8b 100644 --- a/pkg/sentry/device/BUILD +++ b/pkg/sentry/device/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "device", srcs = ["device.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/device", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -18,5 +16,5 @@ go_test( name = "device_test", size = "small", srcs = ["device_test.go"], - embed = [":device"], + library = ":device", ) diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 7d5d72d5a..605d61dbe 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -44,7 +43,6 @@ go_library( "splice.go", "sync.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -129,7 +127,7 @@ go_test( "mount_test.go", "path_test.go", ], - embed = [":fs"], + library = ":fs", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index ae1c9cf76..c14e5405e 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "anon.go", "device.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/anon", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index a0d9e8496..0c7247bd7 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "random.go", "tty.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/dev", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index cc43de69d..25ef96299 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "pipe_opener.go", "pipe_state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe", imports = ["gvisor.dev/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ @@ -36,7 +34,7 @@ go_test( "pipe_opener_test.go", "pipe_test.go", ], - embed = [":fdpipe"], + library = ":fdpipe", deps = [ "//pkg/fd", "//pkg/fdnotifier", diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index 358dc2be3..9a7608cae 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "filetest", testonly = 1, srcs = ["filetest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/filetest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 945b6270d..9142f5bdf 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -75,7 +74,6 @@ go_library( "inode.go", "inode_cached.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/fsutil", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -106,7 +104,7 @@ go_test( "dirty_set_test.go", "inode_cached_test.go", ], - embed = [":fsutil"], + library = ":fsutil", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index fd870e8e1..cf48e7c03 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "socket.go", "util.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/gofer", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -56,7 +54,7 @@ go_test( name = "gofer_test", size = "small", srcs = ["gofer_test.go"], - embed = [":gofer"], + library = ":gofer", deps = [ "//pkg/p9", "//pkg/p9/p9test", diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 2b581aa69..f586f47c1 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -25,7 +24,6 @@ go_library( "util_arm64_unsafe.go", "util_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/host", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -69,7 +67,7 @@ go_test( "socket_test.go", "wait_test.go", ], - embed = [":host"], + library = ":host", deps = [ "//pkg/fd", "//pkg/fdnotifier", diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD index 2c332a82a..ae3331737 100644 --- a/pkg/sentry/fs/lock/BUILD +++ b/pkg/sentry/fs/lock/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -40,7 +39,6 @@ go_library( "lock_set.go", "lock_set_functions.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/lock", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -56,5 +54,5 @@ go_test( "lock_range_test.go", "lock_test.go", ], - embed = [":lock"], + library = ":lock", ) diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index cb37c6c6b..b06bead41 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -27,7 +26,6 @@ go_library( "uptime.go", "version.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -63,7 +61,7 @@ go_test( "net_test.go", "sys_net_test.go", ], - embed = [":proc"], + library = ":proc", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD index 0394451d4..52c9aa93d 100644 --- a/pkg/sentry/fs/proc/device/BUILD +++ b/pkg/sentry/fs/proc/device/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "device", srcs = ["device.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/device", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sentry/device"], ) diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 38b246dff..310d8dd52 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "seqfile", srcs = ["seqfile.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -26,7 +24,7 @@ go_test( name = "seqfile_test", size = "small", srcs = ["seqfile_test.go"], - embed = [":seqfile"], + library = ":seqfile", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 3fb7b0633..39c4b84f8 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "symlink.go", "tree.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ramfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -31,7 +29,7 @@ go_test( name = "ramfs_test", size = "small", srcs = ["tree_test.go"], - embed = [":ramfs"], + library = ":ramfs", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index 25f0f124e..cc6b3bfbf 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "fs.go", "sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/sys", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index a215c1b95..092668e8d 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "timerfd", srcs = ["timerfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/timerfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 3400b940c..04776555f 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "inode_file.go", "tmpfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -41,7 +39,7 @@ go_test( name = "tmpfs_test", size = "small", srcs = ["file_test.go"], - embed = [":tmpfs"], + library = ":tmpfs", deps = [ "//pkg/sentry/context", "//pkg/sentry/fs", diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index f6f60d0cf..29f804c6c 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -14,7 +13,6 @@ go_library( "slave.go", "terminal.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fs/tty", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "tty_test", size = "small", srcs = ["tty_test.go"], - embed = [":tty"], + library = ":tty", deps = [ "//pkg/abi/linux", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 903874141..a718920d5 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -32,7 +31,6 @@ go_library( "symlink.go", "utils.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -71,7 +69,7 @@ go_test( "//pkg/sentry/fsimpl/ext:assets/tiny.ext3", "//pkg/sentry/fsimpl/ext:assets/tiny.ext4", ], - embed = [":ext"], + library = ":ext", deps = [ "//pkg/abi/linux", "//pkg/binary", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index 4fc8296ef..12f3990c1 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD index fcfaf5c3e..9bd9c76c0 100644 --- a/pkg/sentry/fsimpl/ext/disklayout/BUILD +++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "superblock_old.go", "test_utils.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -44,6 +42,6 @@ go_test( "inode_test.go", "superblock_test.go", ], - embed = [":disklayout"], + library = ":disklayout", deps = ["//pkg/sentry/kernel/time"], ) diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 66d409785..7bf83ccba 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -1,8 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -package(licenses = ["notice"]) +licenses(["notice"]) go_template_instance( name = "slot_list", @@ -27,7 +26,6 @@ go_library( "slot_list.go", "symlink.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index c5b79fb38..3768f55b2 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "proc", @@ -15,7 +14,6 @@ go_library( "tasks_net.go", "tasks_sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc", deps = [ "//pkg/abi/linux", "//pkg/log", @@ -47,7 +45,7 @@ go_test( "tasks_sys_test.go", "tasks_test.go", ], - embed = [":proc"], + library = ":proc", deps = [ "//pkg/abi/linux", "//pkg/fspath", diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index ee3c842bd..beda141f1 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -1,14 +1,12 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "sys", srcs = [ "sys.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index 4e70d84a7..12053a5b6 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -1,6 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "testutil", @@ -9,7 +9,6 @@ go_library( "kernel.go", "testutil.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 691476b4f..857e98bc5 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -1,8 +1,7 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -package(licenses = ["notice"]) +licenses(["notice"]) go_template_instance( name = "dentry_list", @@ -28,7 +27,6 @@ go_library( "symlink.go", "tmpfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs", deps = [ "//pkg/abi/linux", "//pkg/amutex", @@ -81,7 +79,7 @@ go_test( "regular_file_test.go", "stat_test.go", ], - embed = [":tmpfs"], + library = ":tmpfs", deps = [ "//pkg/abi/linux", "//pkg/fspath", diff --git a/pkg/sentry/hostcpu/BUILD b/pkg/sentry/hostcpu/BUILD index 359468ccc..e6933aa70 100644 --- a/pkg/sentry/hostcpu/BUILD +++ b/pkg/sentry/hostcpu/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "getcpu_arm64.s", "hostcpu.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/hostcpu", visibility = ["//:sandbox"], ) @@ -18,5 +16,5 @@ go_test( name = "hostcpu_test", size = "small", srcs = ["hostcpu_test.go"], - embed = [":hostcpu"], + library = ":hostcpu", ) diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index 67831d5a1..a145a5ca3 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "cgroup.go", "hostmm.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/hostmm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/fd", diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 8d60ad4ad..aa621b724 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package( default_visibility = ["//:sandbox"], @@ -12,7 +12,6 @@ go_library( "inet.go", "test_stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/inet", deps = [ "//pkg/sentry/context", "//pkg/tcpip/stack", diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index ac85ba0c8..cebaccd92 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -1,8 +1,5 @@ -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -78,26 +75,12 @@ go_template_instance( ) proto_library( - name = "uncaught_signal_proto", + name = "uncaught_signal", srcs = ["uncaught_signal.proto"], visibility = ["//visibility:public"], deps = ["//pkg/sentry/arch:registers_proto"], ) -cc_proto_library( - name = "uncaught_signal_cc_proto", - visibility = ["//visibility:public"], - deps = [":uncaught_signal_proto"], -) - -go_proto_library( - name = "uncaught_signal_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto", - proto = ":uncaught_signal_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) - go_library( name = "kernel", srcs = [ @@ -156,7 +139,6 @@ go_library( "vdso.go", "version.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel", imports = [ "gvisor.dev/gvisor/pkg/bpf", "gvisor.dev/gvisor/pkg/sentry/device", @@ -227,7 +209,7 @@ go_test( "task_test.go", "timekeeper_test.go", ], - embed = [":kernel"], + library = ":kernel", deps = [ "//pkg/abi", "//pkg/sentry/arch", diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 1aa72fa47..64537c9be 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -57,7 +57,6 @@ go_library( "id_map_set.go", "user_namespace.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/auth", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index 3a88a585c..daff608d7 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "contexttest", testonly = 1, srcs = ["contexttest.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index c47f6b6fc..19e16ab3a 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +22,6 @@ go_library( "epoll_list.go", "epoll_state.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/epoll", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/refs", @@ -43,7 +41,7 @@ go_test( srcs = [ "epoll_test.go", ], - embed = [":epoll"], + library = ":epoll", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/fs/filetest", diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index c831fbab2..ee2d74864 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "eventfd", srcs = ["eventfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -26,7 +24,7 @@ go_test( name = "eventfd_test", size = "small", srcs = ["eventfd_test.go"], - embed = [":eventfd"], + library = ":eventfd", deps = [ "//pkg/sentry/context/contexttest", "//pkg/sentry/usermem", diff --git a/pkg/sentry/kernel/fasync/BUILD b/pkg/sentry/kernel/fasync/BUILD index 6b36bc63e..b9126e946 100644 --- a/pkg/sentry/kernel/fasync/BUILD +++ b/pkg/sentry/kernel/fasync/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "fasync", srcs = ["fasync.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/fasync", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index 50db443ce..f413d8ae2 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -34,7 +33,6 @@ go_library( "futex.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/futex", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -51,7 +49,7 @@ go_test( name = "futex_test", size = "small", srcs = ["futex_test.go"], - embed = [":futex"], + library = ":futex", deps = [ "//pkg/sentry/usermem", "//pkg/sync", diff --git a/pkg/sentry/kernel/memevent/BUILD b/pkg/sentry/kernel/memevent/BUILD index 7f36252a9..4486848d2 100644 --- a/pkg/sentry/kernel/memevent/BUILD +++ b/pkg/sentry/kernel/memevent/BUILD @@ -1,13 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) go_library( name = "memevent", srcs = ["memory_events.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent", visibility = ["//:sandbox"], deps = [ ":memory_events_go_proto", @@ -21,20 +18,7 @@ go_library( ) proto_library( - name = "memory_events_proto", + name = "memory_events", srcs = ["memory_events.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "memory_events_cc_proto", - visibility = ["//visibility:public"], - deps = [":memory_events_proto"], -) - -go_proto_library( - name = "memory_events_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/memevent/memory_events_go_proto", - proto = ":memory_events_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 5eeaeff66..2c7b6206f 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -30,7 +29,6 @@ go_library( "vfs.go", "writer.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/pipe", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -57,7 +55,7 @@ go_test( "node_test.go", "pipe_test.go", ], - embed = [":pipe"], + library = ":pipe", deps = [ "//pkg/sentry/context", "//pkg/sentry/context/contexttest", diff --git a/pkg/sentry/kernel/sched/BUILD b/pkg/sentry/kernel/sched/BUILD index 98ea7a0d8..1b82e087b 100644 --- a/pkg/sentry/kernel/sched/BUILD +++ b/pkg/sentry/kernel/sched/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "cpuset.go", "sched.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/sched", visibility = ["//pkg/sentry:internal"], ) @@ -17,5 +15,5 @@ go_test( name = "sched_test", size = "small", srcs = ["cpuset_test.go"], - embed = [":sched"], + library = ":sched", ) diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 13a961594..76e19b551 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "semaphore.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -40,7 +38,7 @@ go_test( name = "semaphore_test", size = "small", srcs = ["semaphore_test.go"], - embed = [":semaphore"], + library = ":semaphore", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 7321b22ed..5547c5abf 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "device.go", "shm.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/shm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 89e4d84b1..5d44773d4 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "signalfd", srcs = ["signalfd.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index 4e4de0512..d49594d9f 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "context.go", "time.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/kernel/time", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 9fa841e8b..67869757f 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +9,6 @@ go_library( "limits.go", "linux.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/limits", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", @@ -25,5 +23,5 @@ go_test( srcs = [ "limits_test.go", ], - embed = [":limits"], + library = ":limits", ) diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 2890393bd..d4ad2bd6c 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_embed_data") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_embed_data", "go_library") package(licenses = ["notice"]) @@ -20,7 +19,6 @@ go_library( "vdso_state.go", ":vdso_bin", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/loader", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index 112794e9c..f9a65f086 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -37,7 +36,6 @@ go_library( "mapping_set_impl.go", "memmap.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/memmap", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -52,6 +50,6 @@ go_test( name = "memmap_test", size = "small", srcs = ["mapping_set_test.go"], - embed = [":memmap"], + library = ":memmap", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index 83e248431..bd6399fa2 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -96,7 +95,6 @@ go_library( "vma.go", "vma_set.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/mm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -128,7 +126,7 @@ go_test( name = "mm_test", size = "small", srcs = ["mm_test.go"], - embed = [":mm"], + library = ":mm", deps = [ "//pkg/sentry/arch", "//pkg/sentry/context", diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index a9a2642c5..02385a3ce 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -60,7 +59,6 @@ go_library( "save_restore.go", "usage_set.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/pgalloc", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/log", @@ -82,6 +80,6 @@ go_test( name = "pgalloc_test", size = "small", srcs = ["pgalloc_test.go"], - embed = [":pgalloc"], + library = ":pgalloc", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 157bffa81..006450b2d 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +22,6 @@ go_library( "mmap_min_addr.go", "platform.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/platform/interrupt/BUILD b/pkg/sentry/platform/interrupt/BUILD index 85e882df9..83b385f14 100644 --- a/pkg/sentry/platform/interrupt/BUILD +++ b/pkg/sentry/platform/interrupt/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "interrupt.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/interrupt", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sync"], ) @@ -17,5 +15,5 @@ go_test( name = "interrupt_test", size = "small", srcs = ["interrupt_test.go"], - embed = [":interrupt"], + library = ":interrupt", ) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 6a358d1d4..a4532a766 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -38,7 +37,6 @@ go_library( "physical_map_arm64.go", "virtual_map.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -65,7 +63,7 @@ go_test( "kvm_test.go", "virtual_map_test.go", ], - embed = [":kvm"], + library = ":kvm", tags = [ "manual", "nogotsan", diff --git a/pkg/sentry/platform/kvm/testutil/BUILD b/pkg/sentry/platform/kvm/testutil/BUILD index b0e45f159..f7605df8a 100644 --- a/pkg/sentry/platform/kvm/testutil/BUILD +++ b/pkg/sentry/platform/kvm/testutil/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,6 +12,5 @@ go_library( "testutil_arm64.go", "testutil_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil", visibility = ["//pkg/sentry/platform/kvm:__pkg__"], ) diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index cd13390c3..3bcc5e040 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -20,7 +20,6 @@ go_library( "subprocess_linux_unsafe.go", "subprocess_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 87f4552b5..6dee8fcc5 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) @@ -74,7 +74,6 @@ go_library( "lib_arm64.s", "ring0.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/cpuid", diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 42076fb04..147311ed3 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 387a7f6c3..8b5cdd6c1 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -1,17 +1,14 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test", "select_arch") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) -config_setting( - name = "aarch64", - constraint_values = ["@bazel_tools//platforms:aarch64"], -) - go_template( name = "generic_walker", - srcs = ["walker_amd64.go"], + srcs = select_arch( + amd64 = ["walker_amd64.go"], + arm64 = ["walker_amd64.go"], + ), opt_types = [ "Visitor", ], @@ -91,7 +88,6 @@ go_library( "walker_map.go", "walker_unmap.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables", visibility = [ "//pkg/sentry/platform/kvm:__subpackages__", "//pkg/sentry/platform/ring0:__subpackages__", @@ -111,6 +107,6 @@ go_test( "pagetables_test.go", "walker_check.go", ], - embed = [":pagetables"], + library = ":pagetables", deps = ["//pkg/sentry/usermem"], ) diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD index 6769cd0a5..b8747585b 100644 --- a/pkg/sentry/platform/safecopy/BUILD +++ b/pkg/sentry/platform/safecopy/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -17,7 +16,6 @@ go_library( "sighandler_amd64.s", "sighandler_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/syserror"], ) @@ -27,5 +25,5 @@ go_test( srcs = [ "safecopy_test.go", ], - embed = [":safecopy"], + library = ":safecopy", ) diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD index 884020f7b..3ab76da97 100644 --- a/pkg/sentry/safemem/BUILD +++ b/pkg/sentry/safemem/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "safemem.go", "seq_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/safemem", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/platform/safecopy", @@ -25,5 +23,5 @@ go_test( "io_test.go", "seq_test.go", ], - embed = [":safemem"], + library = ":safemem", ) diff --git a/pkg/sentry/sighandling/BUILD b/pkg/sentry/sighandling/BUILD index f561670c7..6c38a3f44 100644 --- a/pkg/sentry/sighandling/BUILD +++ b/pkg/sentry/sighandling/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "sighandling.go", "sighandling_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/sighandling", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/abi/linux"], ) diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 26176b10d..8e2b97afb 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "socket", srcs = ["socket.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 357517ed4..3850f6345 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "control", srcs = ["control.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/control", imports = [ "gvisor.dev/gvisor/pkg/sentry/fs", ], diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 4c44c7c0f..42bf7be6a 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "socket_unsafe.go", "stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/hostinet", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index b70047d81..ed34a8308 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "netfilter.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter", # This target depends on netstack and should only be used by epsocket, # which is allowed to depend on netstack. visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 103933144..baaac13c6 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "provider.go", "socket.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netlink/port/BUILD b/pkg/sentry/socket/netlink/port/BUILD index 2d9f4ba9b..3a22923d8 100644 --- a/pkg/sentry/socket/netlink/port/BUILD +++ b/pkg/sentry/socket/netlink/port/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "port", srcs = ["port.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port", visibility = ["//pkg/sentry:internal"], deps = ["//pkg/sync"], ) @@ -14,5 +12,5 @@ go_library( go_test( name = "port_test", srcs = ["port_test.go"], - embed = [":port"], + library = ":port", ) diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 1d4912753..2137c7aeb 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "route", srcs = ["protocol.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD index 0777f3baf..73fbdf1eb 100644 --- a/pkg/sentry/socket/netlink/uevent/BUILD +++ b/pkg/sentry/socket/netlink/uevent/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "uevent", srcs = ["protocol.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index f78784569..e3d1f90cb 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -11,7 +11,6 @@ go_library( "save_restore.go", "stack.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netstack", visibility = [ "//pkg/sentry:internal", ], diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 5b6a154f6..bade18686 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "io.go", "unix.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index d7ba95dff..4bdfc9208 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -25,7 +25,6 @@ go_library( "transport_message_list.go", "unix.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD index 88765f4d6..0ea4aab8b 100644 --- a/pkg/sentry/state/BUILD +++ b/pkg/sentry/state/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "state_metadata.go", "state_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/state", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index aa1ac720c..ff6fafa63 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) @@ -21,7 +19,6 @@ go_library( "strace.go", "syscalls.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/strace", visibility = ["//:sandbox"], deps = [ ":strace_go_proto", @@ -42,20 +39,7 @@ go_library( ) proto_library( - name = "strace_proto", + name = "strace", srcs = ["strace.proto"], visibility = ["//visibility:public"], ) - -cc_proto_library( - name = "strace_cc_proto", - visibility = ["//visibility:public"], - deps = [":strace_proto"], -) - -go_proto_library( - name = "strace_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto", - proto = ":strace_proto", - visibility = ["//visibility:public"], -) diff --git a/pkg/sentry/syscalls/BUILD b/pkg/sentry/syscalls/BUILD index 79d972202..b8d1bd415 100644 --- a/pkg/sentry/syscalls/BUILD +++ b/pkg/sentry/syscalls/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "epoll.go", "syscalls.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 917f74e07..7d74e0f70 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -57,7 +57,6 @@ go_library( "sys_xattr.go", "timespec.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/syscalls/linux", visibility = ["//:sandbox"], deps = [ "//pkg/abi", diff --git a/pkg/sentry/time/BUILD b/pkg/sentry/time/BUILD index 3cde3a0be..04f81a35b 100644 --- a/pkg/sentry/time/BUILD +++ b/pkg/sentry/time/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -31,7 +30,6 @@ go_library( "tsc_amd64.s", "tsc_arm64.s", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/time", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -48,5 +46,5 @@ go_test( "parameters_test.go", "sampler_test.go", ], - embed = [":time"], + library = ":time", ) diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index fc7614fff..370fa6ec5 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -1,34 +1,17 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@rules_cc//cc:defs.bzl", "cc_proto_library") +load("//tools:defs.bzl", "go_library", "proto_library") package(licenses = ["notice"]) proto_library( - name = "unimplemented_syscall_proto", + name = "unimplemented_syscall", srcs = ["unimplemented_syscall.proto"], visibility = ["//visibility:public"], deps = ["//pkg/sentry/arch:registers_proto"], ) -cc_proto_library( - name = "unimplemented_syscall_cc_proto", - visibility = ["//visibility:public"], - deps = [":unimplemented_syscall_proto"], -) - -go_proto_library( - name = "unimplemented_syscall_go_proto", - importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto", - proto = ":unimplemented_syscall_proto", - visibility = ["//visibility:public"], - deps = ["//pkg/sentry/arch:registers_go_proto"], -) - go_library( name = "unimpl", srcs = ["events.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/unimpl", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index 86a87edd4..e9c18f170 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "uniqueid", srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/uniqueid", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/sentry/context", diff --git a/pkg/sentry/usage/BUILD b/pkg/sentry/usage/BUILD index 5518ac3d0..099315613 100644 --- a/pkg/sentry/usage/BUILD +++ b/pkg/sentry/usage/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -11,9 +11,8 @@ go_library( "memory_unsafe.go", "usage.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/usage", visibility = [ - "//pkg/sentry:internal", + "//:sandbox", ], deps = [ "//pkg/bits", diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD index 684f59a6b..c8322e29e 100644 --- a/pkg/sentry/usermem/BUILD +++ b/pkg/sentry/usermem/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -29,7 +28,6 @@ go_library( "usermem_unsafe.go", "usermem_x86.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/usermem", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/atomicbitops", @@ -38,7 +36,6 @@ go_library( "//pkg/sentry/context", "//pkg/sentry/safemem", "//pkg/syserror", - "//pkg/tcpip/buffer", ], ) @@ -49,7 +46,7 @@ go_test( "addr_range_seq_test.go", "usermem_test.go", ], - embed = [":usermem"], + library = ":usermem", deps = [ "//pkg/sentry/context", "//pkg/sentry/safemem", diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 35c7be259..51acdc4e9 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -1,7 +1,6 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "vfs", @@ -24,7 +23,6 @@ go_library( "testutil.go", "vfs.go", ], - importpath = "gvisor.dev/gvisor/pkg/sentry/vfs", visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", @@ -47,7 +45,7 @@ go_test( "file_description_impl_util_test.go", "mount_test.go", ], - embed = [":vfs"], + library = ":vfs", deps = [ "//pkg/abi/linux", "//pkg/sentry/context", diff --git a/pkg/sentry/watchdog/BUILD b/pkg/sentry/watchdog/BUILD index 28f21f13d..1c5a1c9b6 100644 --- a/pkg/sentry/watchdog/BUILD +++ b/pkg/sentry/watchdog/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "watchdog", srcs = ["watchdog.go"], - importpath = "gvisor.dev/gvisor/pkg/sentry/watchdog", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", diff --git a/pkg/sleep/BUILD b/pkg/sleep/BUILD index a23c86fb1..e131455f7 100644 --- a/pkg/sleep/BUILD +++ b/pkg/sleep/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "commit_noasm.go", "sleep_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sleep", visibility = ["//:sandbox"], ) @@ -22,5 +20,5 @@ go_test( srcs = [ "sleep_test.go", ], - embed = [":sleep"], + library = ":sleep", ) diff --git a/pkg/state/BUILD b/pkg/state/BUILD index be93750bf..921af9d63 100644 --- a/pkg/state/BUILD +++ b/pkg/state/BUILD @@ -1,6 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//proto:def.bzl", "go_proto_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test", "proto_library") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -49,7 +47,7 @@ go_library( "state.go", "stats.go", ], - importpath = "gvisor.dev/gvisor/pkg/state", + stateify = False, visibility = ["//:sandbox"], deps = [ ":object_go_proto", @@ -58,21 +56,14 @@ go_library( ) proto_library( - name = "object_proto", + name = "object", srcs = ["object.proto"], visibility = ["//:sandbox"], ) -go_proto_library( - name = "object_go_proto", - importpath = "gvisor.dev/gvisor/pkg/state/object_go_proto", - proto = ":object_proto", - visibility = ["//:sandbox"], -) - go_test( name = "state_test", timeout = "long", srcs = ["state_test.go"], - embed = [":state"], + library = ":state", ) diff --git a/pkg/state/statefile/BUILD b/pkg/state/statefile/BUILD index 8a865d229..e7581c09b 100644 --- a/pkg/state/statefile/BUILD +++ b/pkg/state/statefile/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "statefile", srcs = ["statefile.go"], - importpath = "gvisor.dev/gvisor/pkg/state/statefile", visibility = ["//:sandbox"], deps = [ "//pkg/binary", @@ -18,6 +16,6 @@ go_test( name = "statefile_test", size = "small", srcs = ["statefile_test.go"], - embed = [":statefile"], + library = ":statefile", deps = ["//pkg/compressio"], ) diff --git a/pkg/sync/BUILD b/pkg/sync/BUILD index 97c4b3b1e..5340cf0d6 100644 --- a/pkg/sync/BUILD +++ b/pkg/sync/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template") package( @@ -40,7 +39,6 @@ go_library( "syncutil.go", "tmutex_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/sync", ) go_test( @@ -51,5 +49,5 @@ go_test( "seqcount_test.go", "tmutex_test.go", ], - embed = [":sync"], + library = ":sync", ) diff --git a/pkg/sync/atomicptrtest/BUILD b/pkg/sync/atomicptrtest/BUILD index 418eda29c..e97553254 100644 --- a/pkg/sync/atomicptrtest/BUILD +++ b/pkg/sync/atomicptrtest/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -18,12 +17,11 @@ go_template_instance( go_library( name = "atomicptr", srcs = ["atomicptr_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/sync/atomicptr", ) go_test( name = "atomicptr_test", size = "small", srcs = ["atomicptr_test.go"], - embed = [":atomicptr"], + library = ":atomicptr", ) diff --git a/pkg/sync/seqatomictest/BUILD b/pkg/sync/seqatomictest/BUILD index eba21518d..5c38c783e 100644 --- a/pkg/sync/seqatomictest/BUILD +++ b/pkg/sync/seqatomictest/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) @@ -18,7 +17,6 @@ go_template_instance( go_library( name = "seqatomic", srcs = ["seqatomic_int_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/sync/seqatomic", deps = [ "//pkg/sync", ], @@ -28,6 +26,6 @@ go_test( name = "seqatomic_test", size = "small", srcs = ["seqatomic_test.go"], - embed = [":seqatomic"], + library = ":seqatomic", deps = ["//pkg/sync"], ) diff --git a/pkg/syserr/BUILD b/pkg/syserr/BUILD index 5665ad4ee..7d760344a 100644 --- a/pkg/syserr/BUILD +++ b/pkg/syserr/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "netstack.go", "syserr.go", ], - importpath = "gvisor.dev/gvisor/pkg/syserr", visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", diff --git a/pkg/syserror/BUILD b/pkg/syserror/BUILD index bd3f9fd28..b13c15d9b 100644 --- a/pkg/syserror/BUILD +++ b/pkg/syserror/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "syserror", srcs = ["syserror.go"], - importpath = "gvisor.dev/gvisor/pkg/syserror", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD index 23e4b09e7..26f7ba86b 100644 --- a/pkg/tcpip/BUILD +++ b/pkg/tcpip/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +11,6 @@ go_library( "time_unsafe.go", "timer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -25,7 +23,7 @@ go_test( name = "tcpip_test", size = "small", srcs = ["tcpip_test.go"], - embed = [":tcpip"], + library = ":tcpip", ) go_test( diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index 3df7d18d3..a984f1712 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "gonet", srcs = ["gonet.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/adapters/gonet", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -23,7 +21,7 @@ go_test( name = "gonet_test", size = "small", srcs = ["gonet_test.go"], - embed = [":gonet"], + library = ":gonet", deps = [ "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD index d6c31bfa2..563bc78ea 100644 --- a/pkg/tcpip/buffer/BUILD +++ b/pkg/tcpip/buffer/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "prependable.go", "view.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/buffer", visibility = ["//visibility:public"], ) @@ -17,5 +15,5 @@ go_test( name = "buffer_test", size = "small", srcs = ["view_test.go"], - embed = [":buffer"], + library = ":buffer", ) diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index b6fa6fc37..ed434807f 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "checker", testonly = 1, srcs = ["checker.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/checker", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD index e648efa71..ff2719291 100644 --- a/pkg/tcpip/hash/jenkins/BUILD +++ b/pkg/tcpip/hash/jenkins/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "jenkins", srcs = ["jenkins.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins", visibility = ["//visibility:public"], ) @@ -16,5 +14,5 @@ go_test( srcs = [ "jenkins_test.go", ], - embed = [":jenkins"], + library = ":jenkins", ) diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index cd747d100..9da0d71f8 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -1,5 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -24,7 +23,6 @@ go_library( "tcp.go", "udp.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/header", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -59,7 +57,7 @@ go_test( "eth_test.go", "ndp_test.go", ], - embed = [":header"], + library = ":header", deps = [ "//pkg/tcpip", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/tcpip/iptables/BUILD b/pkg/tcpip/iptables/BUILD index 297eaccaf..d1b73cfdf 100644 --- a/pkg/tcpip/iptables/BUILD +++ b/pkg/tcpip/iptables/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "targets.go", "types.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/iptables", visibility = ["//visibility:public"], deps = [ "//pkg/log", diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index 7dbc05754..3974c464e 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "channel", srcs = ["channel.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/channel", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index 66cc53ed4..abe725548 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -13,7 +12,6 @@ go_library( "mmap_unsafe.go", "packet_dispatchers.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/fdbased", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -30,7 +28,7 @@ go_test( name = "fdbased_test", size = "small", srcs = ["endpoint_test.go"], - embed = [":fdbased"], + library = ":fdbased", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD index f35fcdff4..6bf3805b7 100644 --- a/pkg/tcpip/link/loopback/BUILD +++ b/pkg/tcpip/link/loopback/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "loopback", srcs = ["loopback.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/loopback", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index 1ac7948b6..82b441b79 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "muxed", srcs = ["injectable.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/muxed", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -19,7 +17,7 @@ go_test( name = "muxed_test", size = "small", srcs = ["injectable_test.go"], - embed = [":muxed"], + library = ":muxed", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD index d8211e93d..14b527bc2 100644 --- a/pkg/tcpip/link/rawfile/BUILD +++ b/pkg/tcpip/link/rawfile/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "errors.go", "rawfile_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/rawfile", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD index 09165dd4c..13243ebbb 100644 --- a/pkg/tcpip/link/sharedmem/BUILD +++ b/pkg/tcpip/link/sharedmem/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "sharedmem_unsafe.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -30,7 +28,7 @@ go_test( srcs = [ "sharedmem_test.go", ], - embed = [":sharedmem"], + library = ":sharedmem", deps = [ "//pkg/sync", "//pkg/tcpip", diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD index a0d4ad0be..87020ec08 100644 --- a/pkg/tcpip/link/sharedmem/pipe/BUILD +++ b/pkg/tcpip/link/sharedmem/pipe/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -11,7 +10,6 @@ go_library( "rx.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe", visibility = ["//visibility:public"], ) @@ -20,6 +18,6 @@ go_test( srcs = [ "pipe_test.go", ], - embed = [":pipe"], + library = ":pipe", deps = ["//pkg/sync"], ) diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD index 8c9234d54..3ba06af73 100644 --- a/pkg/tcpip/link/sharedmem/queue/BUILD +++ b/pkg/tcpip/link/sharedmem/queue/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "rx.go", "tx.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -22,7 +20,7 @@ go_test( srcs = [ "queue_test.go", ], - embed = [":queue"], + library = ":queue", deps = [ "//pkg/tcpip/link/sharedmem/pipe", ], diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index d6ae0368a..230a8d53a 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,7 +8,6 @@ go_library( "pcap.go", "sniffer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/sniffer", visibility = ["//visibility:public"], deps = [ "//pkg/log", diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index a71a493fc..e5096ea38 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -1,10 +1,9 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "tun", srcs = ["tun_unsafe.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/tun", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 134837943..0956d2c65 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -8,7 +7,6 @@ go_library( srcs = [ "waitable.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/link/waitable", visibility = ["//visibility:public"], deps = [ "//pkg/gate", @@ -23,7 +21,7 @@ go_test( srcs = [ "waitable_test.go", ], - embed = [":waitable"], + library = ":waitable", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD index 9d16ff8c9..6a4839fb8 100644 --- a/pkg/tcpip/network/BUILD +++ b/pkg/tcpip/network/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") package(licenses = ["notice"]) diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD index e7617229b..eddf7b725 100644 --- a/pkg/tcpip/network/arp/BUILD +++ b/pkg/tcpip/network/arp/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "arp", srcs = ["arp.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/arp", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD index ed16076fd..d1c728ccf 100644 --- a/pkg/tcpip/network/fragmentation/BUILD +++ b/pkg/tcpip/network/fragmentation/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -24,7 +23,6 @@ go_library( "reassembler.go", "reassembler_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation", visibility = ["//visibility:public"], deps = [ "//pkg/log", @@ -42,6 +40,6 @@ go_test( "fragmentation_test.go", "reassembler_test.go", ], - embed = [":fragmentation"], + library = ":fragmentation", deps = ["//pkg/tcpip/buffer"], ) diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD index e6db5c0b0..872165866 100644 --- a/pkg/tcpip/network/hash/BUILD +++ b/pkg/tcpip/network/hash/BUILD @@ -1,11 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "hash", srcs = ["hash.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/hash", visibility = ["//visibility:public"], deps = [ "//pkg/rand", diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 4e2aae9a3..0fef2b1f1 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "icmp.go", "ipv4.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv4", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index e4e273460..fb11874c6 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "icmp.go", "ipv6.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/network/ipv6", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip", @@ -27,7 +25,7 @@ go_test( "ipv6_test.go", "ndp_test.go", ], - embed = [":ipv6"], + library = ":ipv6", deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD index a6ef3bdcc..2bad05a2e 100644 --- a/pkg/tcpip/ports/BUILD +++ b/pkg/tcpip/ports/BUILD @@ -1,12 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "ports", srcs = ["ports.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/ports", visibility = ["//visibility:public"], deps = [ "//pkg/sync", @@ -17,7 +15,7 @@ go_library( go_test( name = "ports_test", srcs = ["ports_test.go"], - embed = [":ports"], + library = ":ports", deps = [ "//pkg/tcpip", ], diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD index d7496fde6..cf0a5fefe 100644 --- a/pkg/tcpip/sample/tun_tcp_connect/BUILD +++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD index 875561566..43264b76d 100644 --- a/pkg/tcpip/sample/tun_tcp_echo/BUILD +++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD index b31ddba2f..45f503845 100644 --- a/pkg/tcpip/seqnum/BUILD +++ b/pkg/tcpip/seqnum/BUILD @@ -1,10 +1,9 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "seqnum", srcs = ["seqnum.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/seqnum", visibility = ["//visibility:public"], ) diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index 783351a69..f5b750046 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -30,7 +29,6 @@ go_library( "stack_global_state.go", "transport_demuxer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/stack", visibility = ["//visibility:public"], deps = [ "//pkg/ilist", @@ -81,7 +79,7 @@ go_test( name = "stack_test", size = "small", srcs = ["linkaddrcache_test.go"], - embed = [":stack"], + library = ":stack", deps = [ "//pkg/sleep", "//pkg/sync", diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index 3aa23d529..ac18ec5b1 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "icmp_packet_list.go", "protocol.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/icmp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD index 4858d150c..d22de6b26 100644 --- a/pkg/tcpip/transport/packet/BUILD +++ b/pkg/tcpip/transport/packet/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +22,6 @@ go_library( "endpoint_state.go", "packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/packet", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD index 2f2131ff7..c9baf4600 100644 --- a/pkg/tcpip/transport/raw/BUILD +++ b/pkg/tcpip/transport/raw/BUILD @@ -1,5 +1,5 @@ +load("//tools:defs.bzl", "go_library") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "protocol.go", "raw_packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/raw", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 0e3ab05ad..4acd9fb9a 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -55,7 +54,6 @@ go_library( "tcp_segment_list.go", "timer.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD index b33ec2087..ce6a2c31d 100644 --- a/pkg/tcpip/transport/tcp/testing/context/BUILD +++ b/pkg/tcpip/transport/tcp/testing/context/BUILD @@ -1,4 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "context", testonly = 1, srcs = ["context.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context", visibility = [ "//visibility:public", ], diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD index 43fcc27f0..3ad6994a7 100644 --- a/pkg/tcpip/transport/tcpconntrack/BUILD +++ b/pkg/tcpip/transport/tcpconntrack/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "tcpconntrack", srcs = ["tcp_conntrack.go"], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack", visibility = ["//visibility:public"], deps = [ "//pkg/tcpip/header", diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD index 57ff123e3..adc908e24 100644 --- a/pkg/tcpip/transport/udp/BUILD +++ b/pkg/tcpip/transport/udp/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -25,7 +24,6 @@ go_library( "protocol.go", "udp_packet_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/tcpip/transport/udp", imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"], visibility = ["//visibility:public"], deps = [ diff --git a/pkg/tmutex/BUILD b/pkg/tmutex/BUILD index 07778e4f7..2dcba84ae 100644 --- a/pkg/tmutex/BUILD +++ b/pkg/tmutex/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "tmutex", srcs = ["tmutex.go"], - importpath = "gvisor.dev/gvisor/pkg/tmutex", visibility = ["//:sandbox"], ) @@ -14,6 +12,6 @@ go_test( name = "tmutex_test", size = "medium", srcs = ["tmutex_test.go"], - embed = [":tmutex"], + library = ":tmutex", deps = ["//pkg/sync"], ) diff --git a/pkg/unet/BUILD b/pkg/unet/BUILD index d1885ae66..a86501fa2 100644 --- a/pkg/unet/BUILD +++ b/pkg/unet/BUILD @@ -1,5 +1,4 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -9,7 +8,6 @@ go_library( "unet.go", "unet_unsafe.go", ], - importpath = "gvisor.dev/gvisor/pkg/unet", visibility = ["//visibility:public"], deps = [ "//pkg/gate", @@ -23,6 +21,6 @@ go_test( srcs = [ "unet_test.go", ], - embed = [":unet"], + library = ":unet", deps = ["//pkg/sync"], ) diff --git a/pkg/urpc/BUILD b/pkg/urpc/BUILD index b8fdc3125..850c34ed0 100644 --- a/pkg/urpc/BUILD +++ b/pkg/urpc/BUILD @@ -1,12 +1,10 @@ -load("//tools/go_stateify:defs.bzl", "go_library") -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "urpc", srcs = ["urpc.go"], - importpath = "gvisor.dev/gvisor/pkg/urpc", visibility = ["//:sandbox"], deps = [ "//pkg/fd", @@ -20,6 +18,6 @@ go_test( name = "urpc_test", size = "small", srcs = ["urpc_test.go"], - embed = [":urpc"], + library = ":urpc", deps = ["//pkg/unet"], ) diff --git a/pkg/waiter/BUILD b/pkg/waiter/BUILD index 1c6890e52..852480a09 100644 --- a/pkg/waiter/BUILD +++ b/pkg/waiter/BUILD @@ -1,6 +1,5 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") -load("//tools/go_stateify:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -22,7 +21,6 @@ go_library( "waiter.go", "waiter_list.go", ], - importpath = "gvisor.dev/gvisor/pkg/waiter", visibility = ["//visibility:public"], deps = ["//pkg/sync"], ) @@ -33,5 +31,5 @@ go_test( srcs = [ "waiter_test.go", ], - embed = [":waiter"], + library = ":waiter", ) diff --git a/runsc/BUILD b/runsc/BUILD index e5587421d..b35b41d81 100644 --- a/runsc/BUILD +++ b/runsc/BUILD @@ -1,7 +1,6 @@ -package(licenses = ["notice"]) # Apache 2.0 +load("//tools:defs.bzl", "go_binary", "pkg_deb", "pkg_tar") -load("@io_bazel_rules_go//go:def.bzl", "go_binary") -load("@rules_pkg//:pkg.bzl", "pkg_deb", "pkg_tar") +package(licenses = ["notice"]) go_binary( name = "runsc", @@ -9,7 +8,7 @@ go_binary( "main.go", "version.go", ], - pure = "on", + pure = True, visibility = [ "//visibility:public", ], @@ -26,10 +25,12 @@ go_binary( ) # The runsc-race target is a race-compatible BUILD target. This must be built -# via "bazel build --features=race //runsc:runsc-race", since the race feature -# must apply to all dependencies due a bug in gazelle file selection. The pure -# attribute must be off because the race detector requires linking with non-Go -# components, although we still require a static binary. +# via: bazel build --features=race //runsc:runsc-race +# +# This is neccessary because the race feature must apply to all dependencies +# due a bug in gazelle file selection. The pure attribute must be off because +# the race detector requires linking with non-Go components, although we still +# require a static binary. # # Note that in the future this might be convertible to a compatible target by # using the pure and static attributes within a select function, but select is @@ -42,7 +43,7 @@ go_binary( "main.go", "version.go", ], - static = "on", + static = True, visibility = [ "//visibility:public", ], @@ -82,7 +83,12 @@ genrule( # because they are assumes to be hermetic). srcs = [":runsc"], outs = ["version.txt"], - cmd = "$(location :runsc) -version | grep 'runsc version' | sed 's/^[^0-9]*//' > $@", + # Note that the little dance here is necessary because files in the $(SRCS) + # attribute are not executable by default, and we can't touch in place. + cmd = "cp $(location :runsc) $(@D)/runsc && \ + chmod a+x $(@D)/runsc && \ + $(@D)/runsc -version | grep version | sed 's/^[^0-9]*//' > $@ && \ + rm -f $(@D)/runsc", stamp = 1, ) @@ -109,5 +115,6 @@ sh_test( name = "version_test", size = "small", srcs = ["version_test.sh"], + args = ["$(location :runsc)"], data = [":runsc"], ) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 3e20f8f2f..f3ebc0231 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -23,7 +23,6 @@ go_library( "strace.go", "user.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot", visibility = [ "//runsc:__subpackages__", "//test:__subpackages__", @@ -107,7 +106,7 @@ go_test( "loader_test.go", "user_test.go", ], - embed = [":boot"], + library = ":boot", deps = [ "//pkg/control/server", "//pkg/log", diff --git a/runsc/boot/filter/BUILD b/runsc/boot/filter/BUILD index 3a9dcfc04..ce30f6c53 100644 --- a/runsc/boot/filter/BUILD +++ b/runsc/boot/filter/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.dev/gvisor/runsc/boot/filter", visibility = [ "//runsc/boot:__subpackages__", ], diff --git a/runsc/boot/platforms/BUILD b/runsc/boot/platforms/BUILD index 03391cdca..77774f43c 100644 --- a/runsc/boot/platforms/BUILD +++ b/runsc/boot/platforms/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "platforms", srcs = ["platforms.go"], - importpath = "gvisor.dev/gvisor/runsc/boot/platforms", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/cgroup/BUILD b/runsc/cgroup/BUILD index d6165f9e5..d4c7bdfbb 100644 --- a/runsc/cgroup/BUILD +++ b/runsc/cgroup/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "cgroup", srcs = ["cgroup.go"], - importpath = "gvisor.dev/gvisor/runsc/cgroup", visibility = ["//:sandbox"], deps = [ "//pkg/log", @@ -19,6 +18,6 @@ go_test( name = "cgroup_test", size = "small", srcs = ["cgroup_test.go"], - embed = [":cgroup"], + library = ":cgroup", tags = ["local"], ) diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index b94bc4fa0..09aa46434 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -34,7 +34,6 @@ go_library( "syscalls.go", "wait.go", ], - importpath = "gvisor.dev/gvisor/runsc/cmd", visibility = [ "//runsc:__subpackages__", ], @@ -73,7 +72,7 @@ go_test( data = [ "//runsc", ], - embed = [":cmd"], + library = ":cmd", deps = [ "//pkg/abi/linux", "//pkg/log", diff --git a/runsc/console/BUILD b/runsc/console/BUILD index e623c1a0f..06924bccd 100644 --- a/runsc/console/BUILD +++ b/runsc/console/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -7,7 +7,6 @@ go_library( srcs = [ "console.go", ], - importpath = "gvisor.dev/gvisor/runsc/console", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 6dea179e4..e21431e4c 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "state_file.go", "status.go", ], - importpath = "gvisor.dev/gvisor/runsc/container", visibility = [ "//runsc:__subpackages__", "//test:__subpackages__", @@ -42,7 +41,7 @@ go_test( "//runsc", "//runsc/container/test_app", ], - embed = [":container"], + library = ":container", shard_count = 5, tags = [ "requires-kvm", diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD index bfd338bb6..e200bafd9 100644 --- a/runsc/container/test_app/BUILD +++ b/runsc/container/test_app/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) @@ -9,7 +9,7 @@ go_binary( "fds.go", "test_app.go", ], - pure = "on", + pure = True, visibility = ["//runsc/container:__pkg__"], deps = [ "//pkg/unet", diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD index 558133a0e..8a571a000 100644 --- a/runsc/criutil/BUILD +++ b/runsc/criutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "criutil", testonly = 1, srcs = ["criutil.go"], - importpath = "gvisor.dev/gvisor/runsc/criutil", visibility = ["//:sandbox"], deps = ["//runsc/testutil"], ) diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD index 0e0423504..8621af901 100644 --- a/runsc/dockerutil/BUILD +++ b/runsc/dockerutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "dockerutil", testonly = 1, srcs = ["dockerutil.go"], - importpath = "gvisor.dev/gvisor/runsc/dockerutil", visibility = ["//:sandbox"], deps = [ "//runsc/testutil", diff --git a/runsc/fsgofer/BUILD b/runsc/fsgofer/BUILD index a9582d92b..64a406ae2 100644 --- a/runsc/fsgofer/BUILD +++ b/runsc/fsgofer/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,10 +10,7 @@ go_library( "fsgofer_arm64_unsafe.go", "fsgofer_unsafe.go", ], - importpath = "gvisor.dev/gvisor/runsc/fsgofer", - visibility = [ - "//runsc:__subpackages__", - ], + visibility = ["//runsc:__subpackages__"], deps = [ "//pkg/abi/linux", "//pkg/fd", @@ -30,7 +27,7 @@ go_test( name = "fsgofer_test", size = "small", srcs = ["fsgofer_test.go"], - embed = [":fsgofer"], + library = ":fsgofer", deps = [ "//pkg/log", "//pkg/p9", diff --git a/runsc/fsgofer/filter/BUILD b/runsc/fsgofer/filter/BUILD index bac73f89d..82b48ef32 100644 --- a/runsc/fsgofer/filter/BUILD +++ b/runsc/fsgofer/filter/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -13,7 +13,6 @@ go_library( "extra_filters_race.go", "filter.go", ], - importpath = "gvisor.dev/gvisor/runsc/fsgofer/filter", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/sandbox/BUILD b/runsc/sandbox/BUILD index ddbc37456..c95d50294 100644 --- a/runsc/sandbox/BUILD +++ b/runsc/sandbox/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -9,7 +9,6 @@ go_library( "network_unsafe.go", "sandbox.go", ], - importpath = "gvisor.dev/gvisor/runsc/sandbox", visibility = [ "//runsc:__subpackages__", ], diff --git a/runsc/specutils/BUILD b/runsc/specutils/BUILD index 205638803..4ccd77f63 100644 --- a/runsc/specutils/BUILD +++ b/runsc/specutils/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,6 @@ go_library( "namespace.go", "specutils.go", ], - importpath = "gvisor.dev/gvisor/runsc/specutils", visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", @@ -28,6 +27,6 @@ go_test( name = "specutils_test", size = "small", srcs = ["specutils_test.go"], - embed = [":specutils"], + library = ":specutils", deps = ["@com_github_opencontainers_runtime-spec//specs-go:go_default_library"], ) diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD index 3c3027cb5..f845120b0 100644 --- a/runsc/testutil/BUILD +++ b/runsc/testutil/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -6,7 +6,6 @@ go_library( name = "testutil", testonly = 1, srcs = ["testutil.go"], - importpath = "gvisor.dev/gvisor/runsc/testutil", visibility = ["//:sandbox"], deps = [ "//pkg/log", diff --git a/runsc/version_test.sh b/runsc/version_test.sh index cc0ca3f05..747350654 100755 --- a/runsc/version_test.sh +++ b/runsc/version_test.sh @@ -16,7 +16,7 @@ set -euf -x -o pipefail -readonly runsc="${TEST_SRCDIR}/__main__/runsc/linux_amd64_pure_stripped/runsc" +readonly runsc="$1" readonly version=$($runsc --version) # Version should should not match VERSION, which is the default and which will diff --git a/scripts/common.sh b/scripts/common.sh index fdb1aa142..cd91b9f8e 100755 --- a/scripts/common.sh +++ b/scripts/common.sh @@ -16,11 +16,7 @@ set -xeou pipefail -if [[ -f $(dirname $0)/common_google.sh ]]; then - source $(dirname $0)/common_google.sh -else - source $(dirname $0)/common_bazel.sh -fi +source $(dirname $0)/common_build.sh # Ensure it attempts to collect logs in all cases. trap collect_logs EXIT diff --git a/scripts/common_bazel.sh b/scripts/common_bazel.sh deleted file mode 100755 index a473a88a4..000000000 --- a/scripts/common_bazel.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# Copyright 2019 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Install the latest version of Bazel and log the version. -(which use_bazel.sh && use_bazel.sh latest) || which bazel -bazel version - -# Switch into the workspace; only necessary if run with kokoro. -if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then - cd git/repo -elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then - cd github/repo -fi - -# Set the standard bazel flags. -declare -r BAZEL_FLAGS=( - "--show_timestamps" - "--test_output=errors" - "--keep_going" - "--verbose_failures=true" -) -if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then - declare -r BAZEL_RBE_AUTH_FLAGS=( - "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}" - ) - declare -r BAZEL_RBE_FLAGS=("--config=remote") -fi - -# Wrap bazel. -function build() { - bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 | - tee /dev/fd/2 | grep -E '^ bazel-bin/' | awk '{ print $1; }' -} - -function test() { - bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" -} - -function run() { - local binary=$1 - shift - bazel run "${binary}" -- "$@" -} - -function run_as_root() { - local binary=$1 - shift - bazel run --run_under="sudo" "${binary}" -- "$@" -} - -function collect_logs() { - # Zip out everything into a convenient form. - if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then - # Merge results files of all shards for each test suite. - for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do - junitparser merge `find $d -name test.xml` $d/test.xml - cat $d/shard_*_of_*/test.log > $d/test.log - ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip - done - find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf - # Move test logs to Kokoro directory. tar is used to conveniently perform - # renames while moving files. - find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" | - tar --create --files-from - --transform 's/test\./sponge_log./' | - tar --extract --directory ${KOKORO_ARTIFACTS_DIR} - - # Collect sentry logs, if any. - if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then - # Check if the directory is empty or not (only the first line it needed). - local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1) - if [[ "${logs}" ]]; then - local -r archive=runsc_logs_"${RUNTIME}".tar.gz - if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then - echo "runsc logs will be uploaded to:" - echo " gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp" - echo " https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}" - fi - tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" . - fi - fi - fi -} - -function find_branch_name() { - git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename -} diff --git a/scripts/common_build.sh b/scripts/common_build.sh new file mode 100755 index 000000000..a473a88a4 --- /dev/null +++ b/scripts/common_build.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# Copyright 2019 The gVisor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Install the latest version of Bazel and log the version. +(which use_bazel.sh && use_bazel.sh latest) || which bazel +bazel version + +# Switch into the workspace; only necessary if run with kokoro. +if [[ -v KOKORO_GIT_COMMIT ]] && [[ -d git/repo ]]; then + cd git/repo +elif [[ -v KOKORO_GIT_COMMIT ]] && [[ -d github/repo ]]; then + cd github/repo +fi + +# Set the standard bazel flags. +declare -r BAZEL_FLAGS=( + "--show_timestamps" + "--test_output=errors" + "--keep_going" + "--verbose_failures=true" +) +if [[ -v KOKORO_BAZEL_AUTH_CREDENTIAL ]]; then + declare -r BAZEL_RBE_AUTH_FLAGS=( + "--auth_credentials=${KOKORO_BAZEL_AUTH_CREDENTIAL}" + ) + declare -r BAZEL_RBE_FLAGS=("--config=remote") +fi + +# Wrap bazel. +function build() { + bazel build "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" 2>&1 | + tee /dev/fd/2 | grep -E '^ bazel-bin/' | awk '{ print $1; }' +} + +function test() { + bazel test "${BAZEL_RBE_FLAGS[@]}" "${BAZEL_RBE_AUTH_FLAGS[@]}" "${BAZEL_FLAGS[@]}" "$@" +} + +function run() { + local binary=$1 + shift + bazel run "${binary}" -- "$@" +} + +function run_as_root() { + local binary=$1 + shift + bazel run --run_under="sudo" "${binary}" -- "$@" +} + +function collect_logs() { + # Zip out everything into a convenient form. + if [[ -v KOKORO_ARTIFACTS_DIR ]] && [[ -e bazel-testlogs ]]; then + # Merge results files of all shards for each test suite. + for d in `find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs dirname | sort | uniq`; do + junitparser merge `find $d -name test.xml` $d/test.xml + cat $d/shard_*_of_*/test.log > $d/test.log + ls -l $d/shard_*_of_*/test.outputs/outputs.zip && zip -r -1 $d/outputs.zip $d/shard_*_of_*/test.outputs/outputs.zip + done + find -L "bazel-testlogs" -name 'shard_*_of_*' | xargs rm -rf + # Move test logs to Kokoro directory. tar is used to conveniently perform + # renames while moving files. + find -L "bazel-testlogs" -name "test.xml" -o -name "test.log" -o -name "outputs.zip" | + tar --create --files-from - --transform 's/test\./sponge_log./' | + tar --extract --directory ${KOKORO_ARTIFACTS_DIR} + + # Collect sentry logs, if any. + if [[ -v RUNSC_LOGS_DIR ]] && [[ -d "${RUNSC_LOGS_DIR}" ]]; then + # Check if the directory is empty or not (only the first line it needed). + local -r logs=$(ls "${RUNSC_LOGS_DIR}" | head -n1) + if [[ "${logs}" ]]; then + local -r archive=runsc_logs_"${RUNTIME}".tar.gz + if [[ -v KOKORO_BUILD_ARTIFACTS_SUBDIR ]]; then + echo "runsc logs will be uploaded to:" + echo " gsutil cp gs://gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive} /tmp" + echo " https://storage.cloud.google.com/gvisor/logs/${KOKORO_BUILD_ARTIFACTS_SUBDIR}/${archive}" + fi + tar --create --gzip --file="${KOKORO_ARTIFACTS_DIR}/${archive}" -C "${RUNSC_LOGS_DIR}" . + fi + fi + fi +} + +function find_branch_name() { + git branch --show-current || git rev-parse HEAD || bazel info workspace | xargs basename +} diff --git a/test/BUILD b/test/BUILD index bf834d994..34b950644 100644 --- a/test/BUILD +++ b/test/BUILD @@ -1,44 +1 @@ -package(licenses = ["notice"]) # Apache 2.0 - -# We need to define a bazel platform and toolchain to specify dockerPrivileged -# and dockerRunAsRoot options, they are required to run tests on the RBE -# cluster in Kokoro. -alias( - name = "rbe_ubuntu1604", - actual = ":rbe_ubuntu1604_r346485", -) - -platform( - name = "rbe_ubuntu1604_r346485", - constraint_values = [ - "@bazel_tools//platforms:x86_64", - "@bazel_tools//platforms:linux", - "@bazel_tools//tools/cpp:clang", - "@bazel_toolchains//constraints:xenial", - "@bazel_toolchains//constraints/sanitizers:support_msan", - ], - remote_execution_properties = """ - properties: { - name: "container-image" - value:"docker://gcr.io/cloud-marketplace/google/rbe-ubuntu16-04@sha256:93f7e127196b9b653d39830c50f8b05d49ef6fd8739a9b5b8ab16e1df5399e50" - } - properties: { - name: "dockerAddCapabilities" - value: "SYS_ADMIN" - } - properties: { - name: "dockerPrivileged" - value: "true" - } - """, -) - -toolchain( - name = "cc-toolchain-clang-x86_64-default", - exec_compatible_with = [ - ], - target_compatible_with = [ - ], - toolchain = "@bazel_toolchains//configs/ubuntu16_04_clang/10.0.0/bazel_2.0.0/cc:cc-compiler-k8", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) +package(licenses = ["notice"]) diff --git a/test/e2e/BUILD b/test/e2e/BUILD index 4fe03a220..76e04f878 100644 --- a/test/e2e/BUILD +++ b/test/e2e/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -10,7 +10,7 @@ go_test( "integration_test.go", "regression_test.go", ], - embed = [":integration"], + library = ":integration", tags = [ # Requires docker and runsc to be configured before the test runs. "manual", @@ -29,5 +29,4 @@ go_test( go_library( name = "integration", srcs = ["integration.go"], - importpath = "gvisor.dev/gvisor/test/integration", ) diff --git a/test/image/BUILD b/test/image/BUILD index 09b0a0ad5..7392ac54e 100644 --- a/test/image/BUILD +++ b/test/image/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -14,7 +14,7 @@ go_test( "ruby.rb", "ruby.sh", ], - embed = [":image"], + library = ":image", tags = [ # Requires docker and runsc to be configured before the test runs. "manual", @@ -30,5 +30,4 @@ go_test( go_library( name = "image", srcs = ["image.go"], - importpath = "gvisor.dev/gvisor/test/image", ) diff --git a/test/iptables/BUILD b/test/iptables/BUILD index 22f470092..6bb3b82b5 100644 --- a/test/iptables/BUILD +++ b/test/iptables/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "iptables_util.go", "nat.go", ], - importpath = "gvisor.dev/gvisor/test/iptables", visibility = ["//test/iptables:__subpackages__"], deps = [ "//runsc/testutil", @@ -24,7 +23,7 @@ go_test( srcs = [ "iptables_test.go", ], - embed = [":iptables"], + library = ":iptables", tags = [ "local", "manual", diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD index a5b6f082c..b9199387a 100644 --- a/test/iptables/runner/BUILD +++ b/test/iptables/runner/BUILD @@ -1,15 +1,21 @@ -load("@io_bazel_rules_docker//go:image.bzl", "go_image") -load("@io_bazel_rules_docker//container:container.bzl", "container_image") +load("//tools:defs.bzl", "container_image", "go_binary", "go_image") package(licenses = ["notice"]) +go_binary( + name = "runner", + testonly = 1, + srcs = ["main.go"], + deps = ["//test/iptables"], +) + container_image( name = "iptables-base", base = "@iptables-test//image", ) go_image( - name = "runner", + name = "runner-image", testonly = 1, srcs = ["main.go"], base = ":iptables-base", diff --git a/test/root/BUILD b/test/root/BUILD index d5dd9bca2..23ce2a70f 100644 --- a/test/root/BUILD +++ b/test/root/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "root", srcs = ["root.go"], - importpath = "gvisor.dev/gvisor/test/root", ) go_test( @@ -21,7 +20,7 @@ go_test( data = [ "//runsc", ], - embed = [":root"], + library = ":root", tags = [ # Requires docker and runsc to be configured before the test runs. # Also test only runs as root. diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD index 125633680..bca5f9cab 100644 --- a/test/root/testdata/BUILD +++ b/test/root/testdata/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -12,7 +12,6 @@ go_library( "sandbox.go", "simple.go", ], - importpath = "gvisor.dev/gvisor/test/root/testdata", visibility = [ "//visibility:public", ], diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD index 367295206..2c472bf8d 100644 --- a/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -1,6 +1,6 @@ # These packages are used to run language runtime tests inside gVisor sandboxes. -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_test") load("//test/runtimes:build_defs.bzl", "runtime_test") package(licenses = ["notice"]) @@ -49,5 +49,5 @@ go_test( name = "blacklist_test", size = "small", srcs = ["blacklist_test.go"], - embed = [":runner"], + library = ":runner", ) diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl index 6f84ca852..92e275a76 100644 --- a/test/runtimes/build_defs.bzl +++ b/test/runtimes/build_defs.bzl @@ -1,6 +1,6 @@ """Defines a rule for runtime test targets.""" -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test", "loopback") def runtime_test( name, @@ -34,6 +34,7 @@ def runtime_test( ] data = [ ":runner", + loopback, ] if blacklist_file: args += ["--blacklist_file", "test/runtimes/" + blacklist_file] @@ -61,7 +62,7 @@ def blacklist_test(name, blacklist_file): """Test that a blacklist parses correctly.""" go_test( name = name + "_blacklist_test", - embed = [":runner"], + library = ":runner", srcs = ["blacklist_test.go"], args = ["--blacklist_file", "test/runtimes/" + blacklist_file], data = [blacklist_file], diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD index 09dc6c42f..85e004c45 100644 --- a/test/runtimes/images/proctor/BUILD +++ b/test/runtimes/images/proctor/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_test") +load("//tools:defs.bzl", "go_binary", "go_test") package(licenses = ["notice"]) @@ -19,7 +19,7 @@ go_test( name = "proctor_test", size = "small", srcs = ["proctor_test.go"], - embed = [":proctor"], + library = ":proctor", deps = [ "//runsc/testutil", ], diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 90d52e73b..40e974314 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") load("//test/syscalls:build_defs.bzl", "syscall_test") package(licenses = ["notice"]) diff --git a/test/syscalls/build_defs.bzl b/test/syscalls/build_defs.bzl index aaf77c65b..1df761dd0 100644 --- a/test/syscalls/build_defs.bzl +++ b/test/syscalls/build_defs.bzl @@ -1,5 +1,7 @@ """Defines a rule for syscall test targets.""" +load("//tools:defs.bzl", "loopback") + # syscall_test is a macro that will create targets to run the given test target # on the host (native) and runsc. def syscall_test( @@ -135,6 +137,7 @@ def _syscall_test( name = name, data = [ ":syscall_test_runner", + loopback, test, ], args = args, @@ -148,6 +151,3 @@ def sh_test(**kwargs): native.sh_test( **kwargs ) - -def select_for_linux(for_linux, for_others = []): - return for_linux diff --git a/test/syscalls/gtest/BUILD b/test/syscalls/gtest/BUILD index 9293f25cb..de4b2727c 100644 --- a/test/syscalls/gtest/BUILD +++ b/test/syscalls/gtest/BUILD @@ -1,12 +1,9 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "gtest", srcs = ["gtest.go"], - importpath = "gvisor.dev/gvisor/test/syscalls/gtest", - visibility = [ - "//test:__subpackages__", - ], + visibility = ["//:sandbox"], ) diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 4c7ec3f06..c2ef50c1d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1,5 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") -load("//test/syscalls:build_defs.bzl", "select_for_linux") +load("//tools:defs.bzl", "cc_binary", "cc_library", "default_net_util", "select_system") package( default_visibility = ["//:sandbox"], @@ -126,13 +125,11 @@ cc_library( testonly = 1, srcs = [ "socket_test_util.cc", - ] + select_for_linux( - [ - "socket_test_util_impl.cc", - ], - ), + "socket_test_util_impl.cc", + ], hdrs = ["socket_test_util.h"], - deps = [ + defines = select_system(), + deps = default_net_util() + [ "@com_google_googletest//:gtest", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -143,8 +140,7 @@ cc_library( "//test/util:temp_path", "//test/util:test_util", "//test/util:thread_util", - ] + select_for_linux([ - ]), + ], ) cc_library( @@ -1443,6 +1439,7 @@ cc_binary( srcs = ["arch_prctl.cc"], linkstatic = 1, deps = [ + "//test/util:file_descriptor", "//test/util:test_main", "//test/util:test_util", "@com_google_googletest//:gtest", @@ -3383,11 +3380,11 @@ cc_library( name = "udp_socket_test_cases", testonly = 1, srcs = [ - "udp_socket_test_cases.cc", - ] + select_for_linux([ "udp_socket_errqueue_test_case.cc", - ]), + "udp_socket_test_cases.cc", + ], hdrs = ["udp_socket_test_cases.h"], + defines = select_system(), deps = [ ":socket_test_util", ":unix_domain_socket_test_util", diff --git a/test/syscalls/linux/arch_prctl.cc b/test/syscalls/linux/arch_prctl.cc index 81bf5a775..3a901faf5 100644 --- a/test/syscalls/linux/arch_prctl.cc +++ b/test/syscalls/linux/arch_prctl.cc @@ -14,8 +14,10 @@ #include #include +#include #include "gtest/gtest.h" +#include "test/util/file_descriptor.h" #include "test/util/test_util.h" // glibc does not provide a prototype for arch_prctl() so declare it here. diff --git a/test/syscalls/linux/rseq/BUILD b/test/syscalls/linux/rseq/BUILD index 5cfe4e56f..ed488dbc2 100644 --- a/test/syscalls/linux/rseq/BUILD +++ b/test/syscalls/linux/rseq/BUILD @@ -1,8 +1,7 @@ # This package contains a standalone rseq test binary. This binary must not # depend on libc, which might use rseq itself. -load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier") -load("@rules_cc//cc:defs.bzl", "cc_library") +load("//tools:defs.bzl", "cc_flags_supplier", "cc_library", "cc_toolchain") package(licenses = ["notice"]) @@ -37,8 +36,8 @@ genrule( "$(location start.S)", ]), toolchains = [ + cc_toolchain, ":no_pie_cc_flags", - "@bazel_tools//tools/cpp:current_cc_toolchain", ], visibility = ["//:sandbox"], ) diff --git a/test/syscalls/linux/udp_socket_errqueue_test_case.cc b/test/syscalls/linux/udp_socket_errqueue_test_case.cc index 147978f46..9a24e1df0 100644 --- a/test/syscalls/linux/udp_socket_errqueue_test_case.cc +++ b/test/syscalls/linux/udp_socket_errqueue_test_case.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef __fuchsia__ + #include "test/syscalls/linux/udp_socket_test_cases.h" #include @@ -52,3 +54,5 @@ TEST_P(UdpSocketTest, ErrorQueue) { } // namespace testing } // namespace gvisor + +#endif // __fuchsia__ diff --git a/test/uds/BUILD b/test/uds/BUILD index a3843e699..51e2c7ce8 100644 --- a/test/uds/BUILD +++ b/test/uds/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package( default_visibility = ["//:sandbox"], @@ -9,7 +9,6 @@ go_library( name = "uds", testonly = 1, srcs = ["uds.go"], - importpath = "gvisor.dev/gvisor/test/uds", deps = [ "//pkg/log", "//pkg/unet", diff --git a/test/util/BUILD b/test/util/BUILD index cbc728159..3c732be62 100644 --- a/test/util/BUILD +++ b/test/util/BUILD @@ -1,5 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test") -load("//test/syscalls:build_defs.bzl", "select_for_linux") +load("//tools:defs.bzl", "cc_library", "cc_test", "select_system") package( default_visibility = ["//:sandbox"], @@ -142,12 +141,13 @@ cc_library( cc_library( name = "save_util", testonly = 1, - srcs = ["save_util.cc"] + - select_for_linux( - ["save_util_linux.cc"], - ["save_util_other.cc"], - ), + srcs = [ + "save_util.cc", + "save_util_linux.cc", + "save_util_other.cc", + ], hdrs = ["save_util.h"], + defines = select_system(), ) cc_library( @@ -234,13 +234,16 @@ cc_library( testonly = 1, srcs = [ "test_util.cc", - ] + select_for_linux( - [ - "test_util_impl.cc", - "test_util_runfiles.cc", + "test_util_impl.cc", + "test_util_runfiles.cc", + ], + hdrs = ["test_util.h"], + defines = select_system( + fuchsia = [ + "__opensource__", + "__fuchsia__", ], ), - hdrs = ["test_util.h"], deps = [ ":fs_util", ":logging", diff --git a/test/util/save_util_linux.cc b/test/util/save_util_linux.cc index cd56118c0..d0aea8e6a 100644 --- a/test/util/save_util_linux.cc +++ b/test/util/save_util_linux.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef __linux__ + #include #include #include @@ -43,3 +45,5 @@ void MaybeSave() { } // namespace testing } // namespace gvisor + +#endif diff --git a/test/util/save_util_other.cc b/test/util/save_util_other.cc index 1aca663b7..931af2c29 100644 --- a/test/util/save_util_other.cc +++ b/test/util/save_util_other.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef __linux__ + namespace gvisor { namespace testing { @@ -21,3 +23,5 @@ void MaybeSave() { } // namespace testing } // namespace gvisor + +#endif diff --git a/test/util/test_util_runfiles.cc b/test/util/test_util_runfiles.cc index 7210094eb..694d21692 100644 --- a/test/util/test_util_runfiles.cc +++ b/test/util/test_util_runfiles.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef __fuchsia__ + #include #include @@ -44,3 +46,5 @@ std::string RunfilePath(std::string path) { } // namespace testing } // namespace gvisor + +#endif // __fuchsia__ diff --git a/tools/BUILD b/tools/BUILD new file mode 100644 index 000000000..e73a9c885 --- /dev/null +++ b/tools/BUILD @@ -0,0 +1,3 @@ +package(licenses = ["notice"]) + +exports_files(["nogo.js"]) diff --git a/tools/build/BUILD b/tools/build/BUILD new file mode 100644 index 000000000..0c0ce3f4d --- /dev/null +++ b/tools/build/BUILD @@ -0,0 +1,10 @@ +package(licenses = ["notice"]) + +# In bazel, no special support is required for loopback networking. This is +# just a dummy data target that does not change the test environment. +genrule( + name = "loopback", + outs = ["loopback.txt"], + cmd = "touch $@", + visibility = ["//visibility:public"], +) diff --git a/tools/build/defs.bzl b/tools/build/defs.bzl new file mode 100644 index 000000000..d0556abd1 --- /dev/null +++ b/tools/build/defs.bzl @@ -0,0 +1,91 @@ +"""Bazel implementations of standard rules.""" + +load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", _cc_flags_supplier = "cc_flags_supplier") +load("@io_bazel_rules_go//go:def.bzl", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_library = "go_library", _go_test = "go_test", _go_tool_library = "go_tool_library") +load("@io_bazel_rules_go//proto:def.bzl", _go_proto_library = "go_proto_library") +load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test") +load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") +load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image") +load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image") +load("@pydeps//:requirements.bzl", _py_requirement = "requirement") + +container_image = _container_image +cc_binary = _cc_binary +cc_library = _cc_library +cc_flags_supplier = _cc_flags_supplier +cc_proto_library = _cc_proto_library +cc_test = _cc_test +cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain" +go_image = _go_image +go_embed_data = _go_embed_data +loopback = "//tools/build:loopback" +proto_library = native.proto_library +pkg_deb = _pkg_deb +pkg_tar = _pkg_tar +py_library = native.py_library +py_binary = native.py_binary +py_test = native.py_test + +def go_binary(name, static = False, pure = False, **kwargs): + if static: + kwargs["static"] = "on" + if pure: + kwargs["pure"] = "on" + _go_binary( + name = name, + **kwargs + ) + +def go_library(name, **kwargs): + _go_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name(), + **kwargs + ) + +def go_tool_library(name, **kwargs): + _go_tool_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name(), + **kwargs + ) + +def go_proto_library(name, proto, **kwargs): + deps = kwargs.pop("deps", []) + _go_proto_library( + name = name, + importpath = "gvisor.dev/gvisor/" + native.package_name() + "/" + name, + proto = proto, + deps = [dep.replace("_proto", "_go_proto") for dep in deps], + **kwargs + ) + +def go_test(name, **kwargs): + library = kwargs.pop("library", None) + if library: + kwargs["embed"] = [library] + _go_test( + name = name, + **kwargs + ) + +def py_requirement(name, direct = False): + return _py_requirement(name) + +def select_arch(amd64 = "amd64", arm64 = "arm64", default = None, **kwargs): + values = { + "@bazel_tools//src/conditions:linux_x86_64": amd64, + "@bazel_tools//src/conditions:linux_aarch64": arm64, + } + if default: + values["//conditions:default"] = default + return select(values, **kwargs) + +def select_system(linux = ["__linux__"], **kwargs): + return linux # Only Linux supported. + +def default_installer(): + return None + +def default_net_util(): + return [] # Nothing needed. diff --git a/tools/checkunsafe/BUILD b/tools/checkunsafe/BUILD index d85c56131..92ba8ab06 100644 --- a/tools/checkunsafe/BUILD +++ b/tools/checkunsafe/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_tool_library") +load("//tools:defs.bzl", "go_tool_library") package(licenses = ["notice"]) go_tool_library( name = "checkunsafe", srcs = ["check_unsafe.go"], - importpath = "checkunsafe", visibility = ["//visibility:public"], deps = [ "@org_golang_x_tools//go/analysis:go_tool_library", diff --git a/tools/defs.bzl b/tools/defs.bzl new file mode 100644 index 000000000..819f12b0d --- /dev/null +++ b/tools/defs.bzl @@ -0,0 +1,154 @@ +"""Wrappers for common build rules. + +These wrappers apply common BUILD configurations (e.g., proto_library +automagically creating cc_ and go_ proto targets) and act as a single point of +change for Google-internal and bazel-compatible rules. +""" + +load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") +load("//tools/build:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _go_tool_library = "go_tool_library", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system") + +# Delegate directly. +cc_binary = _cc_binary +cc_library = _cc_library +cc_test = _cc_test +cc_toolchain = _cc_toolchain +cc_flags_supplier = _cc_flags_supplier +container_image = _container_image +go_embed_data = _go_embed_data +go_image = _go_image +go_test = _go_test +go_tool_library = _go_tool_library +pkg_deb = _pkg_deb +pkg_tar = _pkg_tar +py_library = _py_library +py_binary = _py_binary +py_test = _py_test +py_requirement = _py_requirement +select_arch = _select_arch +select_system = _select_system +loopback = _loopback +default_installer = _default_installer +default_net_util = _default_net_util + +def go_binary(name, **kwargs): + """Wraps the standard go_binary. + + Args: + name: the rule name. + **kwargs: standard go_binary arguments. + """ + _go_binary( + name = name, + **kwargs + ) + +def go_library(name, srcs, deps = [], imports = [], stateify = True, marshal = False, **kwargs): + """Wraps the standard go_library and does stateification and marshalling. + + The recommended way is to use this rule with mostly identical configuration as the native + go_library rule. + + These definitions provide additional flags (stateify, marshal) that can be used + with the generators to automatically supplement the library code. + + load("//tools:defs.bzl", "go_library") + + go_library( + name = "foo", + srcs = ["foo.go"], + ) + + Args: + name: the rule name. + srcs: the library sources. + deps: the library dependencies. + imports: imports required for stateify. + stateify: whether statify is enabled (default: true). + marshal: whether marshal is enabled (default: false). + **kwargs: standard go_library arguments. + """ + if stateify: + # Only do stateification for non-state packages without manual autogen. + go_stateify( + name = name + "_state_autogen", + srcs = [src for src in srcs if src.endswith(".go")], + imports = imports, + package = name, + arch = select_arch(), + out = name + "_state_autogen.go", + ) + all_srcs = srcs + [name + "_state_autogen.go"] + if "//pkg/state" not in deps: + all_deps = deps + ["//pkg/state"] + else: + all_deps = deps + else: + all_deps = deps + all_srcs = srcs + if marshal: + go_marshal( + name = name + "_abi_autogen", + srcs = [src for src in srcs if src.endswith(".go")], + debug = False, + imports = imports, + package = name, + ) + extra_deps = [ + dep + for dep in marshal_deps + if not dep in all_deps + ] + all_deps = all_deps + extra_deps + all_srcs = srcs + [name + "_abi_autogen_unsafe.go"] + + _go_library( + name = name, + srcs = all_srcs, + deps = all_deps, + **kwargs + ) + + if marshal: + # Ignore importpath for go_test. + kwargs.pop("importpath", None) + + _go_test( + name = name + "_abi_autogen_test", + srcs = [name + "_abi_autogen_test.go"], + library = ":" + name, + deps = marshal_test_deps, + **kwargs + ) + +def proto_library(name, srcs, **kwargs): + """Wraps the standard proto_library. + + Given a proto_library named "foo", this produces three different targets: + - foo_proto: proto_library rule. + - foo_go_proto: go_proto_library rule. + - foo_cc_proto: cc_proto_library rule. + + Args: + srcs: the proto sources. + **kwargs: standard proto_library arguments. + """ + deps = kwargs.pop("deps", []) + _proto_library( + name = name + "_proto", + srcs = srcs, + deps = deps, + **kwargs + ) + _go_proto_library( + name = name + "_go_proto", + proto = ":" + name + "_proto", + deps = deps, + **kwargs + ) + _cc_proto_library( + name = name + "_cc_proto", + deps = [":" + name + "_proto"], + **kwargs + ) diff --git a/tools/go_generics/BUILD b/tools/go_generics/BUILD index 39318b877..069df3856 100644 --- a/tools/go_generics/BUILD +++ b/tools/go_generics/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_generics/globals/BUILD b/tools/go_generics/globals/BUILD index 74853c7d2..38caa3ce7 100644 --- a/tools/go_generics/globals/BUILD +++ b/tools/go_generics/globals/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) @@ -8,6 +8,6 @@ go_library( "globals_visitor.go", "scope.go", ], - importpath = "gvisor.dev/gvisor/tools/go_generics/globals", + stateify = False, visibility = ["//tools/go_generics:__pkg__"], ) diff --git a/tools/go_generics/go_merge/BUILD b/tools/go_generics/go_merge/BUILD index 02b09120e..b7d35e272 100644 --- a/tools/go_generics/go_merge/BUILD +++ b/tools/go_generics/go_merge/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_generics/rules_tests/BUILD b/tools/go_generics/rules_tests/BUILD index 9d26a88b7..8a329dfc6 100644 --- a/tools/go_generics/rules_tests/BUILD +++ b/tools/go_generics/rules_tests/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") +load("//tools:defs.bzl", "go_test") load("//tools/go_generics:defs.bzl", "go_template", "go_template_instance") package(licenses = ["notice"]) diff --git a/tools/go_marshal/BUILD b/tools/go_marshal/BUILD index c862b277c..80d9c0504 100644 --- a/tools/go_marshal/BUILD +++ b/tools/go_marshal/BUILD @@ -1,6 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") -package(licenses = ["notice"]) +licenses(["notice"]) go_binary( name = "go_marshal", diff --git a/tools/go_marshal/README.md b/tools/go_marshal/README.md index 481575bd3..4886efddf 100644 --- a/tools/go_marshal/README.md +++ b/tools/go_marshal/README.md @@ -20,19 +20,7 @@ comment `// +marshal`. # Usage -See `defs.bzl`: two new rules are provided, `go_marshal` and `go_library`. - -The recommended way to generate a go library with marshalling is to use the -`go_library` with mostly identical configuration as the native go_library rule. - -``` -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) -``` +See `defs.bzl`: a new rule is provided, `go_marshal`. Under the hood, the `go_marshal` rule is used to generate a file that will appear in a Go target; the output file should appear explicitly in a srcs list. @@ -54,11 +42,7 @@ go_library( "foo.go", "foo_abi.go", ], - deps = [ - "/gvisor/pkg/abi", - "/gvisor/pkg/sentry/safemem/safemem", - "/gvisor/pkg/sentry/usermem/usermem", - ], + ... ) ``` @@ -69,22 +53,6 @@ These tests use reflection to verify properties of the ABI struct, and should be considered part of the generated interfaces (but are too expensive to execute at runtime). Ensure these tests run at some point. -``` -$ cat BUILD -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) -$ blaze build :foo -$ blaze query ... -:foo_abi_autogen -:foo_abi_autogen_test -$ blaze test :foo_abi_autogen_test - -``` - # Restrictions Not all valid go type definitions can be used with `go_marshal`. `go_marshal` is @@ -131,22 +99,6 @@ for embedded structs that are not aligned. Because of this, it's generally best to avoid using `marshal:"unaligned"` and insert explicit padding fields instead. -## Debugging go_marshal - -To enable debugging output from the go marshal tool, pass the `-debug` flag to -the tool. When using the build rules from above, add a `debug = True` field to -the build rule like this: - -``` -load("/gvisor/tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], - debug = True, -) -``` - ## Modifying the `go_marshal` Tool The following are some guidelines for modifying the `go_marshal` tool: diff --git a/tools/go_marshal/analysis/BUILD b/tools/go_marshal/analysis/BUILD index c859ced77..c2a4d45c4 100644 --- a/tools/go_marshal/analysis/BUILD +++ b/tools/go_marshal/analysis/BUILD @@ -1,12 +1,11 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "analysis", testonly = 1, srcs = ["analysis_unsafe.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/analysis", visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl index c32eb559f..2918ceffe 100644 --- a/tools/go_marshal/defs.bzl +++ b/tools/go_marshal/defs.bzl @@ -1,57 +1,14 @@ -"""Marshal is a tool for generating marshalling interfaces for Go types. - -The recommended way is to use the go_library rule defined below with mostly -identical configuration as the native go_library rule. - -load("//tools/go_marshal:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) - -Under the hood, the go_marshal rule is used to generate a file that will -appear in a Go target; the output file should appear explicitly in a srcs list. -For example (the above is still the preferred way): - -load("//tools/go_marshal:defs.bzl", "go_marshal") - -go_marshal( - name = "foo_abi", - srcs = ["foo.go"], - out = "foo_abi.go", - package = "foo", -) - -go_library( - name = "foo", - srcs = [ - "foo.go", - "foo_abi.go", - ], - deps = [ - "//tools/go_marshal:marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", - ], -) -""" - -load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library", _go_test = "go_test") +"""Marshal is a tool for generating marshalling interfaces for Go types.""" def _go_marshal_impl(ctx): """Execute the go_marshal tool.""" output = ctx.outputs.lib output_test = ctx.outputs.test - (build_dir, _, _) = ctx.build_file_path.rpartition("/BUILD") - - decl = "/".join(["gvisor.dev/gvisor", build_dir]) # Run the marshal command. args = ["-output=%s" % output.path] args += ["-pkg=%s" % ctx.attr.package] args += ["-output_test=%s" % output_test.path] - args += ["-declarationPkg=%s" % decl] if ctx.attr.debug: args += ["-debug"] @@ -83,7 +40,6 @@ go_marshal = rule( implementation = _go_marshal_impl, attrs = { "srcs": attr.label_list(mandatory = True, allow_files = True), - "libname": attr.string(mandatory = True), "imports": attr.string_list(mandatory = False), "package": attr.string(mandatory = True), "debug": attr.bool(doc = "enable debugging output from the go_marshal tool"), @@ -95,58 +51,14 @@ go_marshal = rule( }, ) -def go_library(name, srcs, deps = [], imports = [], debug = False, **kwargs): - """wraps the standard go_library and does mashalling interface generation. - - Args: - name: Same as native go_library. - srcs: Same as native go_library. - deps: Same as native go_library. - imports: Extra import paths to pass to the go_marshal tool. - debug: Enables debugging output from the go_marshal tool. - **kwargs: Remaining args to pass to the native go_library rule unmodified. - """ - go_marshal( - name = name + "_abi_autogen", - libname = name, - srcs = [src for src in srcs if src.endswith(".go")], - debug = debug, - imports = imports, - package = name, - ) - - extra_deps = [ - "//tools/go_marshal/marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", - ] - - all_srcs = srcs + [name + "_abi_autogen_unsafe.go"] - all_deps = deps + [] # + extra_deps - - for extra in extra_deps: - if extra not in deps: - all_deps.append(extra) - - _go_library( - name = name, - srcs = all_srcs, - deps = all_deps, - **kwargs - ) - - # Don't pass importpath arg to go_test. - kwargs.pop("importpath", "") - - _go_test( - name = name + "_abi_autogen_test", - srcs = [name + "_abi_autogen_test.go"], - # Generated test has a fixed set of dependencies since we generate these - # tests. They should only depend on the library generated above, and the - # Marshallable interface. - deps = [ - ":" + name, - "//tools/go_marshal/analysis", - ], - **kwargs - ) +# marshal_deps are the dependencies requied by generated code. +marshal_deps = [ + "//tools/go_marshal/marshal", + "//pkg/sentry/platform/safecopy", + "//pkg/sentry/usermem", +] + +# marshal_test_deps are required by test targets. +marshal_test_deps = [ + "//tools/go_marshal/analysis", +] diff --git a/tools/go_marshal/gomarshal/BUILD b/tools/go_marshal/gomarshal/BUILD index a0eae6492..c92b59dd6 100644 --- a/tools/go_marshal/gomarshal/BUILD +++ b/tools/go_marshal/gomarshal/BUILD @@ -1,6 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "gomarshal", @@ -10,7 +10,7 @@ go_library( "generator_tests.go", "util.go", ], - importpath = "gvisor.dev/gvisor/tools/go_marshal/gomarshal", + stateify = False, visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 641ccd938..8392f3f6d 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -62,15 +62,12 @@ type Generator struct { outputTest *os.File // Package name for the generated file. pkg string - // Go import path for package we're processing. This package should directly - // declare the type we're generating code for. - declaration string // Set of extra packages to import in the generated file. imports *importTable } // NewGenerator creates a new code Generator. -func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports []string) (*Generator, error) { +func NewGenerator(srcs []string, out, outTest, pkg string, imports []string) (*Generator, error) { f, err := os.OpenFile(out, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) if err != nil { return nil, fmt.Errorf("Couldn't open output file %q: %v", out, err) @@ -80,12 +77,11 @@ func NewGenerator(srcs []string, out, outTest, pkg, declaration string, imports return nil, fmt.Errorf("Couldn't open test output file %q: %v", out, err) } g := Generator{ - inputs: srcs, - output: f, - outputTest: fTest, - pkg: pkg, - declaration: declaration, - imports: newImportTable(), + inputs: srcs, + output: f, + outputTest: fTest, + pkg: pkg, + imports: newImportTable(), } for _, i := range imports { // All imports on the extra imports list are unconditionally marked as @@ -264,7 +260,7 @@ func (g *Generator) generateOne(t *ast.TypeSpec, fset *token.FileSet) *interface // generateOneTestSuite generates a test suite for the automatically generated // implementations type t. func (g *Generator) generateOneTestSuite(t *ast.TypeSpec) *testGenerator { - i := newTestGenerator(t, g.declaration) + i := newTestGenerator(t) i.emitTests() return i } @@ -359,7 +355,7 @@ func (g *Generator) Run() error { // source file. func (g *Generator) writeTests(ts []*testGenerator) error { var b sourceBuffer - b.emit("package %s_test\n\n", g.pkg) + b.emit("package %s\n\n", g.pkg) if err := b.write(g.outputTest); err != nil { return err } diff --git a/tools/go_marshal/gomarshal/generator_tests.go b/tools/go_marshal/gomarshal/generator_tests.go index df25cb5b2..bcda17c3b 100644 --- a/tools/go_marshal/gomarshal/generator_tests.go +++ b/tools/go_marshal/gomarshal/generator_tests.go @@ -46,7 +46,7 @@ type testGenerator struct { decl *importStmt } -func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator { +func newTestGenerator(t *ast.TypeSpec) *testGenerator { if _, ok := t.Type.(*ast.StructType); !ok { panic(fmt.Sprintf("Attempting to generate code for a not struct type %v", t)) } @@ -59,14 +59,12 @@ func newTestGenerator(t *ast.TypeSpec, declaration string) *testGenerator { for _, i := range standardImports { g.imports.add(i).markUsed() } - g.decl = g.imports.add(declaration) - g.decl.markUsed() return g } func (g *testGenerator) typeName() string { - return fmt.Sprintf("%s.%s", g.decl.name, g.t.Name.Name) + return g.t.Name.Name } func (g *testGenerator) forEachField(fn func(f *ast.Field)) { diff --git a/tools/go_marshal/main.go b/tools/go_marshal/main.go index 3d12eb93c..e1a97b311 100644 --- a/tools/go_marshal/main.go +++ b/tools/go_marshal/main.go @@ -31,11 +31,10 @@ import ( ) var ( - pkg = flag.String("pkg", "", "output package") - output = flag.String("output", "", "output file") - outputTest = flag.String("output_test", "", "output file for tests") - imports = flag.String("imports", "", "comma-separated list of extra packages to import in generated code") - declarationPkg = flag.String("declarationPkg", "", "import path of target declaring the types we're generating on") + pkg = flag.String("pkg", "", "output package") + output = flag.String("output", "", "output file") + outputTest = flag.String("output_test", "", "output file for tests") + imports = flag.String("imports", "", "comma-separated list of extra packages to import in generated code") ) func main() { @@ -62,7 +61,7 @@ func main() { // as an import. extraImports = strings.Split(*imports, ",") } - g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, *declarationPkg, extraImports) + g, err := gomarshal.NewGenerator(flag.Args(), *output, *outputTest, *pkg, extraImports) if err != nil { panic(err) } diff --git a/tools/go_marshal/marshal/BUILD b/tools/go_marshal/marshal/BUILD index 47dda97a1..ad508c72f 100644 --- a/tools/go_marshal/marshal/BUILD +++ b/tools/go_marshal/marshal/BUILD @@ -1,13 +1,12 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "marshal", srcs = [ "marshal.go", ], - importpath = "gvisor.dev/gvisor/tools/go_marshal/marshal", visibility = [ "//:sandbox", ], diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index d412e1ccf..38ba49fed 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -1,7 +1,6 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_test") -load("//tools/go_marshal:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") -package(licenses = ["notice"]) +licenses(["notice"]) package_group( name = "gomarshal_test", @@ -25,6 +24,6 @@ go_library( name = "test", testonly = 1, srcs = ["test.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/test", + marshal = True, deps = ["//tools/go_marshal/test/external"], ) diff --git a/tools/go_marshal/test/external/BUILD b/tools/go_marshal/test/external/BUILD index 9bb89e1da..0cf6da603 100644 --- a/tools/go_marshal/test/external/BUILD +++ b/tools/go_marshal/test/external/BUILD @@ -1,11 +1,11 @@ -load("//tools/go_marshal:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library") -package(licenses = ["notice"]) +licenses(["notice"]) go_library( name = "external", testonly = 1, srcs = ["external.go"], - importpath = "gvisor.dev/gvisor/tools/go_marshal/test/external", + marshal = True, visibility = ["//tools/go_marshal/test:gomarshal_test"], ) diff --git a/tools/go_stateify/BUILD b/tools/go_stateify/BUILD index bb53f8ae9..a133d6f8b 100644 --- a/tools/go_stateify/BUILD +++ b/tools/go_stateify/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/go_stateify/defs.bzl b/tools/go_stateify/defs.bzl index 33267c074..0f261d89f 100644 --- a/tools/go_stateify/defs.bzl +++ b/tools/go_stateify/defs.bzl @@ -1,41 +1,4 @@ -"""Stateify is a tool for generating state wrappers for Go types. - -The recommended way is to use the go_library rule defined below with mostly -identical configuration as the native go_library rule. - -load("//tools/go_stateify:defs.bzl", "go_library") - -go_library( - name = "foo", - srcs = ["foo.go"], -) - -Under the hood, the go_stateify rule is used to generate a file that will -appear in a Go target; the output file should appear explicitly in a srcs list. -For example (the above is still the preferred way): - -load("//tools/go_stateify:defs.bzl", "go_stateify") - -go_stateify( - name = "foo_state", - srcs = ["foo.go"], - out = "foo_state.go", - package = "foo", -) - -go_library( - name = "foo", - srcs = [ - "foo.go", - "foo_state.go", - ], - deps = [ - "//pkg/state", - ], -) -""" - -load("@io_bazel_rules_go//go:def.bzl", _go_library = "go_library") +"""Stateify is a tool for generating state wrappers for Go types.""" def _go_stateify_impl(ctx): """Implementation for the stateify tool.""" @@ -103,43 +66,3 @@ files and must be added to the srcs of the relevant go_library. "_statepkg": attr.string(default = "gvisor.dev/gvisor/pkg/state"), }, ) - -def go_library(name, srcs, deps = [], imports = [], **kwargs): - """Standard go_library wrapped which generates state source files. - - Args: - name: the name of the go_library rule. - srcs: sources of the go_library. Each will be processed for stateify - annotations. - deps: dependencies for the go_library. - imports: an optional list of extra non-aliased, Go-style absolute import - paths required for stateified types. - **kwargs: passed to go_library. - """ - if "encode_unsafe.go" not in srcs and (name + "_state_autogen.go") not in srcs: - # Only do stateification for non-state packages without manual autogen. - go_stateify( - name = name + "_state_autogen", - srcs = [src for src in srcs if src.endswith(".go")], - imports = imports, - package = name, - arch = select({ - "@bazel_tools//src/conditions:linux_aarch64": "arm64", - "//conditions:default": "amd64", - }), - out = name + "_state_autogen.go", - ) - all_srcs = srcs + [name + "_state_autogen.go"] - if "//pkg/state" not in deps: - all_deps = deps + ["//pkg/state"] - else: - all_deps = deps - else: - all_deps = deps - all_srcs = srcs - _go_library( - name = name, - srcs = all_srcs, - deps = all_deps, - **kwargs - ) diff --git a/tools/images/BUILD b/tools/images/BUILD index 2b77c2737..f1699b184 100644 --- a/tools/images/BUILD +++ b/tools/images/BUILD @@ -1,4 +1,4 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary") +load("//tools:defs.bzl", "cc_binary") load("//tools/images:defs.bzl", "vm_image", "vm_test") package( diff --git a/tools/images/defs.bzl b/tools/images/defs.bzl index d8e422a5d..32235813a 100644 --- a/tools/images/defs.bzl +++ b/tools/images/defs.bzl @@ -28,6 +28,8 @@ The vm_test rule can be used to execute a command remotely. For example, ) """ +load("//tools:defs.bzl", "default_installer") + def _vm_image_impl(ctx): script_paths = [] for script in ctx.files.scripts: @@ -165,8 +167,8 @@ def vm_test( targets = kwargs.pop("targets", []) if installer: targets = [installer] + targets - targets = [ - ] + targets + if default_installer(): + targets = [default_installer()] + targets _vm_test( tags = [ "local", diff --git a/tools/issue_reviver/BUILD b/tools/issue_reviver/BUILD index ee7ea11fd..4ef1a3124 100644 --- a/tools/issue_reviver/BUILD +++ b/tools/issue_reviver/BUILD @@ -1,4 +1,4 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_binary") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) diff --git a/tools/issue_reviver/github/BUILD b/tools/issue_reviver/github/BUILD index 6da22ba1c..da4133472 100644 --- a/tools/issue_reviver/github/BUILD +++ b/tools/issue_reviver/github/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools:defs.bzl", "go_library") package(licenses = ["notice"]) go_library( name = "github", srcs = ["github.go"], - importpath = "gvisor.dev/gvisor/tools/issue_reviver/github", visibility = [ "//tools/issue_reviver:__subpackages__", ], diff --git a/tools/issue_reviver/reviver/BUILD b/tools/issue_reviver/reviver/BUILD index 2c3675977..d262932bd 100644 --- a/tools/issue_reviver/reviver/BUILD +++ b/tools/issue_reviver/reviver/BUILD @@ -1,11 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools:defs.bzl", "go_library", "go_test") package(licenses = ["notice"]) go_library( name = "reviver", srcs = ["reviver.go"], - importpath = "gvisor.dev/gvisor/tools/issue_reviver/reviver", visibility = [ "//tools/issue_reviver:__subpackages__", ], @@ -15,5 +14,5 @@ go_test( name = "reviver_test", size = "small", srcs = ["reviver_test.go"], - embed = [":reviver"], + library = ":reviver", ) diff --git a/tools/workspace_status.sh b/tools/workspace_status.sh index fb09ff331..a22c8c9f2 100755 --- a/tools/workspace_status.sh +++ b/tools/workspace_status.sh @@ -15,4 +15,4 @@ # limitations under the License. # The STABLE_ prefix will trigger a re-link if it changes. -echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty) +echo STABLE_VERSION $(git describe --always --tags --abbrev=12 --dirty || echo 0.0.0) diff --git a/vdso/BUILD b/vdso/BUILD index 2b6744c26..d37d4266d 100644 --- a/vdso/BUILD +++ b/vdso/BUILD @@ -3,20 +3,10 @@ # normal system VDSO (time, gettimeofday, clock_gettimeofday) but which uses # timekeeping parameters managed by the sandbox kernel. -load("@bazel_tools//tools/cpp:cc_flags_supplier.bzl", "cc_flags_supplier") +load("//tools:defs.bzl", "cc_flags_supplier", "cc_toolchain", "select_arch") package(licenses = ["notice"]) -config_setting( - name = "x86_64", - constraint_values = ["@bazel_tools//platforms:x86_64"], -) - -config_setting( - name = "aarch64", - constraint_values = ["@bazel_tools//platforms:aarch64"], -) - genrule( name = "vdso", srcs = [ @@ -39,14 +29,15 @@ genrule( "-O2 " + "-std=c++11 " + "-fPIC " + + "-fno-sanitize=all " + # Some toolchains enable stack protector by default. Disable it, the # VDSO has no hooks to handle failures. "-fno-stack-protector " + "-fuse-ld=gold " + - select({ - ":x86_64": "-m64 ", - "//conditions:default": "", - }) + + select_arch( + amd64 = "-m64 ", + arm64 = "", + ) + "-shared " + "-nostdlib " + "-Wl,-soname=linux-vdso.so.1 " + @@ -55,12 +46,10 @@ genrule( "-Wl,-Bsymbolic " + "-Wl,-z,max-page-size=4096 " + "-Wl,-z,common-page-size=4096 " + - select( - { - ":x86_64": "-Wl,-T$(location vdso_amd64.lds) ", - ":aarch64": "-Wl,-T$(location vdso_arm64.lds) ", - }, - no_match_error = "Unsupported architecture", + select_arch( + amd64 = "-Wl,-T$(location vdso_amd64.lds) ", + arm64 = "-Wl,-T$(location vdso_arm64.lds) ", + no_match_error = "unsupported architecture", ) + "-o $(location vdso.so) " + "$(location vdso.cc) " + @@ -73,7 +62,7 @@ genrule( ], features = ["-pie"], toolchains = [ - "@bazel_tools//tools/cpp:current_cc_toolchain", + cc_toolchain, ":no_pie_cc_flags", ], visibility = ["//:sandbox"], -- cgit v1.2.3 From 0e2f1b7abd219f39d67cc2cecd00c441a13eeb29 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 27 Jan 2020 15:17:58 -0800 Subject: Update package locations. Because the abi will depend on the core types for marshalling (usermem, context, safemem, safecopy), these need to be flattened from the sentry directory. These packages contain no sentry-specific details. PiperOrigin-RevId: 291811289 --- pkg/abi/abi.go | 4 + pkg/context/BUILD | 13 + pkg/context/context.go | 141 +++++ pkg/safecopy/BUILD | 29 + pkg/safecopy/LICENSE | 27 + pkg/safecopy/atomic_amd64.s | 136 +++++ pkg/safecopy/atomic_arm64.s | 126 +++++ pkg/safecopy/memclr_amd64.s | 147 +++++ pkg/safecopy/memclr_arm64.s | 74 +++ pkg/safecopy/memcpy_amd64.s | 250 +++++++++ pkg/safecopy/memcpy_arm64.s | 78 +++ pkg/safecopy/safecopy.go | 144 +++++ pkg/safecopy/safecopy_test.go | 617 +++++++++++++++++++++ pkg/safecopy/safecopy_unsafe.go | 335 +++++++++++ pkg/safecopy/sighandler_amd64.s | 133 +++++ pkg/safecopy/sighandler_arm64.s | 143 +++++ pkg/safemem/BUILD | 27 + pkg/safemem/block_unsafe.go | 279 ++++++++++ pkg/safemem/io.go | 392 +++++++++++++ pkg/safemem/io_test.go | 199 +++++++ pkg/safemem/safemem.go | 16 + pkg/safemem/seq_test.go | 196 +++++++ pkg/safemem/seq_unsafe.go | 299 ++++++++++ pkg/sentry/arch/BUILD | 4 +- pkg/sentry/arch/arch.go | 2 +- pkg/sentry/arch/arch_aarch64.go | 2 +- pkg/sentry/arch/arch_amd64.go | 2 +- pkg/sentry/arch/arch_arm64.go | 2 +- pkg/sentry/arch/arch_state_x86.go | 2 +- pkg/sentry/arch/arch_x86.go | 2 +- pkg/sentry/arch/auxv.go | 2 +- pkg/sentry/arch/signal.go | 2 +- pkg/sentry/arch/signal_amd64.go | 2 +- pkg/sentry/arch/signal_arm64.go | 2 +- pkg/sentry/arch/signal_stack.go | 2 +- pkg/sentry/arch/stack.go | 4 +- pkg/sentry/context/BUILD | 13 - pkg/sentry/context/context.go | 141 ----- pkg/sentry/context/contexttest/BUILD | 21 - pkg/sentry/context/contexttest/contexttest.go | 188 ------- pkg/sentry/contexttest/BUILD | 21 + pkg/sentry/contexttest/contexttest.go | 188 +++++++ pkg/sentry/fs/BUILD | 12 +- pkg/sentry/fs/anon/BUILD | 4 +- pkg/sentry/fs/anon/anon.go | 4 +- pkg/sentry/fs/attr.go | 2 +- pkg/sentry/fs/context.go | 2 +- pkg/sentry/fs/copy_up.go | 4 +- pkg/sentry/fs/copy_up_test.go | 2 +- pkg/sentry/fs/dev/BUILD | 6 +- pkg/sentry/fs/dev/dev.go | 4 +- pkg/sentry/fs/dev/fs.go | 2 +- pkg/sentry/fs/dev/full.go | 4 +- pkg/sentry/fs/dev/null.go | 2 +- pkg/sentry/fs/dev/random.go | 6 +- pkg/sentry/fs/dev/tty.go | 2 +- pkg/sentry/fs/dirent.go | 2 +- pkg/sentry/fs/dirent_refs_test.go | 4 +- pkg/sentry/fs/fdpipe/BUILD | 12 +- pkg/sentry/fs/fdpipe/pipe.go | 6 +- pkg/sentry/fs/fdpipe/pipe_opener.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 6 +- pkg/sentry/fs/fdpipe/pipe_state.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 4 +- pkg/sentry/fs/file.go | 4 +- pkg/sentry/fs/file_operations.go | 4 +- pkg/sentry/fs/file_overlay.go | 4 +- pkg/sentry/fs/file_overlay_test.go | 2 +- pkg/sentry/fs/filesystems.go | 2 +- pkg/sentry/fs/filetest/BUILD | 6 +- pkg/sentry/fs/filetest/filetest.go | 6 +- pkg/sentry/fs/fs.go | 2 +- pkg/sentry/fs/fsutil/BUILD | 14 +- pkg/sentry/fs/fsutil/dirty_set.go | 6 +- pkg/sentry/fs/fsutil/dirty_set_test.go | 2 +- pkg/sentry/fs/fsutil/file.go | 4 +- pkg/sentry/fs/fsutil/file_range_set.go | 6 +- pkg/sentry/fs/fsutil/host_file_mapper.go | 4 +- pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 2 +- pkg/sentry/fs/fsutil/host_mappable.go | 6 +- pkg/sentry/fs/fsutil/inode.go | 2 +- pkg/sentry/fs/fsutil/inode_cached.go | 6 +- pkg/sentry/fs/fsutil/inode_cached_test.go | 8 +- pkg/sentry/fs/gofer/BUILD | 10 +- pkg/sentry/fs/gofer/attr.go | 4 +- pkg/sentry/fs/gofer/cache_policy.go | 2 +- pkg/sentry/fs/gofer/context_file.go | 2 +- pkg/sentry/fs/gofer/file.go | 4 +- pkg/sentry/fs/gofer/file_state.go | 2 +- pkg/sentry/fs/gofer/fs.go | 2 +- pkg/sentry/fs/gofer/gofer_test.go | 4 +- pkg/sentry/fs/gofer/handles.go | 4 +- pkg/sentry/fs/gofer/inode.go | 4 +- pkg/sentry/fs/gofer/inode_state.go | 2 +- pkg/sentry/fs/gofer/path.go | 2 +- pkg/sentry/fs/gofer/session.go | 2 +- pkg/sentry/fs/gofer/session_state.go | 2 +- pkg/sentry/fs/gofer/socket.go | 2 +- pkg/sentry/fs/gofer/util.go | 2 +- pkg/sentry/fs/host/BUILD | 12 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/file.go | 6 +- pkg/sentry/fs/host/fs.go | 2 +- pkg/sentry/fs/host/fs_test.go | 4 +- pkg/sentry/fs/host/inode.go | 4 +- pkg/sentry/fs/host/inode_state.go | 2 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/socket.go | 2 +- pkg/sentry/fs/host/socket_test.go | 4 +- pkg/sentry/fs/host/tty.go | 4 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 2 +- pkg/sentry/fs/inode_operations.go | 2 +- pkg/sentry/fs/inode_overlay.go | 2 +- pkg/sentry/fs/inode_overlay_test.go | 2 +- pkg/sentry/fs/inotify.go | 4 +- pkg/sentry/fs/inotify_event.go | 4 +- pkg/sentry/fs/mock.go | 2 +- pkg/sentry/fs/mount.go | 2 +- pkg/sentry/fs/mount_overlay.go | 2 +- pkg/sentry/fs/mount_test.go | 2 +- pkg/sentry/fs/mounts.go | 2 +- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/offset.go | 2 +- pkg/sentry/fs/overlay.go | 4 +- pkg/sentry/fs/proc/BUILD | 8 +- pkg/sentry/fs/proc/cgroup.go | 2 +- pkg/sentry/fs/proc/cpuinfo.go | 2 +- pkg/sentry/fs/proc/exec_args.go | 4 +- pkg/sentry/fs/proc/fds.go | 2 +- pkg/sentry/fs/proc/filesystems.go | 2 +- pkg/sentry/fs/proc/fs.go | 2 +- pkg/sentry/fs/proc/inode.go | 4 +- pkg/sentry/fs/proc/loadavg.go | 2 +- pkg/sentry/fs/proc/meminfo.go | 4 +- pkg/sentry/fs/proc/mounts.go | 2 +- pkg/sentry/fs/proc/net.go | 4 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/seqfile/BUILD | 10 +- pkg/sentry/fs/proc/seqfile/seqfile.go | 4 +- pkg/sentry/fs/proc/seqfile/seqfile_test.go | 6 +- pkg/sentry/fs/proc/stat.go | 2 +- pkg/sentry/fs/proc/sys.go | 4 +- pkg/sentry/fs/proc/sys_net.go | 4 +- pkg/sentry/fs/proc/sys_net_test.go | 4 +- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/fs/proc/uid_gid_map.go | 4 +- pkg/sentry/fs/proc/uptime.go | 4 +- pkg/sentry/fs/proc/version.go | 2 +- pkg/sentry/fs/ramfs/BUILD | 6 +- pkg/sentry/fs/ramfs/dir.go | 2 +- pkg/sentry/fs/ramfs/socket.go | 2 +- pkg/sentry/fs/ramfs/symlink.go | 2 +- pkg/sentry/fs/ramfs/tree.go | 4 +- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/splice.go | 2 +- pkg/sentry/fs/sys/BUILD | 4 +- pkg/sentry/fs/sys/devices.go | 2 +- pkg/sentry/fs/sys/fs.go | 2 +- pkg/sentry/fs/sys/sys.go | 4 +- pkg/sentry/fs/timerfd/BUILD | 4 +- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/BUILD | 10 +- pkg/sentry/fs/tmpfs/file_regular.go | 4 +- pkg/sentry/fs/tmpfs/file_test.go | 4 +- pkg/sentry/fs/tmpfs/fs.go | 2 +- pkg/sentry/fs/tmpfs/inode_file.go | 6 +- pkg/sentry/fs/tmpfs/tmpfs.go | 4 +- pkg/sentry/fs/tty/BUILD | 10 +- pkg/sentry/fs/tty/dir.go | 4 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/line_discipline.go | 4 +- pkg/sentry/fs/tty/master.go | 4 +- pkg/sentry/fs/tty/queue.go | 6 +- pkg/sentry/fs/tty/slave.go | 4 +- pkg/sentry/fs/tty/terminal.go | 4 +- pkg/sentry/fs/tty/tty_test.go | 4 +- pkg/sentry/fsimpl/ext/BUILD | 12 +- pkg/sentry/fsimpl/ext/benchmark/BUILD | 4 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 4 +- pkg/sentry/fsimpl/ext/directory.go | 2 +- pkg/sentry/fsimpl/ext/ext.go | 2 +- pkg/sentry/fsimpl/ext/ext_test.go | 6 +- pkg/sentry/fsimpl/ext/file_description.go | 2 +- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/regular_file.go | 6 +- pkg/sentry/fsimpl/ext/symlink.go | 4 +- pkg/sentry/fsimpl/kernfs/BUILD | 10 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 4 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 6 +- pkg/sentry/fsimpl/kernfs/symlink.go | 2 +- pkg/sentry/fsimpl/proc/BUILD | 12 +- pkg/sentry/fsimpl/proc/filesystem.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 2 +- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/task_files.go | 6 +- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_files.go | 4 +- pkg/sentry/fsimpl/proc/tasks_net.go | 4 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 2 +- pkg/sentry/fsimpl/proc/tasks_sys_test.go | 2 +- pkg/sentry/fsimpl/proc/tasks_test.go | 4 +- pkg/sentry/fsimpl/sys/BUILD | 2 +- pkg/sentry/fsimpl/sys/sys.go | 2 +- pkg/sentry/fsimpl/testutil/BUILD | 4 +- pkg/sentry/fsimpl/testutil/kernel.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 4 +- pkg/sentry/fsimpl/tmpfs/BUILD | 16 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 +- pkg/sentry/fsimpl/tmpfs/directory.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 4 +- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 4 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/hostmm/BUILD | 2 +- pkg/sentry/hostmm/hostmm.go | 2 +- pkg/sentry/inet/BUILD | 2 +- pkg/sentry/inet/context.go | 2 +- pkg/sentry/kernel/BUILD | 12 +- pkg/sentry/kernel/auth/BUILD | 2 +- pkg/sentry/kernel/auth/context.go | 2 +- pkg/sentry/kernel/auth/id_map.go | 2 +- pkg/sentry/kernel/context.go | 2 +- pkg/sentry/kernel/contexttest/BUILD | 4 +- pkg/sentry/kernel/contexttest/contexttest.go | 4 +- pkg/sentry/kernel/epoll/BUILD | 6 +- pkg/sentry/kernel/epoll/epoll.go | 4 +- pkg/sentry/kernel/epoll/epoll_test.go | 2 +- pkg/sentry/kernel/eventfd/BUILD | 8 +- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/eventfd/eventfd_test.go | 4 +- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/kernel/fd_table_test.go | 4 +- pkg/sentry/kernel/futex/BUILD | 6 +- pkg/sentry/kernel/futex/futex.go | 2 +- pkg/sentry/kernel/futex/futex_test.go | 2 +- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 12 +- pkg/sentry/kernel/pipe/buffer.go | 2 +- pkg/sentry/kernel/pipe/buffer_test.go | 2 +- pkg/sentry/kernel/pipe/node.go | 2 +- pkg/sentry/kernel/pipe/node_test.go | 6 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 4 +- pkg/sentry/kernel/pipe/pipe_util.go | 4 +- pkg/sentry/kernel/pipe/reader_writer.go | 4 +- pkg/sentry/kernel/pipe/vfs.go | 4 +- pkg/sentry/kernel/ptrace.go | 2 +- pkg/sentry/kernel/ptrace_amd64.go | 2 +- pkg/sentry/kernel/ptrace_arm64.go | 2 +- pkg/sentry/kernel/rseq.go | 2 +- pkg/sentry/kernel/seccomp.go | 2 +- pkg/sentry/kernel/semaphore/BUILD | 6 +- pkg/sentry/kernel/semaphore/semaphore.go | 2 +- pkg/sentry/kernel/semaphore/semaphore_test.go | 4 +- pkg/sentry/kernel/shm/BUILD | 4 +- pkg/sentry/kernel/shm/shm.go | 4 +- pkg/sentry/kernel/signalfd/BUILD | 4 +- pkg/sentry/kernel/signalfd/signalfd.go | 4 +- pkg/sentry/kernel/syscalls.go | 2 +- pkg/sentry/kernel/task.go | 4 +- pkg/sentry/kernel/task_clone.go | 2 +- pkg/sentry/kernel/task_context.go | 4 +- pkg/sentry/kernel/task_futex.go | 2 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_run.go | 2 +- pkg/sentry/kernel/task_signals.go | 2 +- pkg/sentry/kernel/task_start.go | 2 +- pkg/sentry/kernel/task_syscall.go | 2 +- pkg/sentry/kernel/task_usermem.go | 2 +- pkg/sentry/kernel/time/BUILD | 2 +- pkg/sentry/kernel/time/context.go | 2 +- pkg/sentry/kernel/timekeeper_test.go | 4 +- pkg/sentry/kernel/vdso.go | 4 +- pkg/sentry/limits/BUILD | 2 +- pkg/sentry/limits/context.go | 2 +- pkg/sentry/loader/BUILD | 6 +- pkg/sentry/loader/elf.go | 4 +- pkg/sentry/loader/interpreter.go | 4 +- pkg/sentry/loader/loader.go | 4 +- pkg/sentry/loader/vdso.go | 6 +- pkg/sentry/memmap/BUILD | 6 +- pkg/sentry/memmap/mapping_set.go | 2 +- pkg/sentry/memmap/mapping_set_test.go | 2 +- pkg/sentry/memmap/memmap.go | 4 +- pkg/sentry/mm/BUILD | 18 +- pkg/sentry/mm/address_space.go | 2 +- pkg/sentry/mm/aio_context.go | 4 +- pkg/sentry/mm/debug.go | 2 +- pkg/sentry/mm/io.go | 6 +- pkg/sentry/mm/lifecycle.go | 4 +- pkg/sentry/mm/metadata.go | 2 +- pkg/sentry/mm/mm.go | 4 +- pkg/sentry/mm/mm_test.go | 6 +- pkg/sentry/mm/pma.go | 8 +- pkg/sentry/mm/procfs.go | 4 +- pkg/sentry/mm/save_restore.go | 2 +- pkg/sentry/mm/shm.go | 4 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 4 +- pkg/sentry/pgalloc/BUILD | 8 +- pkg/sentry/pgalloc/context.go | 2 +- pkg/sentry/pgalloc/pgalloc.go | 6 +- pkg/sentry/pgalloc/pgalloc_test.go | 2 +- pkg/sentry/pgalloc/save_restore.go | 2 +- pkg/sentry/platform/BUILD | 8 +- pkg/sentry/platform/context.go | 2 +- pkg/sentry/platform/kvm/BUILD | 6 +- pkg/sentry/platform/kvm/address_space.go | 2 +- pkg/sentry/platform/kvm/bluepill.go | 2 +- pkg/sentry/platform/kvm/bluepill_fault.go | 2 +- pkg/sentry/platform/kvm/context.go | 2 +- pkg/sentry/platform/kvm/kvm.go | 2 +- pkg/sentry/platform/kvm/kvm_test.go | 2 +- pkg/sentry/platform/kvm/machine.go | 2 +- pkg/sentry/platform/kvm/machine_amd64.go | 2 +- pkg/sentry/platform/kvm/machine_arm64.go | 2 +- pkg/sentry/platform/kvm/machine_arm64_unsafe.go | 2 +- pkg/sentry/platform/kvm/physical_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map.go | 2 +- pkg/sentry/platform/kvm/virtual_map_test.go | 2 +- pkg/sentry/platform/mmap_min_addr.go | 2 +- pkg/sentry/platform/platform.go | 4 +- pkg/sentry/platform/ptrace/BUILD | 4 +- pkg/sentry/platform/ptrace/ptrace.go | 2 +- pkg/sentry/platform/ptrace/ptrace_unsafe.go | 2 +- pkg/sentry/platform/ptrace/stub_unsafe.go | 4 +- pkg/sentry/platform/ptrace/subprocess.go | 2 +- pkg/sentry/platform/ring0/BUILD | 2 +- pkg/sentry/platform/ring0/defs_amd64.go | 2 +- pkg/sentry/platform/ring0/defs_arm64.go | 2 +- pkg/sentry/platform/ring0/gen_offsets/BUILD | 2 +- pkg/sentry/platform/ring0/pagetables/BUILD | 4 +- .../platform/ring0/pagetables/allocator_unsafe.go | 2 +- pkg/sentry/platform/ring0/pagetables/pagetables.go | 2 +- .../ring0/pagetables/pagetables_aarch64.go | 2 +- .../ring0/pagetables/pagetables_amd64_test.go | 2 +- .../ring0/pagetables/pagetables_arm64_test.go | 2 +- .../platform/ring0/pagetables/pagetables_test.go | 2 +- .../platform/ring0/pagetables/pagetables_x86.go | 2 +- pkg/sentry/platform/safecopy/BUILD | 29 - pkg/sentry/platform/safecopy/LICENSE | 27 - pkg/sentry/platform/safecopy/atomic_amd64.s | 136 ----- pkg/sentry/platform/safecopy/atomic_arm64.s | 126 ----- pkg/sentry/platform/safecopy/memclr_amd64.s | 147 ----- pkg/sentry/platform/safecopy/memclr_arm64.s | 74 --- pkg/sentry/platform/safecopy/memcpy_amd64.s | 250 --------- pkg/sentry/platform/safecopy/memcpy_arm64.s | 78 --- pkg/sentry/platform/safecopy/safecopy.go | 144 ----- pkg/sentry/platform/safecopy/safecopy_test.go | 617 --------------------- pkg/sentry/platform/safecopy/safecopy_unsafe.go | 335 ----------- pkg/sentry/platform/safecopy/sighandler_amd64.s | 133 ----- pkg/sentry/platform/safecopy/sighandler_arm64.s | 143 ----- pkg/sentry/safemem/BUILD | 27 - pkg/sentry/safemem/block_unsafe.go | 279 ---------- pkg/sentry/safemem/io.go | 392 ------------- pkg/sentry/safemem/io_test.go | 199 ------- pkg/sentry/safemem/safemem.go | 16 - pkg/sentry/safemem/seq_test.go | 196 ------- pkg/sentry/safemem/seq_unsafe.go | 299 ---------- pkg/sentry/socket/BUILD | 4 +- pkg/sentry/socket/control/BUILD | 4 +- pkg/sentry/socket/control/control.go | 4 +- pkg/sentry/socket/hostinet/BUILD | 6 +- pkg/sentry/socket/hostinet/socket.go | 6 +- pkg/sentry/socket/hostinet/socket_unsafe.go | 4 +- pkg/sentry/socket/hostinet/stack.go | 4 +- pkg/sentry/socket/netfilter/BUILD | 2 +- pkg/sentry/socket/netfilter/netfilter.go | 2 +- pkg/sentry/socket/netlink/BUILD | 4 +- pkg/sentry/socket/netlink/message.go | 2 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/route/BUILD | 2 +- pkg/sentry/socket/netlink/route/protocol.go | 2 +- pkg/sentry/socket/netlink/socket.go | 4 +- pkg/sentry/socket/netlink/uevent/BUILD | 2 +- pkg/sentry/socket/netlink/uevent/protocol.go | 2 +- pkg/sentry/socket/netstack/BUILD | 6 +- pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/sentry/socket/netstack/provider.go | 2 +- pkg/sentry/socket/socket.go | 4 +- pkg/sentry/socket/unix/BUILD | 6 +- pkg/sentry/socket/unix/io.go | 4 +- pkg/sentry/socket/unix/transport/BUILD | 2 +- pkg/sentry/socket/unix/transport/connectioned.go | 2 +- pkg/sentry/socket/unix/transport/connectionless.go | 2 +- pkg/sentry/socket/unix/transport/unix.go | 2 +- pkg/sentry/socket/unix/unix.go | 4 +- pkg/sentry/strace/BUILD | 2 +- pkg/sentry/strace/poll.go | 2 +- pkg/sentry/strace/select.go | 2 +- pkg/sentry/strace/signal.go | 2 +- pkg/sentry/strace/socket.go | 2 +- pkg/sentry/strace/strace.go | 2 +- pkg/sentry/syscalls/linux/BUILD | 6 +- pkg/sentry/syscalls/linux/linux64_amd64.go | 2 +- pkg/sentry/syscalls/linux/linux64_arm64.go | 2 +- pkg/sentry/syscalls/linux/sigset.go | 2 +- pkg/sentry/syscalls/linux/sys_aio.go | 2 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 4 +- pkg/sentry/syscalls/linux/sys_futex.go | 2 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_mempolicy.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 2 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 2 +- pkg/sentry/syscalls/linux/sys_random.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 2 +- pkg/sentry/syscalls/linux/sys_rlimit.go | 2 +- pkg/sentry/syscalls/linux/sys_seccomp.go | 2 +- pkg/sentry/syscalls/linux/sys_sem.go | 2 +- pkg/sentry/syscalls/linux/sys_signal.go | 2 +- pkg/sentry/syscalls/linux/sys_socket.go | 2 +- pkg/sentry/syscalls/linux/sys_stat.go | 2 +- pkg/sentry/syscalls/linux/sys_stat_amd64.go | 2 +- pkg/sentry/syscalls/linux/sys_stat_arm64.go | 2 +- pkg/sentry/syscalls/linux/sys_thread.go | 2 +- pkg/sentry/syscalls/linux/sys_time.go | 2 +- pkg/sentry/syscalls/linux/sys_timer.go | 2 +- pkg/sentry/syscalls/linux/sys_write.go | 2 +- pkg/sentry/syscalls/linux/sys_xattr.go | 2 +- pkg/sentry/syscalls/linux/timespec.go | 2 +- pkg/sentry/unimpl/BUILD | 2 +- pkg/sentry/unimpl/events.go | 2 +- pkg/sentry/uniqueid/BUILD | 2 +- pkg/sentry/uniqueid/context.go | 2 +- pkg/sentry/usermem/BUILD | 55 -- pkg/sentry/usermem/README.md | 31 -- pkg/sentry/usermem/access_type.go | 128 ----- pkg/sentry/usermem/addr.go | 108 ---- pkg/sentry/usermem/addr_range_seq_test.go | 197 ------- pkg/sentry/usermem/addr_range_seq_unsafe.go | 277 --------- pkg/sentry/usermem/bytes_io.go | 141 ----- pkg/sentry/usermem/bytes_io_unsafe.go | 47 -- pkg/sentry/usermem/usermem.go | 597 -------------------- pkg/sentry/usermem/usermem_arm64.go | 53 -- pkg/sentry/usermem/usermem_test.go | 424 -------------- pkg/sentry/usermem/usermem_unsafe.go | 27 - pkg/sentry/usermem/usermem_x86.go | 38 -- pkg/sentry/vfs/BUILD | 10 +- pkg/sentry/vfs/context.go | 2 +- pkg/sentry/vfs/device.go | 2 +- pkg/sentry/vfs/file_description.go | 4 +- pkg/sentry/vfs/file_description_impl_util.go | 4 +- pkg/sentry/vfs/file_description_impl_util_test.go | 6 +- pkg/sentry/vfs/filesystem.go | 2 +- pkg/sentry/vfs/filesystem_type.go | 2 +- pkg/sentry/vfs/mount.go | 2 +- pkg/sentry/vfs/pathname.go | 2 +- pkg/sentry/vfs/testutil.go | 2 +- pkg/sentry/vfs/vfs.go | 2 +- pkg/usermem/BUILD | 55 ++ pkg/usermem/README.md | 31 ++ pkg/usermem/access_type.go | 128 +++++ pkg/usermem/addr.go | 108 ++++ pkg/usermem/addr_range_seq_test.go | 197 +++++++ pkg/usermem/addr_range_seq_unsafe.go | 277 +++++++++ pkg/usermem/bytes_io.go | 141 +++++ pkg/usermem/bytes_io_unsafe.go | 47 ++ pkg/usermem/usermem.go | 597 ++++++++++++++++++++ pkg/usermem/usermem_arm64.go | 53 ++ pkg/usermem/usermem_test.go | 424 ++++++++++++++ pkg/usermem/usermem_unsafe.go | 27 + pkg/usermem/usermem_x86.go | 38 ++ runsc/boot/BUILD | 6 +- runsc/boot/fds.go | 2 +- runsc/boot/fs.go | 2 +- runsc/boot/loader_test.go | 2 +- runsc/boot/user.go | 4 +- runsc/boot/user_test.go | 2 +- tools/go_marshal/defs.bzl | 4 +- tools/go_marshal/gomarshal/generator.go | 4 +- tools/go_marshal/test/BUILD | 2 +- tools/go_marshal/test/benchmark_test.go | 2 +- 483 files changed, 6839 insertions(+), 6835 deletions(-) create mode 100644 pkg/context/BUILD create mode 100644 pkg/context/context.go create mode 100644 pkg/safecopy/BUILD create mode 100644 pkg/safecopy/LICENSE create mode 100644 pkg/safecopy/atomic_amd64.s create mode 100644 pkg/safecopy/atomic_arm64.s create mode 100644 pkg/safecopy/memclr_amd64.s create mode 100644 pkg/safecopy/memclr_arm64.s create mode 100644 pkg/safecopy/memcpy_amd64.s create mode 100644 pkg/safecopy/memcpy_arm64.s create mode 100644 pkg/safecopy/safecopy.go create mode 100644 pkg/safecopy/safecopy_test.go create mode 100644 pkg/safecopy/safecopy_unsafe.go create mode 100644 pkg/safecopy/sighandler_amd64.s create mode 100644 pkg/safecopy/sighandler_arm64.s create mode 100644 pkg/safemem/BUILD create mode 100644 pkg/safemem/block_unsafe.go create mode 100644 pkg/safemem/io.go create mode 100644 pkg/safemem/io_test.go create mode 100644 pkg/safemem/safemem.go create mode 100644 pkg/safemem/seq_test.go create mode 100644 pkg/safemem/seq_unsafe.go delete mode 100644 pkg/sentry/context/BUILD delete mode 100644 pkg/sentry/context/context.go delete mode 100644 pkg/sentry/context/contexttest/BUILD delete mode 100644 pkg/sentry/context/contexttest/contexttest.go create mode 100644 pkg/sentry/contexttest/BUILD create mode 100644 pkg/sentry/contexttest/contexttest.go delete mode 100644 pkg/sentry/platform/safecopy/BUILD delete mode 100644 pkg/sentry/platform/safecopy/LICENSE delete mode 100644 pkg/sentry/platform/safecopy/atomic_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/atomic_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/memclr_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/memclr_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/memcpy_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/memcpy_arm64.s delete mode 100644 pkg/sentry/platform/safecopy/safecopy.go delete mode 100644 pkg/sentry/platform/safecopy/safecopy_test.go delete mode 100644 pkg/sentry/platform/safecopy/safecopy_unsafe.go delete mode 100644 pkg/sentry/platform/safecopy/sighandler_amd64.s delete mode 100644 pkg/sentry/platform/safecopy/sighandler_arm64.s delete mode 100644 pkg/sentry/safemem/BUILD delete mode 100644 pkg/sentry/safemem/block_unsafe.go delete mode 100644 pkg/sentry/safemem/io.go delete mode 100644 pkg/sentry/safemem/io_test.go delete mode 100644 pkg/sentry/safemem/safemem.go delete mode 100644 pkg/sentry/safemem/seq_test.go delete mode 100644 pkg/sentry/safemem/seq_unsafe.go delete mode 100644 pkg/sentry/usermem/BUILD delete mode 100644 pkg/sentry/usermem/README.md delete mode 100644 pkg/sentry/usermem/access_type.go delete mode 100644 pkg/sentry/usermem/addr.go delete mode 100644 pkg/sentry/usermem/addr_range_seq_test.go delete mode 100644 pkg/sentry/usermem/addr_range_seq_unsafe.go delete mode 100644 pkg/sentry/usermem/bytes_io.go delete mode 100644 pkg/sentry/usermem/bytes_io_unsafe.go delete mode 100644 pkg/sentry/usermem/usermem.go delete mode 100644 pkg/sentry/usermem/usermem_arm64.go delete mode 100644 pkg/sentry/usermem/usermem_test.go delete mode 100644 pkg/sentry/usermem/usermem_unsafe.go delete mode 100644 pkg/sentry/usermem/usermem_x86.go create mode 100644 pkg/usermem/BUILD create mode 100644 pkg/usermem/README.md create mode 100644 pkg/usermem/access_type.go create mode 100644 pkg/usermem/addr.go create mode 100644 pkg/usermem/addr_range_seq_test.go create mode 100644 pkg/usermem/addr_range_seq_unsafe.go create mode 100644 pkg/usermem/bytes_io.go create mode 100644 pkg/usermem/bytes_io_unsafe.go create mode 100644 pkg/usermem/usermem.go create mode 100644 pkg/usermem/usermem_arm64.go create mode 100644 pkg/usermem/usermem_test.go create mode 100644 pkg/usermem/usermem_unsafe.go create mode 100644 pkg/usermem/usermem_x86.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/abi.go b/pkg/abi/abi.go index d56c481c9..e6be93c3a 100644 --- a/pkg/abi/abi.go +++ b/pkg/abi/abi.go @@ -39,3 +39,7 @@ func (o OS) String() string { return fmt.Sprintf("OS(%d)", o) } } + +// ABI is an interface that defines OS-specific interactions. +type ABI interface { +} diff --git a/pkg/context/BUILD b/pkg/context/BUILD new file mode 100644 index 000000000..239f31149 --- /dev/null +++ b/pkg/context/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "context", + srcs = ["context.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/amutex", + "//pkg/log", + ], +) diff --git a/pkg/context/context.go b/pkg/context/context.go new file mode 100644 index 000000000..23e009ef3 --- /dev/null +++ b/pkg/context/context.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package context defines an internal context type. +// +// The given Context conforms to the standard Go context, but mandates +// additional methods that are specific to the kernel internals. Note however, +// that the Context described by this package carries additional constraints +// regarding concurrent access and retaining beyond the scope of a call. +// +// See the Context type for complete details. +package context + +import ( + "context" + "time" + + "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/log" +) + +type contextID int + +// Globally accessible values from a context. These keys are defined in the +// context package to resolve dependency cycles by not requiring the caller to +// import packages usually required to get these information. +const ( + // CtxThreadGroupID is the current thread group ID when a context represents + // a task context. The value is represented as an int32. + CtxThreadGroupID contextID = iota +) + +// ThreadGroupIDFromContext returns the current thread group ID when ctx +// represents a task context. +func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { + if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { + return tgid.(int32), true + } + return 0, false +} + +// A Context represents a thread of execution (hereafter "goroutine" to reflect +// Go idiosyncrasy). It carries state associated with the goroutine across API +// boundaries. +// +// While Context exists for essentially the same reasons as Go's standard +// context.Context, the standard type represents the state of an operation +// rather than that of a goroutine. This is a critical distinction: +// +// - Unlike context.Context, which "may be passed to functions running in +// different goroutines", it is *not safe* to use the same Context in multiple +// concurrent goroutines. +// +// - It is *not safe* to retain a Context passed to a function beyond the scope +// of that function call. +// +// In both cases, values extracted from the Context should be used instead. +type Context interface { + log.Logger + amutex.Sleeper + context.Context + + // UninterruptibleSleepStart indicates the beginning of an uninterruptible + // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate + // is true and the Context represents a Task, the Task's AddressSpace is + // deactivated. + UninterruptibleSleepStart(deactivate bool) + + // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep + // state that was begun by a previous call to UninterruptibleSleepStart. If + // activate is true and the Context represents a Task, the Task's + // AddressSpace is activated. Normally activate is the same value as the + // deactivate parameter passed to UninterruptibleSleepStart. + UninterruptibleSleepFinish(activate bool) +} + +// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep +// methods for anonymous embedding in other types that do not implement sleeps. +type NoopSleeper struct { + amutex.NoopSleeper +} + +// UninterruptibleSleepStart does nothing. +func (NoopSleeper) UninterruptibleSleepStart(bool) {} + +// UninterruptibleSleepFinish does nothing. +func (NoopSleeper) UninterruptibleSleepFinish(bool) {} + +// Deadline returns zero values, meaning no deadline. +func (NoopSleeper) Deadline() (time.Time, bool) { + return time.Time{}, false +} + +// Done returns nil. +func (NoopSleeper) Done() <-chan struct{} { + return nil +} + +// Err returns nil. +func (NoopSleeper) Err() error { + return nil +} + +// logContext implements basic logging. +type logContext struct { + log.Logger + NoopSleeper +} + +// Value implements Context.Value. +func (logContext) Value(key interface{}) interface{} { + return nil +} + +// bgContext is the context returned by context.Background. +var bgContext = &logContext{Logger: log.Log()} + +// Background returns an empty context using the default logger. +// +// Users should be wary of using a Background context. Please tag any use with +// FIXME(b/38173783) and a note to remove this use. +// +// Generally, one should use the Task as their context when available, or avoid +// having to use a context in places where a Task is unavailable. +// +// Using a Background context for tests is fine, as long as no values are +// needed from the context in the tested code paths. +func Background() Context { + return bgContext +} diff --git a/pkg/safecopy/BUILD b/pkg/safecopy/BUILD new file mode 100644 index 000000000..426ef30c9 --- /dev/null +++ b/pkg/safecopy/BUILD @@ -0,0 +1,29 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "safecopy", + srcs = [ + "atomic_amd64.s", + "atomic_arm64.s", + "memclr_amd64.s", + "memclr_arm64.s", + "memcpy_amd64.s", + "memcpy_arm64.s", + "safecopy.go", + "safecopy_unsafe.go", + "sighandler_amd64.s", + "sighandler_arm64.s", + ], + visibility = ["//:sandbox"], + deps = ["//pkg/syserror"], +) + +go_test( + name = "safecopy_test", + srcs = [ + "safecopy_test.go", + ], + library = ":safecopy", +) diff --git a/pkg/safecopy/LICENSE b/pkg/safecopy/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/pkg/safecopy/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/safecopy/atomic_amd64.s b/pkg/safecopy/atomic_amd64.s new file mode 100644 index 000000000..a0cd78f33 --- /dev/null +++ b/pkg/safecopy/atomic_amd64.s @@ -0,0 +1,136 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in DI. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// swapUint32 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL new+8(FP), AX + XCHGL AX, 0(DI) + MOVL AX, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in DI. Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVL DI, sig+24(FP) + RET + +// swapUint64 atomically stores new into *addr and returns (the previous *addr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: addr must be aligned to a 8-byte boundary. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. + MOVL $0, sig+24(FP) + + MOVQ addr+0(FP), DI + MOVQ new+8(FP), AX + XCHGQ AX, 0(DI) + MOVQ AX, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is +// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the +// signal number stored in DI. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVL DI, sig+20(FP) + RET + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVL $0, sig+20(FP) + + MOVQ addr+0(FP), DI + MOVL old+8(FP), AX + MOVL new+12(FP), DX + LOCK + CMPXCHGL DX, 0(DI) + MOVL AX, prev+16(FP) + RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVL DI, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVL $0, sig+12(FP) + + MOVQ addr+0(FP), AX + MOVL (AX), BX + MOVL BX, val+8(FP) + RET diff --git a/pkg/safecopy/atomic_arm64.s b/pkg/safecopy/atomic_arm64.s new file mode 100644 index 000000000..d58ed71f7 --- /dev/null +++ b/pkg/safecopy/atomic_arm64.s @@ -0,0 +1,126 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleSwapUint32Fault returns the value stored in R1. Control is transferred +// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as swapUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg. +// +//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) +TEXT ·swapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint32Fault will store a different value in this address. + MOVW $0, sig+20(FP) +again: + MOVD addr+0(FP), R0 + MOVW new+8(FP), R1 + LDAXRW (R0), R2 + STLXRW R1, (R0), R3 + CBNZ R3, again + MOVW R2, old+16(FP) + RET + +// handleSwapUint64Fault returns the value stored in R1. Control is transferred +// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal +// number stored in R1. +// +// It must have the same frame configuration as swapUint64 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Xchg64. +// +//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) +TEXT ·swapUint64(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleSwapUint64Fault will store a different value in this address. + MOVW $0, sig+24(FP) +again: + MOVD addr+0(FP), R0 + MOVD new+8(FP), R1 + LDAXR (R0), R2 + STLXR R1, (R0), R3 + CBNZ R3, again + MOVD R2, old+16(FP) + RET + +// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is +// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, +// with the signal number stored in R1. +// +// It must have the same frame configuration as compareAndSwapUint32 so that it +// can undo any potential call frame set up by the assembler. +TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 + MOVW R1, sig+20(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from Go source runtime/internal/atomic.Cas. +// +//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) +TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 + // Store 0 as the returned signal number. If we run to completion, this is + // the value the caller will see; if a signal is received, + // handleCompareAndSwapUint32Fault will store a different value in this + // address. + MOVW $0, sig+20(FP) + + MOVD addr+0(FP), R0 + MOVW old+8(FP), R1 + MOVW new+12(FP), R2 +again: + LDAXRW (R0), R3 + CMPW R1, R3 + BNE done + STLXRW R2, (R0), R4 + CBNZ R4, again +done: + MOVW R3, prev+16(FP) + RET + +// handleLoadUint32Fault returns the value stored in DI. Control is transferred +// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal +// number stored in DI. +// +// It must have the same frame configuration as loadUint32 so that it can undo +// any potential call frame set up by the assembler. +TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 + MOVW R1, sig+12(FP) + RET + +// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS +// signal is received, the value returned is unspecified, and sig is the number +// of the signal that was received. +// +// Preconditions: addr must be aligned to a 4-byte boundary. +// +//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) +TEXT ·loadUint32(SB), NOSPLIT, $0-16 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleLoadUint32Fault will store a different value in this address. + MOVW $0, sig+12(FP) + + MOVD addr+0(FP), R0 + LDARW (R0), R1 + MOVW R1, val+8(FP) + RET diff --git a/pkg/safecopy/memclr_amd64.s b/pkg/safecopy/memclr_amd64.s new file mode 100644 index 000000000..64cf32f05 --- /dev/null +++ b/pkg/safecopy/memclr_amd64.s @@ -0,0 +1,147 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in AX, the value stored in DI). +// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVQ AX, addr+16(FP) + MOVL DI, sig+24(FP) + RET + +// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVL $0, sig+24(FP) + + MOVQ ptr+0(FP), DI + MOVQ n+8(FP), BX + XORQ AX, AX + + // MOVOU seems always faster than REP STOSQ. +tail: + TESTQ BX, BX + JEQ _0 + CMPQ BX, $2 + JBE _1or2 + CMPQ BX, $4 + JBE _3or4 + CMPQ BX, $8 + JB _5through7 + JE _8 + CMPQ BX, $16 + JBE _9through16 + PXOR X0, X0 + CMPQ BX, $32 + JBE _17through32 + CMPQ BX, $64 + JBE _33through64 + CMPQ BX, $128 + JBE _65through128 + CMPQ BX, $256 + JBE _129through256 + // TODO: use branch table and BSR to make this just a single dispatch + // TODO: for really big clears, use MOVNTDQ, even without AVX2. + +loop: + MOVOU X0, 0(DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, 128(DI) + MOVOU X0, 144(DI) + MOVOU X0, 160(DI) + MOVOU X0, 176(DI) + MOVOU X0, 192(DI) + MOVOU X0, 208(DI) + MOVOU X0, 224(DI) + MOVOU X0, 240(DI) + SUBQ $256, BX + ADDQ $256, DI + CMPQ BX, $256 + JAE loop + JMP tail + +_1or2: + MOVB AX, (DI) + MOVB AX, -1(DI)(BX*1) + RET +_0: + RET +_3or4: + MOVW AX, (DI) + MOVW AX, -2(DI)(BX*1) + RET +_5through7: + MOVL AX, (DI) + MOVL AX, -4(DI)(BX*1) + RET +_8: + // We need a separate case for 8 to make sure we clear pointers atomically. + MOVQ AX, (DI) + RET +_9through16: + MOVQ AX, (DI) + MOVQ AX, -8(DI)(BX*1) + RET +_17through32: + MOVOU X0, (DI) + MOVOU X0, -16(DI)(BX*1) + RET +_33through64: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_65through128: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET +_129through256: + MOVOU X0, (DI) + MOVOU X0, 16(DI) + MOVOU X0, 32(DI) + MOVOU X0, 48(DI) + MOVOU X0, 64(DI) + MOVOU X0, 80(DI) + MOVOU X0, 96(DI) + MOVOU X0, 112(DI) + MOVOU X0, -128(DI)(BX*1) + MOVOU X0, -112(DI)(BX*1) + MOVOU X0, -96(DI)(BX*1) + MOVOU X0, -80(DI)(BX*1) + MOVOU X0, -64(DI)(BX*1) + MOVOU X0, -48(DI)(BX*1) + MOVOU X0, -32(DI)(BX*1) + MOVOU X0, -16(DI)(BX*1) + RET diff --git a/pkg/safecopy/memclr_arm64.s b/pkg/safecopy/memclr_arm64.s new file mode 100644 index 000000000..7361b9067 --- /dev/null +++ b/pkg/safecopy/memclr_arm64.s @@ -0,0 +1,74 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemclrFault returns (the value stored in R0, the value stored in R1). +// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memclr so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemclrFault(SB), NOSPLIT, $0-28 + MOVD R0, addr+16(FP) + MOVW R1, sig+24(FP) + RET + +// See the corresponding doc in safecopy_unsafe.go +// +// The code is derived from runtime.memclrNoHeapPointers. +// +// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memclr(SB), NOSPLIT, $0-28 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemclrFault will store a different value in this address. + MOVW $0, sig+24(FP) + MOVD ptr+0(FP), R0 + MOVD n+8(FP), R1 + + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + // Get buffer offset into 16 byte aligned address for better performance + ANDS $15, R0, ZR + BNE unaligned_to_16 +aligned_to_16: + LSR $4, R1, R2 +zero_by_16: + STP.P (ZR, ZR), 16(R0) // Store pair with post index. + SUBS $1, R2, R2 + BNE zero_by_16 + ANDS $15, R1, R1 + BEQ end + + // Zero buffer with size=R1 < 16 +tail_zero: + TBZ $3, R1, tail_zero_4 + MOVD.P ZR, 8(R0) +tail_zero_4: + TBZ $2, R1, tail_zero_2 + MOVW.P ZR, 4(R0) +tail_zero_2: + TBZ $1, R1, tail_zero_1 + MOVH.P ZR, 2(R0) +tail_zero_1: + TBZ $0, R1, end + MOVB ZR, (R0) +end: + RET + +unaligned_to_16: + MOVD R0, R2 +head_loop: + MOVBU.P ZR, 1(R0) + ANDS $15, R0, ZR + BNE head_loop + // Adjust length for what remains + SUB R2, R0, R3 + SUB R3, R1 + // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 + BLT tail_zero + B aligned_to_16 diff --git a/pkg/safecopy/memcpy_amd64.s b/pkg/safecopy/memcpy_amd64.s new file mode 100644 index 000000000..129691d68 --- /dev/null +++ b/pkg/safecopy/memcpy_amd64.s @@ -0,0 +1,250 @@ +// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. +// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. +// Portions Copyright 2009 The Go Authors. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in AX, the value stored in DI). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in AX and the signal number stored in DI. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVQ AX, addr+24(FP) + MOVL DI, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the forward copying part of runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $0-36 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVL $0, sig+32(FP) + + MOVQ to+0(FP), DI + MOVQ from+8(FP), SI + MOVQ n+16(FP), BX + + // REP instructions have a high startup cost, so we handle small sizes + // with some straightline code. The REP MOVSQ instruction is really fast + // for large sizes. The cutover is approximately 2K. +tail: + // move_129through256 or smaller work whether or not the source and the + // destination memory regions overlap because they load all data into + // registers before writing it back. move_256through2048 on the other + // hand can be used only when the memory regions don't overlap or the copy + // direction is forward. + TESTQ BX, BX + JEQ move_0 + CMPQ BX, $2 + JBE move_1or2 + CMPQ BX, $4 + JBE move_3or4 + CMPQ BX, $8 + JB move_5through7 + JE move_8 + CMPQ BX, $16 + JBE move_9through16 + CMPQ BX, $32 + JBE move_17through32 + CMPQ BX, $64 + JBE move_33through64 + CMPQ BX, $128 + JBE move_65through128 + CMPQ BX, $256 + JBE move_129through256 + // TODO: use branch table and BSR to make this just a single dispatch + +/* + * forward copy loop + */ + CMPQ BX, $2048 + JLS move_256through2048 + + // Check alignment + MOVL SI, AX + ORL DI, AX + TESTL $7, AX + JEQ fwdBy8 + + // Do 1 byte at a time + MOVQ BX, CX + REP; MOVSB + RET + +fwdBy8: + // Do 8 bytes at a time + MOVQ BX, CX + SHRQ $3, CX + ANDQ $7, BX + REP; MOVSQ + JMP tail + +move_1or2: + MOVB (SI), AX + MOVB AX, (DI) + MOVB -1(SI)(BX*1), CX + MOVB CX, -1(DI)(BX*1) + RET +move_0: + RET +move_3or4: + MOVW (SI), AX + MOVW AX, (DI) + MOVW -2(SI)(BX*1), CX + MOVW CX, -2(DI)(BX*1) + RET +move_5through7: + MOVL (SI), AX + MOVL AX, (DI) + MOVL -4(SI)(BX*1), CX + MOVL CX, -4(DI)(BX*1) + RET +move_8: + // We need a separate case for 8 to make sure we write pointers atomically. + MOVQ (SI), AX + MOVQ AX, (DI) + RET +move_9through16: + MOVQ (SI), AX + MOVQ AX, (DI) + MOVQ -8(SI)(BX*1), CX + MOVQ CX, -8(DI)(BX*1) + RET +move_17through32: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU -16(SI)(BX*1), X1 + MOVOU X1, -16(DI)(BX*1) + RET +move_33through64: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU -32(SI)(BX*1), X2 + MOVOU X2, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X3 + MOVOU X3, -16(DI)(BX*1) + RET +move_65through128: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU -64(SI)(BX*1), X4 + MOVOU X4, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X5 + MOVOU X5, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X6 + MOVOU X6, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X7 + MOVOU X7, -16(DI)(BX*1) + RET +move_129through256: + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU -128(SI)(BX*1), X8 + MOVOU X8, -128(DI)(BX*1) + MOVOU -112(SI)(BX*1), X9 + MOVOU X9, -112(DI)(BX*1) + MOVOU -96(SI)(BX*1), X10 + MOVOU X10, -96(DI)(BX*1) + MOVOU -80(SI)(BX*1), X11 + MOVOU X11, -80(DI)(BX*1) + MOVOU -64(SI)(BX*1), X12 + MOVOU X12, -64(DI)(BX*1) + MOVOU -48(SI)(BX*1), X13 + MOVOU X13, -48(DI)(BX*1) + MOVOU -32(SI)(BX*1), X14 + MOVOU X14, -32(DI)(BX*1) + MOVOU -16(SI)(BX*1), X15 + MOVOU X15, -16(DI)(BX*1) + RET +move_256through2048: + SUBQ $256, BX + MOVOU (SI), X0 + MOVOU X0, (DI) + MOVOU 16(SI), X1 + MOVOU X1, 16(DI) + MOVOU 32(SI), X2 + MOVOU X2, 32(DI) + MOVOU 48(SI), X3 + MOVOU X3, 48(DI) + MOVOU 64(SI), X4 + MOVOU X4, 64(DI) + MOVOU 80(SI), X5 + MOVOU X5, 80(DI) + MOVOU 96(SI), X6 + MOVOU X6, 96(DI) + MOVOU 112(SI), X7 + MOVOU X7, 112(DI) + MOVOU 128(SI), X8 + MOVOU X8, 128(DI) + MOVOU 144(SI), X9 + MOVOU X9, 144(DI) + MOVOU 160(SI), X10 + MOVOU X10, 160(DI) + MOVOU 176(SI), X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X15 + MOVOU X15, 240(DI) + CMPQ BX, $256 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + JGE move_256through2048 + JMP tail diff --git a/pkg/safecopy/memcpy_arm64.s b/pkg/safecopy/memcpy_arm64.s new file mode 100644 index 000000000..e7e541565 --- /dev/null +++ b/pkg/safecopy/memcpy_arm64.s @@ -0,0 +1,78 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// handleMemcpyFault returns (the value stored in R0, the value stored in R1). +// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, +// with the faulting address stored in R0 and the signal number stored in R1. +// +// It must have the same frame configuration as memcpy so that it can undo any +// potential call frame set up by the assembler. +TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 + MOVD R0, addr+24(FP) + MOVW R1, sig+32(FP) + RET + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +// The code is derived from the Go source runtime.memmove. +// +// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) +TEXT ·memcpy(SB), NOSPLIT, $-8-36 + // Store 0 as the returned signal number. If we run to completion, + // this is the value the caller will see; if a signal is received, + // handleMemcpyFault will store a different value in this address. + MOVW $0, sig+32(FP) + + MOVD to+0(FP), R3 + MOVD from+8(FP), R4 + MOVD n+16(FP), R5 + CMP $0, R5 + BNE check + RET + +check: + AND $~7, R5, R7 // R7 is N&~7. + SUB R7, R5, R6 // R6 is N&7. + + // Copying forward proceeds by copying R7/8 words then copying R6 bytes. + // R3 and R4 are advanced as we copy. + + // (There may be implementations of armv8 where copying by bytes until + // at least one of source or dest is word aligned is a worthwhile + // optimization, but the on the one tested so far (xgene) it did not + // make a significance difference.) + + CMP $0, R7 // Do we need to do any word-by-word copying? + BEQ noforwardlarge + ADD R3, R7, R9 // R9 points just past where we copy by word. + +forwardlargeloop: + MOVD.P 8(R4), R8 // R8 is just a scratch register. + MOVD.P R8, 8(R3) + CMP R3, R9 + BNE forwardlargeloop + +noforwardlarge: + CMP $0, R6 // Do we need to do any byte-by-byte copying? + BNE forwardtail + RET + +forwardtail: + ADD R3, R6, R9 // R9 points just past the destination memory. + +forwardtailloop: + MOVBU.P 1(R4), R8 + MOVBU.P R8, 1(R3) + CMP R3, R9 + BNE forwardtailloop + RET diff --git a/pkg/safecopy/safecopy.go b/pkg/safecopy/safecopy.go new file mode 100644 index 000000000..2fb7e5809 --- /dev/null +++ b/pkg/safecopy/safecopy.go @@ -0,0 +1,144 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package safecopy provides an efficient implementation of functions to access +// memory that may result in SIGSEGV or SIGBUS being sent to the accessor. +package safecopy + +import ( + "fmt" + "reflect" + "runtime" + "syscall" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// SegvError is returned when a safecopy function receives SIGSEGV. +type SegvError struct { + // Addr is the address at which the SIGSEGV occurred. + Addr uintptr +} + +// Error implements error.Error. +func (e SegvError) Error() string { + return fmt.Sprintf("SIGSEGV at %#x", e.Addr) +} + +// BusError is returned when a safecopy function receives SIGBUS. +type BusError struct { + // Addr is the address at which the SIGBUS occurred. + Addr uintptr +} + +// Error implements error.Error. +func (e BusError) Error() string { + return fmt.Sprintf("SIGBUS at %#x", e.Addr) +} + +// AlignmentError is returned when a safecopy function is passed an address +// that does not meet alignment requirements. +type AlignmentError struct { + // Addr is the invalid address. + Addr uintptr + + // Alignment is the required alignment. + Alignment uintptr +} + +// Error implements error.Error. +func (e AlignmentError) Error() string { + return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) +} + +var ( + // The begin and end addresses below are for the functions that are + // checked by the signal handler. + memcpyBegin uintptr + memcpyEnd uintptr + memclrBegin uintptr + memclrEnd uintptr + swapUint32Begin uintptr + swapUint32End uintptr + swapUint64Begin uintptr + swapUint64End uintptr + compareAndSwapUint32Begin uintptr + compareAndSwapUint32End uintptr + loadUint32Begin uintptr + loadUint32End uintptr + + // savedSigSegVHandler is a pointer to the SIGSEGV handler that was + // configured before we replaced it with our own. We still call into it + // when we get a SIGSEGV that is not interesting to us. + savedSigSegVHandler uintptr + + // same a above, but for SIGBUS signals. + savedSigBusHandler uintptr +) + +// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS +// signals. +func signalHandler() + +// FindEndAddress returns the end address (one byte beyond the last) of the +// function that contains the specified address (begin). +func FindEndAddress(begin uintptr) uintptr { + f := runtime.FuncForPC(begin) + if f != nil { + for p := begin; ; p++ { + g := runtime.FuncForPC(p) + if f != g { + return p + } + } + } + return begin +} + +// initializeAddresses initializes the addresses used by the signal handler. +func initializeAddresses() { + // The following functions are written in assembly language, so they won't + // be inlined by the existing compiler/linker. Tests will fail if this + // assumption is violated. + memcpyBegin = reflect.ValueOf(memcpy).Pointer() + memcpyEnd = FindEndAddress(memcpyBegin) + memclrBegin = reflect.ValueOf(memclr).Pointer() + memclrEnd = FindEndAddress(memclrBegin) + swapUint32Begin = reflect.ValueOf(swapUint32).Pointer() + swapUint32End = FindEndAddress(swapUint32Begin) + swapUint64Begin = reflect.ValueOf(swapUint64).Pointer() + swapUint64End = FindEndAddress(swapUint64Begin) + compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() + compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) + loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() + loadUint32End = FindEndAddress(loadUint32Begin) +} + +func init() { + initializeAddresses() + if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) + } + if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil { + panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) + } + syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) { + switch e.(type) { + case SegvError, BusError, AlignmentError: + return syscall.EFAULT, true + default: + return 0, false + } + }) +} diff --git a/pkg/safecopy/safecopy_test.go b/pkg/safecopy/safecopy_test.go new file mode 100644 index 000000000..5818f7f9b --- /dev/null +++ b/pkg/safecopy/safecopy_test.go @@ -0,0 +1,617 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "bytes" + "fmt" + "io/ioutil" + "math/rand" + "os" + "runtime/debug" + "syscall" + "testing" + "unsafe" +) + +// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular +// dependency. +const pageSize = 4096 + +func initRandom(b []byte) { + for i := range b { + b[i] = byte(rand.Intn(256)) + } +} + +func randBuf(size int) []byte { + b := make([]byte, size) + initRandom(b) + return b +} + +func TestCopyInSuccess(t *testing.T) { + // Test that CopyIn does not return an error when all pages are accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyIn(b, unsafe.Pointer(&a[0])) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopyOutSuccess(t *testing.T) { + // Test that CopyOut does not return an error when all pages are + // accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := CopyOut(unsafe.Pointer(&b[0]), a) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestCopySuccess(t *testing.T) { + // Test that Copy does not return an error when all pages are accessible. + const bufLen = 8192 + a := randBuf(bufLen) + b := make([]byte, bufLen) + + n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestZeroOutSuccess(t *testing.T) { + // Test that ZeroOut does not return an error when all pages are + // accessible. + const bufLen = 8192 + a := make([]byte, bufLen) + b := randBuf(bufLen) + + n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen) + if n != bufLen { + t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) + } + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if !bytes.Equal(a, b) { + t.Errorf("Buffers are not equal when they should be: %v %v", a, b) + } +} + +func TestSwapUint32Success(t *testing.T) { + // Test that SwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := SwapUint32(unsafe.Pointer(&val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestSwapUint32AlignmentError(t *testing.T) { + // Test that SwapUint32 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestSwapUint64Success(t *testing.T) { + // Test that SwapUint64 does not return an error when the page is + // accessible. + before := uint64(rand.Int63()) + after := uint64(rand.Int63()) + // "The first word in ... an allocated struct or slice can be relied upon + // to be 64-bit aligned." - sync/atomic docs + data := new(struct{ val uint64 }) + data.val = before + + old, err := SwapUint64(unsafe.Pointer(&data.val), after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if data.val != after { + t.Errorf("Unexpected new value: got %v, want %v", data.val, after) + } +} + +func TestSwapUint64AlignmentError(t *testing.T) { + // Test that SwapUint64 returns an AlignmentError when passed an unaligned + // address. + data := new(struct{ val1, val2 uint64 }) + addr := uintptr(unsafe.Pointer(&data.val1)) + 1 + want := AlignmentError{Addr: addr, Alignment: 8} + if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +func TestCompareAndSwapUint32Success(t *testing.T) { + // Test that CompareAndSwapUint32 does not return an error when the page is + // accessible. + before := uint32(rand.Int31()) + after := uint32(rand.Int31()) + val := before + + old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after) + if err != nil { + t.Errorf("Unexpected error: %v", err) + } + if old != before { + t.Errorf("Unexpected old value: got %v, want %v", old, before) + } + if val != after { + t.Errorf("Unexpected new value: got %v, want %v", val, after) + } +} + +func TestCompareAndSwapUint32AlignmentError(t *testing.T) { + // Test that CompareAndSwapUint32 returns an AlignmentError when passed an + // unaligned address. + data := new(struct{ val uint64 }) + addr := uintptr(unsafe.Pointer(&data.val)) + 1 + want := AlignmentError{Addr: addr, Alignment: 4} + if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } +} + +// withSegvErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGSEGV when accessed. +func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) { + mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil { + t.Fatalf("Mprotect failed: %v", err) + } + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +// withBusErrorTestMapping calls fn with a two-page mapping. The first page +// contains random data, and the second page generates SIGBUS when accessed. +func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) { + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + defer f.Close() + if err := f.Truncate(pageSize); err != nil { + t.Fatalf("Truncate failed: %v", err) + } + mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + } + defer syscall.Munmap(mapping) + initRandom(mapping[:pageSize]) + + fn(mapping) +} + +func TestCopyInSegvError(t *testing.T) { + // Test that CopyIn returns a SegvError when reaching a page that signals + // SIGSEGV. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyInBusError(t *testing.T) { + // Test that CopyIn returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := CopyIn(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyOutSegvError(t *testing.T) { + // Test that CopyOut returns a SegvError when reaching a page that signals + // SIGSEGV. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := CopyOut(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyOutBusError(t *testing.T) { + // Test that CopyOut returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := CopyOut(dst, src) + if n != bytesBeforeFault { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopySourceSegvError(t *testing.T) { + // Test that Copy returns a SegvError when copying from a page that signals + // SIGSEGV. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopySourceBusError(t *testing.T) { + // Test that Copy returns a BusError when copying from a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + dst := randBuf(pageSize) + n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationSegvError(t *testing.T) { + // Test that Copy returns a SegvError when copying to a page that signals + // SIGSEGV. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestCopyDestinationBusError(t *testing.T) { + // Test that Copy returns a BusError when copying to a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + src := randBuf(pageSize) + n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { + t.Errorf("Buffers are not equal when they should be: %v %v", got, want) + } + }) + }) + } +} + +func TestZeroOutSegvError(t *testing.T) { + // Test that ZeroOut returns a SegvError when reaching a page that signals + // SIGSEGV. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestZeroOutBusError(t *testing.T) { + // Test that ZeroOut returns a BusError when reaching a page that signals + // SIGBUS. + for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { + t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) + n, err := ZeroOut(dst, pageSize) + if n != uintptr(bytesBeforeFault) { + t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) + } + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { + t.Errorf("Non-zero bytes in written part of mapping: %v", got) + } + }) + }) + } +} + +func TestSwapUint32SegvError(t *testing.T) { + // Test that SwapUint32 returns a SegvError when reaching a page that + // signals SIGSEGV. + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint32BusError(t *testing.T) { + // Test that SwapUint32 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint32(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64SegvError(t *testing.T) { + // Test that SwapUint64 returns a SegvError when reaching a page that + // signals SIGSEGV. + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestSwapUint64BusError(t *testing.T) { + // Test that SwapUint64 returns a BusError when reaching a page that + // signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := SwapUint64(unsafe.Pointer(secondPage), 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32SegvError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a SegvError when reaching a page + // that signals SIGSEGV. + withSegvErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (SegvError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func TestCompareAndSwapUint32BusError(t *testing.T) { + // Test that CompareAndSwapUint32 returns a BusError when reaching a page + // that signals SIGBUS. + withBusErrorTestMapping(t, func(mapping []byte) { + secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize + _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) + if want := (BusError{secondPage}); err != want { + t.Errorf("Unexpected error: got %v, want %v", err, want) + } + }) +} + +func testCopy(dst, src []byte) (panicked bool) { + defer func() { + if r := recover(); r != nil { + panicked = true + } + }() + debug.SetPanicOnFault(true) + copy(dst, src) + return +} + +func TestSegVOnMemmove(t *testing.T) { + // Test that SIGSEGVs received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. + const bufLen = pageSize + a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} + +func TestSigbusOnMemmove(t *testing.T) { + // Test that SIGBUS received by runtime.memmove when *not* doing + // CopyIn or CopyOut work gets propagated to the runtime. + const bufLen = pageSize + f, err := ioutil.TempFile("", "sigbus_test") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + os.Remove(f.Name()) + defer f.Close() + + a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + t.Fatalf("Mmap failed: %v", err) + + } + defer syscall.Munmap(a) + b := randBuf(bufLen) + + if !testCopy(b, a) { + t.Fatalf("testCopy didn't panic when it should have") + } + + if !testCopy(a, b) { + t.Fatalf("testCopy didn't panic when it should have") + } +} diff --git a/pkg/safecopy/safecopy_unsafe.go b/pkg/safecopy/safecopy_unsafe.go new file mode 100644 index 000000000..eef028e68 --- /dev/null +++ b/pkg/safecopy/safecopy_unsafe.go @@ -0,0 +1,335 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safecopy + +import ( + "fmt" + "syscall" + "unsafe" +) + +// maxRegisterSize is the maximum register size used in memcpy and memclr. It +// is used to decide by how much to rewind the copy (for memcpy) or zeroing +// (for memclr) before proceeding. +const maxRegisterSize = 16 + +// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received +// during the copy, it returns the address that caused the fault and the number +// of the signal that was received. Otherwise, it returns an unspecified address +// and a signal number of 0. +// +// Data is copied in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully copied. +// +//go:noescape +func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS +// signal is received during the write, it returns the address that caused the +// fault and the number of the signal that was received. Otherwise, it returns +// an unspecified address and a signal number of 0. +// +// Data is written in order, such that if a fault happens at address p, it is +// safe to assume that all data before p-maxRegisterSize has already been +// successfully written. +// +//go:noescape +func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) + +// swapUint32 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) + +// swapUint64 atomically stores new into *ptr and returns (the previous *ptr +// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the +// value of old is unspecified, and sig is the number of the signal that was +// received. +// +// Preconditions: ptr must be aligned to a 8-byte boundary. +// +//go:noescape +func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) + +// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns +// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is +// received during the operation, the value of prev is unspecified, and sig is +// the number of the signal that was received. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) + +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +// +//go:noescape +func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) + +// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes +// copied and an error if SIGSEGV or SIGBUS is received while reading from src. +func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { + toCopy := uintptr(len(dst)) + if len(dst) == 0 { + return 0, nil + } + + fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy) + if sig == 0 { + return len(dst), nil + } + + faultN, srcN := uintptr(fault), uintptr(src) + if faultN < srcN || faultN >= srcN+toCopy { + panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done int + if faultN-srcN > maxRegisterSize { + done = int(faultN - srcN - maxRegisterSize) + } + n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done))) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// CopyOut copies len(src) bytes from src to dst. If returns the number of +// bytes done and an error if SIGSEGV or SIGBUS is received while writing to +// dst. +func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { + toCopy := uintptr(len(src)) + if toCopy == 0 { + return 0, nil + } + + fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy) + if sig == 0 { + return len(src), nil + } + + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toCopy { + panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done int + if faultN-dstN > maxRegisterSize { + done = int(faultN - dstN - maxRegisterSize) + } + n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)]) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// Copy copies toCopy bytes from src to dst. It returns the number of bytes +// copied and an error if SIGSEGV or SIGBUS is received while reading from src +// or writing to dst. +// +// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, +// the resulting contents of dst are unspecified. +func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { + if toCopy == 0 { + return 0, nil + } + + fault, sig := memcpy(dst, src, toCopy) + if sig == 0 { + return toCopy, nil + } + + // Did the fault occur while reading from src or writing to dst? + faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst) + faultAfterSrc := ^uintptr(0) + if faultN >= srcN { + faultAfterSrc = faultN - srcN + } + faultAfterDst := ^uintptr(0) + if faultN >= dstN { + faultAfterDst = faultN - dstN + } + if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { + panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) + } + faultedAfter := faultAfterSrc + if faultedAfter > faultAfterDst { + faultedAfter = faultAfterDst + } + + // memcpy might have ended the copy up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to copy up to the fault. + var done uintptr + if faultedAfter > maxRegisterSize { + done = faultedAfter - maxRegisterSize + } + n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes +// written and an error if SIGSEGV or SIGBUS is received while writing to dst. +func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { + if toZero == 0 { + return 0, nil + } + + fault, sig := memclr(dst, toZero) + if sig == 0 { + return toZero, nil + } + + faultN, dstN := uintptr(fault), uintptr(dst) + if faultN < dstN || faultN >= dstN+toZero { + panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) + } + + // memclr might have ended the write up to maxRegisterSize bytes before + // fault, if an instruction caused a memory access that straddled two + // pages, and the second one faulted. Try to write up to the fault. + var done uintptr + if faultN-dstN > maxRegisterSize { + done = faultN - dstN - maxRegisterSize + } + n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) + done += n + if err != nil { + return done, err + } + return done, errorFromFaultSignal(fault, sig) +} + +// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to a 4-byte boundary. +func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + old, sig := swapUint32(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns +// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is +// not aligned to an 8-byte boundary. +func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { + if addr := uintptr(ptr); addr&7 != 0 { + return 0, AlignmentError{addr, 8} + } + old, sig := swapUint64(ptr, new) + return old, errorFromFaultSignal(ptr, sig) +} + +// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, +// except that it returns an error if SIGSEGV or SIGBUS is received while +// accessing ptr, or if ptr is not aligned to a 4-byte boundary. +func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + prev, sig := compareAndSwapUint32(ptr, old, new) + return prev, errorFromFaultSignal(ptr, sig) +} + +// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It +// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. +// +// Preconditions: ptr must be aligned to a 4-byte boundary. +func LoadUint32(ptr unsafe.Pointer) (uint32, error) { + if addr := uintptr(ptr); addr&3 != 0 { + return 0, AlignmentError{addr, 4} + } + val, sig := loadUint32(ptr) + return val, errorFromFaultSignal(ptr, sig) +} + +func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { + switch sig { + case 0: + return nil + case int32(syscall.SIGSEGV): + return SegvError{uintptr(addr)} + case int32(syscall.SIGBUS): + return BusError{uintptr(addr)} + default: + panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) + } +} + +// ReplaceSignalHandler replaces the existing signal handler for the provided +// signal with the one that handles faults in safecopy-protected functions. +// +// It stores the value of the previously set handler in previous. +// +// This function will be called on initialization in order to install safecopy +// handlers for appropriate signals. These handlers will call the previous +// handler however, and if this is function is being used externally then the +// same courtesy is expected. +func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error { + var sa struct { + handler uintptr + flags uint64 + restorer uintptr + mask uint64 + } + const maskLen = 8 + + // Get the existing signal handler information, and save the current + // handler. Once we replace it, we will use this pointer to fall back to + // it when we receive other signals. + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { + return e + } + + // Fail if there isn't a previous handler. + if sa.handler == 0 { + return fmt.Errorf("previous handler for signal %x isn't set", sig) + } + + *previous = sa.handler + + // Install our own handler. + sa.handler = handler + if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { + return e + } + + return nil +} diff --git a/pkg/safecopy/sighandler_amd64.s b/pkg/safecopy/sighandler_amd64.s new file mode 100644 index 000000000..475ae48e9 --- /dev/null +++ b/pkg/safecopy/sighandler_amd64.s @@ -0,0 +1,133 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by sigHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_RDI 0x68 +#define REG_RAX 0x90 +#define REG_IP 0xa8 + +// Offset to the si_addr field of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. +// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with AX containing the faulting address and +// DI containing the signal number. Otherwise control is transferred to the +// previously configured signal handler (savedSigSegvHandler or +// savedSigBusHandler). +// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// DI - The signal number. +// SI - Pointer to siginfo_t structure. +// DX - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel. + MOVQ $0x0, CX + CMPL CX, SI_CODE(SI) + JGE original_handler + + // Check if RIP is within the area we care about. + MOVQ REG_IP(DX), CX + CMPQ CX, ·memcpyBegin(SB) + JB not_memcpy + CMPQ CX, ·memcpyEnd(SB) + JAE not_memcpy + + // Modify the context such that execution will resume in the fault + // handler. + LEAQ handleMemcpyFault(SB), CX + JMP handle_fault + +not_memcpy: + CMPQ CX, ·memclrBegin(SB) + JB not_memclr + CMPQ CX, ·memclrEnd(SB) + JAE not_memclr + + LEAQ handleMemclrFault(SB), CX + JMP handle_fault + +not_memclr: + CMPQ CX, ·swapUint32Begin(SB) + JB not_swapuint32 + CMPQ CX, ·swapUint32End(SB) + JAE not_swapuint32 + + LEAQ handleSwapUint32Fault(SB), CX + JMP handle_fault + +not_swapuint32: + CMPQ CX, ·swapUint64Begin(SB) + JB not_swapuint64 + CMPQ CX, ·swapUint64End(SB) + JAE not_swapuint64 + + LEAQ handleSwapUint64Fault(SB), CX + JMP handle_fault + +not_swapuint64: + CMPQ CX, ·compareAndSwapUint32Begin(SB) + JB not_casuint32 + CMPQ CX, ·compareAndSwapUint32End(SB) + JAE not_casuint32 + + LEAQ handleCompareAndSwapUint32Fault(SB), CX + JMP handle_fault + +not_casuint32: + CMPQ CX, ·loadUint32Begin(SB) + JB not_loaduint32 + CMPQ CX, ·loadUint32End(SB) + JAE not_loaduint32 + + LEAQ handleLoadUint32Fault(SB), CX + JMP handle_fault + +not_loaduint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. + XORQ CX, CX + MOVQ ·savedSigBusHandler(SB), AX + CMPL DI, $SIGSEGV + CMOVQEQ ·savedSigSegVHandler(SB), AX + JMP AX + +handle_fault: + // Entered with the address of the fault handler in RCX; store it in + // RIP. + MOVQ CX, REG_IP(DX) + + // Store the faulting address in RAX. + MOVQ SI_ADDR(SI), CX + MOVQ CX, REG_RAX(DX) + + // Store the signal number in EDI. + MOVL DI, REG_RDI(DX) + + RET diff --git a/pkg/safecopy/sighandler_arm64.s b/pkg/safecopy/sighandler_arm64.s new file mode 100644 index 000000000..53e4ac2c1 --- /dev/null +++ b/pkg/safecopy/sighandler_arm64.s @@ -0,0 +1,143 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "textflag.h" + +// The signals handled by sigHandler. +#define SIGBUS 7 +#define SIGSEGV 11 + +// Offsets to the registers in context->uc_mcontext.gregs[]. +#define REG_R0 0xB8 +#define REG_R1 0xC0 +#define REG_PC 0x1B8 + +// Offset to the si_addr field of siginfo. +#define SI_CODE 0x08 +#define SI_ADDR 0x10 + +// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must +// not be set up as a handler to any other signals. +// +// If the instruction causing the signal is within a safecopy-protected +// function, the signal is handled such that execution resumes in the +// appropriate fault handling stub with R0 containing the faulting address and +// R1 containing the signal number. Otherwise control is transferred to the +// previously configured signal handler (savedSigSegvHandler or +// savedSigBusHandler). +// +// This function cannot be written in go because it runs whenever a signal is +// received by the thread (preempting whatever was running), which includes when +// garbage collector has stopped or isn't expecting any interactions (like +// barriers). +// +// The arguments are the following: +// R0 - The signal number. +// R1 - Pointer to siginfo_t structure. +// R2 - Pointer to ucontext structure. +TEXT ·signalHandler(SB),NOSPLIT,$0 + // Check if the signal is from the kernel, si_code > 0 means a kernel signal. + MOVD SI_CODE(R1), R7 + CMPW $0x0, R7 + BLE original_handler + + // Check if PC is within the area we care about. + MOVD REG_PC(R2), R7 + MOVD ·memcpyBegin(SB), R8 + CMP R8, R7 + BLO not_memcpy + MOVD ·memcpyEnd(SB), R8 + CMP R8, R7 + BHS not_memcpy + + // Modify the context such that execution will resume in the fault handler. + MOVD $handleMemcpyFault(SB), R7 + B handle_fault + +not_memcpy: + MOVD ·memclrBegin(SB), R8 + CMP R8, R7 + BLO not_memclr + MOVD ·memclrEnd(SB), R8 + CMP R8, R7 + BHS not_memclr + + MOVD $handleMemclrFault(SB), R7 + B handle_fault + +not_memclr: + MOVD ·swapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint32 + MOVD ·swapUint32End(SB), R8 + CMP R8, R7 + BHS not_swapuint32 + + MOVD $handleSwapUint32Fault(SB), R7 + B handle_fault + +not_swapuint32: + MOVD ·swapUint64Begin(SB), R8 + CMP R8, R7 + BLO not_swapuint64 + MOVD ·swapUint64End(SB), R8 + CMP R8, R7 + BHS not_swapuint64 + + MOVD $handleSwapUint64Fault(SB), R7 + B handle_fault + +not_swapuint64: + MOVD ·compareAndSwapUint32Begin(SB), R8 + CMP R8, R7 + BLO not_casuint32 + MOVD ·compareAndSwapUint32End(SB), R8 + CMP R8, R7 + BHS not_casuint32 + + MOVD $handleCompareAndSwapUint32Fault(SB), R7 + B handle_fault + +not_casuint32: + MOVD ·loadUint32Begin(SB), R8 + CMP R8, R7 + BLO not_loaduint32 + MOVD ·loadUint32End(SB), R8 + CMP R8, R7 + BHS not_loaduint32 + + MOVD $handleLoadUint32Fault(SB), R7 + B handle_fault + +not_loaduint32: +original_handler: + // Jump to the previous signal handler, which is likely the golang one. + MOVD ·savedSigBusHandler(SB), R7 + MOVD ·savedSigSegVHandler(SB), R8 + CMPW $SIGSEGV, R0 + CSEL EQ, R8, R7, R7 + B (R7) + +handle_fault: + // Entered with the address of the fault handler in R7; store it in PC. + MOVD R7, REG_PC(R2) + + // Store the faulting address in R0. + MOVD SI_ADDR(R1), R7 + MOVD R7, REG_R0(R2) + + // Store the signal number in R1. + MOVW R0, REG_R1(R2) + + RET diff --git a/pkg/safemem/BUILD b/pkg/safemem/BUILD new file mode 100644 index 000000000..ce30382ab --- /dev/null +++ b/pkg/safemem/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "safemem", + srcs = [ + "block_unsafe.go", + "io.go", + "safemem.go", + "seq_unsafe.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/safecopy", + ], +) + +go_test( + name = "safemem_test", + size = "small", + srcs = [ + "io_test.go", + "seq_test.go", + ], + library = ":safemem", +) diff --git a/pkg/safemem/block_unsafe.go b/pkg/safemem/block_unsafe.go new file mode 100644 index 000000000..e7fd30743 --- /dev/null +++ b/pkg/safemem/block_unsafe.go @@ -0,0 +1,279 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "fmt" + "reflect" + "unsafe" + + "gvisor.dev/gvisor/pkg/safecopy" +) + +// A Block is a range of contiguous bytes, similar to []byte but with the +// following differences: +// +// - The memory represented by a Block may require the use of safecopy to +// access. +// +// - Block does not carry a capacity and cannot be expanded. +// +// Blocks are immutable and may be copied by value. The zero value of Block +// represents an empty range, analogous to a nil []byte. +type Block struct { + // [start, start+length) is the represented memory. + // + // start is an unsafe.Pointer to ensure that Block prevents the represented + // memory from being garbage-collected. + start unsafe.Pointer + length int + + // needSafecopy is true if accessing the represented memory requires the + // use of safecopy. + needSafecopy bool +} + +// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to +// access without safecopy. +func BlockFromSafeSlice(slice []byte) Block { + return blockFromSlice(slice, false) +} + +// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to +// access without safecopy. +func BlockFromUnsafeSlice(slice []byte) Block { + return blockFromSlice(slice, true) +} + +func blockFromSlice(slice []byte, needSafecopy bool) Block { + if len(slice) == 0 { + return Block{} + } + return Block{ + start: unsafe.Pointer(&slice[0]), + length: len(slice), + needSafecopy: needSafecopy, + } +} + +// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is +// safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. +func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, false) +} + +// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which +// is not safe to access without safecopy. +// +// Preconditions: ptr+len does not overflow. +func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { + return blockFromPointer(ptr, len, true) +} + +func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { + if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { + panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) + } + return Block{ + start: ptr, + length: len, + needSafecopy: needSafecopy, + } +} + +// DropFirst returns a Block equivalent to b, but with the first n bytes +// omitted. It is analogous to the [n:] operation on a slice, except that if n +// > b.Len(), DropFirst returns an empty Block instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) DropFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.DropFirst64(uint64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes a uint64. +func (b Block) DropFirst64(n uint64) Block { + if n >= uint64(b.length) { + return Block{} + } + return Block{ + start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), + length: b.length - int(n), + needSafecopy: b.needSafecopy, + } +} + +// TakeFirst returns a Block equivalent to the first n bytes of b. It is +// analogous to the [:n] operation on a slice, except that if n > b.Len(), +// TakeFirst returns a copy of b instead of panicking. +// +// Preconditions: n >= 0. +func (b Block) TakeFirst(n int) Block { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return b.TakeFirst64(uint64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes a uint64. +func (b Block) TakeFirst64(n uint64) Block { + if n == 0 { + return Block{} + } + if n >= uint64(b.length) { + return b + } + return Block{ + start: b.start, + length: int(n), + needSafecopy: b.needSafecopy, + } +} + +// ToSlice returns a []byte equivalent to b. +func (b Block) ToSlice() []byte { + var bs []byte + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + hdr.Data = uintptr(b.start) + hdr.Len = b.length + hdr.Cap = b.length + return bs +} + +// Addr returns b's start address as a uintptr. It returns uintptr instead of +// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers +// without importing the unsafe package explicitly. +// +// Note that a uintptr is not recognized as a pointer by the garbage collector, +// such that if there are no uses of b after a call to b.Addr() and the address +// is to Go-managed memory, the returned uintptr does not prevent garbage +// collection of the pointee. +func (b Block) Addr() uintptr { + return uintptr(b.start) +} + +// Len returns b's length in bytes. +func (b Block) Len() int { + return b.length +} + +// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. +func (b Block) NeedSafecopy() bool { + return b.needSafecopy +} + +// String implements fmt.Stringer.String. +func (b Block) String() string { + if uintptr(b.start) == 0 && b.length == 0 { + return "" + } + var suffix string + if b.needSafecopy { + suffix = "*" + } + return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) +} + +// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src +// to dst and returns the number of bytes copied. +// +// If src and dst overlap, the data stored in dst is unspecified. +func Copy(dst, src Block) (int, error) { + if !dst.needSafecopy && !src.needSafecopy { + return copy(dst.ToSlice(), src.ToSlice()), nil + } + + n := dst.length + if n > src.length { + n = src.length + } + if n == 0 { + return 0, nil + } + + switch { + case dst.needSafecopy && !src.needSafecopy: + return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) + case !dst.needSafecopy && src.needSafecopy: + return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) + case dst.needSafecopy && src.needSafecopy: + n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) + return int(n64), err + default: + panic("unreachable") + } +} + +// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. +func Zero(dst Block) (int, error) { + if !dst.needSafecopy { + bs := dst.ToSlice() + for i := range bs { + bs[i] = 0 + } + return len(bs), nil + } + + n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) + return int(n64), err +} + +// Safecopy atomics are no slower than non-safecopy atomics, so use the former +// even when !b.needSafecopy to get consistent alignment checking. + +// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func SwapUint32(b Block, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint32(b.start, new) +} + +// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. +// +// Preconditions: b.Len() >= 8. +func SwapUint64(b Block, new uint64) (uint64, error) { + if b.length < 8 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.SwapUint64(b.start, new) +} + +// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 +// bytes of b. +// +// Preconditions: b.Len() >= 4. +func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.CompareAndSwapUint32(b.start, old, new) +} + +// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. +// +// Preconditions: b.Len() >= 4. +func LoadUint32(b Block) (uint32, error) { + if b.length < 4 { + panic(fmt.Sprintf("insufficient length: %d", b.length)) + } + return safecopy.LoadUint32(b.start) +} diff --git a/pkg/safemem/io.go b/pkg/safemem/io.go new file mode 100644 index 000000000..f039a5c34 --- /dev/null +++ b/pkg/safemem/io.go @@ -0,0 +1,392 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "errors" + "io" + "math" +) + +// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write +// beyond the end of the BlockSeq. +var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") + +// Reader represents a streaming byte source like io.Reader. +type Reader interface { + // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the + // number of bytes read. It may return a partial read without an error + // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a + // full read with an error (i.e. (dsts.NumBytes(), err) where err != nil); + // note that this differs from io.Reader.Read (in particular, io.EOF should + // not be returned if ReadToBlocks successfully reads dsts.NumBytes() + // bytes.) + ReadToBlocks(dsts BlockSeq) (uint64, error) +} + +// Writer represents a streaming byte sink like io.Writer. +type Writer interface { + // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns + // the number of bytes written. It may return a partial write without an + // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not + // return a full write with an error (i.e. srcs.NumBytes(), err) where err + // != nil). + WriteFromBlocks(srcs BlockSeq) (uint64, error) +} + +// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() +// bytes have been read or ReadToBlocks returns an error. +func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := r.ReadToBlocks(dsts) + done += n + if err != nil { + return done, err + } + dsts = dsts.DropFirst64(n) + } + return done, nil +} + +// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until +// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. +func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { + var done uint64 + for !srcs.IsEmpty() { + n, err := w.WriteFromBlocks(srcs) + done += n + if err != nil { + return done, err + } + srcs = srcs.DropFirst64(n) + } + return done, nil +} + +// BlockSeqReader implements Reader by reading from a BlockSeq. +type BlockSeqReader struct { + Blocks BlockSeq +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + n, err := CopySeq(dsts, r.Blocks) + r.Blocks = r.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < dsts.NumBytes() { + return n, io.EOF + } + return n, nil +} + +// BlockSeqWriter implements Writer by writing to a BlockSeq. +type BlockSeqWriter struct { + Blocks BlockSeq +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + n, err := CopySeq(w.Blocks, srcs) + w.Blocks = w.Blocks.DropFirst64(n) + if err != nil { + return n, err + } + if n < srcs.NumBytes() { + return n, ErrEndOfBlockSeq + } + return n, nil +} + +// ReaderFunc implements Reader for a function with the semantics of +// Reader.ReadToBlocks. +type ReaderFunc func(dsts BlockSeq) (uint64, error) + +// ReadToBlocks implements Reader.ReadToBlocks. +func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + return f(dsts) +} + +// WriterFunc implements Writer for a function with the semantics of +// Writer.WriteFromBlocks. +type WriterFunc func(srcs BlockSeq) (uint64, error) + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + return f(srcs) +} + +// ToIOReader implements io.Reader for a (safemem.)Reader. +// +// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does +// so. +type ToIOReader struct { + Reader Reader +} + +// Read implements io.Reader.Read. +func (r ToIOReader) Read(dst []byte) (int, error) { + n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) + return int(n), err +} + +// ToIOWriter implements io.Writer for a (safemem.)Writer. +type ToIOWriter struct { + Writer Writer +} + +// Write implements io.Writer.Write. +func (w ToIOWriter) Write(src []byte) (int, error) { + // io.Writer does not permit partial writes. + n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) + return int(n), err +} + +// FromIOReader implements Reader for an io.Reader by repeatedly invoking +// io.Reader.Read until it returns an error or partial read. This is not +// thread-safe. +// +// FromIOReader will return a successful partial read iff Reader.Read does so. +type FromIOReader struct { + Reader io.Reader +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.Reader.Read(dst.ToSlice()) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.Reader.Read(buf[:dst.Len()]) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + +// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly +// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial +// read indicates an error. This is not thread-safe. +type FromIOReaderAt struct { + ReaderAt io.ReaderAt + Offset int64 +} + +// ReadToBlocks implements Reader.ReadToBlocks. +func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !dsts.IsEmpty() { + dst := dsts.Head() + var n int + var err error + n, buf, err = r.readToBlock(dst, buf) + done += uint64(n) + if n != dst.Len() { + return done, err + } + dsts = dsts.Tail() + if err != nil { + if dsts.IsEmpty() && err == io.EOF { + return done, nil + } + return done, err + } + } + return done, nil +} + +func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { + // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !dst.NeedSafecopy() { + n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) + r.Offset += int64(n) + return n, buf, err + } + if len(buf) < dst.Len() { + buf = make([]byte, dst.Len()) + } + rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) + r.Offset += int64(rn) + wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) + if wberr != nil { + return wbn, buf, wberr + } + return wbn, buf, rerr +} + +// FromIOWriter implements Writer for an io.Writer by repeatedly invoking +// io.Writer.Write until it returns an error or partial write. +// +// FromIOWriter will tolerate implementations of io.Writer.Write that return +// partial writes with a nil error in contravention of io.Writer's +// requirements, since Writer is permitted to do so. FromIOWriter will return a +// successful partial write iff Writer.Write does so. +type FromIOWriter struct { + Writer io.Writer +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + var buf []byte + var done uint64 + for !srcs.IsEmpty() { + src := srcs.Head() + var n int + var err error + n, buf, err = w.writeFromBlock(src, buf) + done += uint64(n) + if n != src.Len() || err != nil { + return done, err + } + srcs = srcs.Tail() + } + return done, nil +} + +func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { + // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require + // safecopy. + if !src.NeedSafecopy() { + n, err := w.Writer.Write(src.ToSlice()) + return n, buf, err + } + if len(buf) < src.Len() { + buf = make([]byte, src.Len()) + } + bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) + wn, werr := w.Writer.Write(buf[:bufn]) + if werr != nil { + return wn, buf, werr + } + return wn, buf, buferr +} + +// FromVecReaderFunc implements Reader for a function that reads data into a +// [][]byte and returns the number of bytes read as an int64. +type FromVecReaderFunc struct { + ReadVec func(dsts [][]byte) (int64, error) +} + +// ReadToBlocks implements Reader.ReadToBlocks. +// +// ReadToBlocks calls r.ReadVec at most once. +func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) + dstSlices := make([][]byte, 0, dsts.NumBlocks()) + // Buffer Blocks that require safecopy. + for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { + dst := tmp.Head() + if dst.NeedSafecopy() { + dstSlices = append(dstSlices, make([]byte, dst.Len())) + } else { + dstSlices = append(dstSlices, dst.ToSlice()) + } + } + rn, rerr := r.ReadVec(dstSlices) + dsts = dsts.TakeFirst64(uint64(rn)) + var done uint64 + var i int + for !dsts.IsEmpty() { + dst := dsts.Head() + if dst.NeedSafecopy() { + n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) + done += uint64(n) + if err != nil { + return done, err + } + } else { + done += uint64(dst.Len()) + } + dsts = dsts.Tail() + i++ + } + return done, rerr +} + +// FromVecWriterFunc implements Writer for a function that writes data from a +// [][]byte and returns the number of bytes written. +type FromVecWriterFunc struct { + WriteVec func(srcs [][]byte) (int64, error) +} + +// WriteFromBlocks implements Writer.WriteFromBlocks. +// +// WriteFromBlocks calls w.WriteVec at most once. +func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + // Ensure that we don't pass a [][]byte with a total length > MaxInt64. + srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) + srcSlices := make([][]byte, 0, srcs.NumBlocks()) + // Buffer Blocks that require safecopy. + var buferr error + for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { + src := tmp.Head() + if src.NeedSafecopy() { + slice := make([]byte, src.Len()) + n, err := Copy(BlockFromSafeSlice(slice), src) + srcSlices = append(srcSlices, slice[:n]) + if err != nil { + buferr = err + break + } + } else { + srcSlices = append(srcSlices, src.ToSlice()) + } + } + n, err := w.WriteVec(srcSlices) + if err != nil { + return uint64(n), err + } + return uint64(n), buferr +} diff --git a/pkg/safemem/io_test.go b/pkg/safemem/io_test.go new file mode 100644 index 000000000..629741bee --- /dev/null +++ b/pkg/safemem/io_test.go @@ -0,0 +1,199 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "bytes" + "io" + "testing" +) + +func makeBlocks(slices ...[]byte) []Block { + blocks := make([]Block, 0, len(slices)) + for _, s := range slices { + blocks = append(blocks, BlockFromSafeSlice(s)) + } + return blocks +} + +func TestFromIOReaderFullRead(t *testing.T) { + r := FromIOReader{bytes.NewBufferString("foobar")} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +type eofHidingReader struct { + Reader io.Reader +} + +func (r eofHidingReader) Read(dst []byte) (int, error) { + n, err := r.Reader.Read(dst) + if err == io.EOF { + return n, nil + } + return n, err +} + +func TestFromIOReaderPartialRead(t *testing.T) { + r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + // FromIOReader should stop after the eofHidingReader returns (1, nil) + // for a 3-byte read. + if wantN := uint64(4); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +type singleByteReader struct { + Reader io.Reader +} + +func (r singleByteReader) Read(dst []byte) (int, error) { + if len(dst) == 0 { + return r.Reader.Read(dst) + } + return r.Reader.Read(dst[:1]) +} + +func TestSingleByteReader(t *testing.T) { + r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) + // FromIOReader should stop after the singleByteReader returns (1, nil) + // for a 3-byte read. + if wantN := uint64(1); n != wantN || err != nil { + t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +func TestReadFullToBlocks(t *testing.T) { + r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} + dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) + n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts)) + // ReadFullToBlocks should call into FromIOReader => singleByteReader + // repeatedly until dsts is exhausted. + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { + if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { + t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) + } + } +} + +func TestFromIOWriterFullWrite(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{&dst} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +type limitedWriter struct { + Writer io.Writer + Done int + Limit int +} + +func (w *limitedWriter) Write(src []byte) (int, error) { + count := len(src) + if count > (w.Limit - w.Done) { + count = w.Limit - w.Done + } + n, err := w.Writer.Write(src[:count]) + w.Done += n + return n, err +} + +func TestFromIOWriterPartialWrite(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{&limitedWriter{&dst, 0, 4}} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + // FromIOWriter should stop after the limitedWriter returns (1, nil) for a + // 3-byte write. + if wantN := uint64(4); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +type singleByteWriter struct { + Writer io.Writer +} + +func (w singleByteWriter) Write(src []byte) (int, error) { + if len(src) == 0 { + return w.Writer.Write(src) + } + return w.Writer.Write(src[:1]) +} + +func TestSingleByteWriter(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{singleByteWriter{&dst}} + n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) + // FromIOWriter should stop after the singleByteWriter returns (1, nil) + // for a 3-byte write. + if wantN := uint64(1); n != wantN || err != nil { + t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestWriteFullToBlocks(t *testing.T) { + srcs := makeBlocks([]byte("foo"), []byte("bar")) + var dst bytes.Buffer + w := FromIOWriter{singleByteWriter{&dst}} + n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs)) + // WriteFullToBlocks should call into FromIOWriter => singleByteWriter + // repeatedly until srcs is exhausted. + if wantN := uint64(6); n != wantN || err != nil { + t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} diff --git a/pkg/safemem/safemem.go b/pkg/safemem/safemem.go new file mode 100644 index 000000000..3e70d33a2 --- /dev/null +++ b/pkg/safemem/safemem.go @@ -0,0 +1,16 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package safemem provides the Block and BlockSeq types. +package safemem diff --git a/pkg/safemem/seq_test.go b/pkg/safemem/seq_test.go new file mode 100644 index 000000000..eba4bb535 --- /dev/null +++ b/pkg/safemem/seq_test.go @@ -0,0 +1,196 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "bytes" + "reflect" + "testing" +) + +type blockSeqTest struct { + desc string + + pieces []string + haveOffset bool + offset uint64 + haveLimit bool + limit uint64 + + want string +} + +func (t blockSeqTest) NonEmptyByteSlices() [][]byte { + // t is a value, so we can mutate it freely. + slices := make([][]byte, 0, len(t.pieces)) + for _, str := range t.pieces { + if t.haveOffset { + strOff := t.offset + if strOff > uint64(len(str)) { + strOff = uint64(len(str)) + } + str = str[strOff:] + t.offset -= strOff + } + if t.haveLimit { + strLim := t.limit + if strLim > uint64(len(str)) { + strLim = uint64(len(str)) + } + str = str[:strLim] + t.limit -= strLim + } + if len(str) != 0 { + slices = append(slices, []byte(str)) + } + } + return slices +} + +func (t blockSeqTest) BlockSeq() BlockSeq { + blocks := make([]Block, 0, len(t.pieces)) + for _, str := range t.pieces { + blocks = append(blocks, BlockFromSafeSlice([]byte(str))) + } + bs := BlockSeqFromSlice(blocks) + if t.haveOffset { + bs = bs.DropFirst64(t.offset) + } + if t.haveLimit { + bs = bs.TakeFirst64(t.limit) + } + return bs +} + +var blockSeqTests = []blockSeqTest{ + { + desc: "Empty sequence", + }, + { + desc: "Sequence of length 1", + pieces: []string{"foobar"}, + want: "foobar", + }, + { + desc: "Sequence of length 2", + pieces: []string{"foo", "bar"}, + want: "foobar", + }, + { + desc: "Empty Blocks", + pieces: []string{"", "foo", "", "", "bar", ""}, + want: "foobar", + }, + { + desc: "Sequence with non-zero offset", + pieces: []string{"foo", "bar"}, + haveOffset: true, + offset: 2, + want: "obar", + }, + { + desc: "Sequence with non-maximal limit", + pieces: []string{"foo", "bar"}, + haveLimit: true, + limit: 5, + want: "fooba", + }, + { + desc: "Sequence with offset and limit", + pieces: []string{"foo", "bar"}, + haveOffset: true, + offset: 2, + haveLimit: true, + limit: 3, + want: "oba", + }, +} + +func TestBlockSeqNumBytes(t *testing.T) { + for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want { + t.Errorf("NumBytes: got %d, wanted %d", got, want) + } + }) + } +} + +func TestBlockSeqIterBlocks(t *testing.T) { + // Tests BlockSeq iteration using Head/Tail. + for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + srcs := test.BlockSeq() + // "Note that a non-nil empty slice and a nil slice ... are not + // deeply equal." - reflect + slices := make([][]byte, 0, 0) + for !srcs.IsEmpty() { + src := srcs.Head() + slices = append(slices, src.ToSlice()) + nextSrcs := srcs.Tail() + if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want { + t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) + } + srcs = nextSrcs + } + if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) { + t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices) + } + }) + } +} + +func TestBlockSeqIterBytes(t *testing.T) { + // Tests BlockSeq iteration using Head/DropFirst. + for _, test := range blockSeqTests { + t.Run(test.desc, func(t *testing.T) { + srcs := test.BlockSeq() + var dst bytes.Buffer + for !srcs.IsEmpty() { + src := srcs.Head() + var b [1]byte + n, err := Copy(BlockFromSafeSlice(b[:]), src) + if n != 1 || err != nil { + t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err) + } + dst.WriteByte(b[0]) + nextSrcs := srcs.DropFirst(1) + if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want { + t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) + } + srcs = nextSrcs + } + if got := string(dst.Bytes()); got != test.want { + t.Errorf("Copied string: got %q, wanted %q", got, test.want) + } + }) + } +} + +func TestBlockSeqDropBeyondLimit(t *testing.T) { + blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))} + bs := BlockSeqFromSlice(blocks) + if got, want := bs.NumBytes(), uint64(4); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } + bs = bs.TakeFirst(1) + if got, want := bs.NumBytes(), uint64(1); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } + bs = bs.DropFirst(2) + if got, want := bs.NumBytes(), uint64(0); got != want { + t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) + } +} diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go new file mode 100644 index 000000000..354a95dde --- /dev/null +++ b/pkg/safemem/seq_unsafe.go @@ -0,0 +1,299 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package safemem + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" +) + +// A BlockSeq represents a sequence of Blocks, each of which has non-zero +// length. +// +// BlockSeqs are immutable and may be copied by value. The zero value of +// BlockSeq represents an empty sequence. +type BlockSeq struct { + // If length is 0, then the BlockSeq is empty. Invariants: data == 0; + // offset == 0; limit == 0. + // + // If length is -1, then the BlockSeq represents the single Block{data, + // limit, false}. Invariants: offset == 0; limit > 0; limit does not + // overflow the range of an int. + // + // If length is -2, then the BlockSeq represents the single Block{data, + // limit, true}. Invariants: offset == 0; limit > 0; limit does not + // overflow the range of an int. + // + // Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks + // in the array of Blocks starting at address `data`, starting at `offset` + // bytes into the first Block and limited to the following `limit` bytes. + // Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <= + // the combined length of all Blocks in the array; the first Block in the + // array has non-zero length. + // + // length is never 1; sequences consisting of a single Block are always + // stored inline (with length < 0). + data unsafe.Pointer + length int + offset int + limit uint64 +} + +// BlockSeqOf returns a BlockSeq representing the single Block b. +func BlockSeqOf(b Block) BlockSeq { + bs := BlockSeq{ + data: b.start, + length: -1, + limit: uint64(b.length), + } + if b.needSafecopy { + bs.length = -2 + } + return bs +} + +// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. +// If slice contains Blocks with zero length, BlockSeq will skip them during +// iteration. +// +// Whether the returned BlockSeq shares memory with slice is unspecified; +// clients should avoid mutating slices passed to BlockSeqFromSlice. +// +// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. +func BlockSeqFromSlice(slice []Block) BlockSeq { + slice = skipEmpty(slice) + var limit uint64 + for _, b := range slice { + sum := limit + uint64(b.Len()) + if sum < limit { + panic("BlockSeq length overflows uint64") + } + limit = sum + } + return blockSeqFromSliceLimited(slice, limit) +} + +// Preconditions: The combined length of all Blocks in slice <= limit. If +// len(slice) != 0, the first Block in slice has non-zero length, and limit > +// 0. +func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { + switch len(slice) { + case 0: + return BlockSeq{} + case 1: + return BlockSeqOf(slice[0].TakeFirst64(limit)) + default: + return BlockSeq{ + data: unsafe.Pointer(&slice[0]), + length: len(slice), + limit: limit, + } + } +} + +func skipEmpty(slice []Block) []Block { + for i, b := range slice { + if b.Len() != 0 { + return slice[i:] + } + } + return nil +} + +// IsEmpty returns true if bs contains no Blocks. +// +// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). +// (Of these, prefer to use bs.IsEmpty().) +func (bs BlockSeq) IsEmpty() bool { + return bs.length == 0 +} + +// NumBlocks returns the number of Blocks in bs. +func (bs BlockSeq) NumBlocks() int { + // In general, we have to count: if bs represents a windowed slice then the + // slice may contain Blocks with zero length, and bs.length may be larger + // than the actual number of Blocks due to bs.limit. + var n int + for !bs.IsEmpty() { + n++ + bs = bs.Tail() + } + return n +} + +// NumBytes returns the sum of Block.Len() for all Blocks in bs. +func (bs BlockSeq) NumBytes() uint64 { + return bs.limit +} + +// Head returns the first Block in bs. +// +// Preconditions: !bs.IsEmpty(). +func (bs BlockSeq) Head() Block { + if bs.length == 0 { + panic("empty BlockSeq") + } + if bs.length < 0 { + return bs.internalBlock() + } + return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) +} + +// Preconditions: bs.length < 0. +func (bs BlockSeq) internalBlock() Block { + return Block{ + start: bs.data, + length: int(bs.limit), + needSafecopy: bs.length == -2, + } +} + +// Tail returns a BlockSeq consisting of all Blocks in bs after the first. +// +// Preconditions: !bs.IsEmpty(). +func (bs BlockSeq) Tail() BlockSeq { + if bs.length == 0 { + panic("empty BlockSeq") + } + if bs.length < 0 { + return BlockSeq{} + } + head := (*Block)(bs.data).DropFirst(bs.offset) + headLen := uint64(head.Len()) + if headLen >= bs.limit { + // The head Block exhausts the limit, so the tail is empty. + return BlockSeq{} + } + var extSlice []Block + extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) + extSliceHdr.Data = uintptr(bs.data) + extSliceHdr.Len = bs.length + extSliceHdr.Cap = bs.length + tailSlice := skipEmpty(extSlice[1:]) + tailLimit := bs.limit - headLen + return blockSeqFromSliceLimited(tailSlice, tailLimit) +} + +// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes +// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. +// +// Preconditions: n >= 0. +func (bs BlockSeq) DropFirst(n int) BlockSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return bs.DropFirst64(uint64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes an uint64. +func (bs BlockSeq) DropFirst64(n uint64) BlockSeq { + if n >= bs.limit { + return BlockSeq{} + } + for { + // Calling bs.Head() here is surprisingly expensive, so inline getting + // the head's length. + var headLen uint64 + if bs.length < 0 { + headLen = bs.limit + } else { + headLen = uint64((*Block)(bs.data).Len() - bs.offset) + } + if n < headLen { + // Dropping ends partway through the head Block. + if bs.length < 0 { + return BlockSeqOf(bs.internalBlock().DropFirst64(n)) + } + bs.offset += int(n) + bs.limit -= n + return bs + } + n -= headLen + bs = bs.Tail() + } +} + +// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n > +// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs. +// +// Preconditions: n >= 0. +func (bs BlockSeq) TakeFirst(n int) BlockSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return bs.TakeFirst64(uint64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes a uint64. +func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq { + if n == 0 { + return BlockSeq{} + } + if bs.limit > n { + bs.limit = n + } + return bs +} + +// String implements fmt.Stringer.String. +func (bs BlockSeq) String() string { + var buf bytes.Buffer + buf.WriteByte('[') + var sep string + for !bs.IsEmpty() { + buf.WriteString(sep) + sep = " " + buf.WriteString(bs.Head().String()) + bs = bs.Tail() + } + buf.WriteByte(']') + return buf.String() +} + +// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less, +// from srcs to dsts and returns the number of bytes copied. +// +// If srcs and dsts overlap, the data stored in dsts is unspecified. +func CopySeq(dsts, srcs BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() && !srcs.IsEmpty() { + dst := dsts.Head() + src := srcs.Head() + n, err := Copy(dst, src) + done += uint64(n) + if err != nil { + return done, err + } + dsts = dsts.DropFirst(n) + srcs = srcs.DropFirst(n) + } + return done, nil +} + +// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed. +func ZeroSeq(dsts BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := Zero(dsts.Head()) + done += uint64(n) + if err != nil { + return done, err + } + dsts = dsts.DropFirst(n) + } + return done, nil +} diff --git a/pkg/sentry/arch/BUILD b/pkg/sentry/arch/BUILD index 51ca09b24..34c0a867d 100644 --- a/pkg/sentry/arch/BUILD +++ b/pkg/sentry/arch/BUILD @@ -30,13 +30,13 @@ go_library( ":registers_go_proto", "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/cpuid", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/limits", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/arch/arch.go b/pkg/sentry/arch/arch.go index 81ec98a77..1d11cc472 100644 --- a/pkg/sentry/arch/arch.go +++ b/pkg/sentry/arch/arch.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Arch describes an architecture. diff --git a/pkg/sentry/arch/arch_aarch64.go b/pkg/sentry/arch/arch_aarch64.go index ea4dedbdf..3b6987665 100644 --- a/pkg/sentry/arch/arch_aarch64.go +++ b/pkg/sentry/arch/arch_aarch64.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/arch/arch_amd64.go b/pkg/sentry/arch/arch_amd64.go index 2aa08b1a9..85d6acc0f 100644 --- a/pkg/sentry/arch/arch_amd64.go +++ b/pkg/sentry/arch/arch_amd64.go @@ -25,7 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Host specifies the host architecture. diff --git a/pkg/sentry/arch/arch_arm64.go b/pkg/sentry/arch/arch_arm64.go index 0d5b7d317..94f1a808f 100644 --- a/pkg/sentry/arch/arch_arm64.go +++ b/pkg/sentry/arch/arch_arm64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Host specifies the host architecture. diff --git a/pkg/sentry/arch/arch_state_x86.go b/pkg/sentry/arch/arch_state_x86.go index 84f11b0d1..d388ee9cf 100644 --- a/pkg/sentry/arch/arch_state_x86.go +++ b/pkg/sentry/arch/arch_state_x86.go @@ -21,7 +21,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/cpuid" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // ErrFloatingPoint indicates a failed restore due to unusable floating point diff --git a/pkg/sentry/arch/arch_x86.go b/pkg/sentry/arch/arch_x86.go index 9f41e566f..a18093155 100644 --- a/pkg/sentry/arch/arch_x86.go +++ b/pkg/sentry/arch/arch_x86.go @@ -25,9 +25,9 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // System-related constants for x86. diff --git a/pkg/sentry/arch/auxv.go b/pkg/sentry/arch/auxv.go index 4546b2ef9..2b4c8f3fc 100644 --- a/pkg/sentry/arch/auxv.go +++ b/pkg/sentry/arch/auxv.go @@ -15,7 +15,7 @@ package arch import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // An AuxEntry represents an entry in an ELF auxiliary vector. diff --git a/pkg/sentry/arch/signal.go b/pkg/sentry/arch/signal.go index 402e46025..8b03d0187 100644 --- a/pkg/sentry/arch/signal.go +++ b/pkg/sentry/arch/signal.go @@ -16,7 +16,7 @@ package arch import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalAct represents the action that should be taken when a signal is diff --git a/pkg/sentry/arch/signal_amd64.go b/pkg/sentry/arch/signal_amd64.go index 1e4f9c3c2..81b92bb43 100644 --- a/pkg/sentry/arch/signal_amd64.go +++ b/pkg/sentry/arch/signal_amd64.go @@ -23,7 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go index 7d0e98935..4f4cc46a8 100644 --- a/pkg/sentry/arch/signal_arm64.go +++ b/pkg/sentry/arch/signal_arm64.go @@ -19,7 +19,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the diff --git a/pkg/sentry/arch/signal_stack.go b/pkg/sentry/arch/signal_stack.go index d324da705..1a6056171 100644 --- a/pkg/sentry/arch/signal_stack.go +++ b/pkg/sentry/arch/signal_stack.go @@ -17,7 +17,7 @@ package arch import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/arch/stack.go b/pkg/sentry/arch/stack.go index 7472c3c61..09bceabc9 100644 --- a/pkg/sentry/arch/stack.go +++ b/pkg/sentry/arch/stack.go @@ -18,8 +18,8 @@ import ( "encoding/binary" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/usermem" ) // Stack is a simple wrapper around a usermem.IO and an address. diff --git a/pkg/sentry/context/BUILD b/pkg/sentry/context/BUILD deleted file mode 100644 index e13a9ce20..000000000 --- a/pkg/sentry/context/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "context", - srcs = ["context.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/amutex", - "//pkg/log", - ], -) diff --git a/pkg/sentry/context/context.go b/pkg/sentry/context/context.go deleted file mode 100644 index 23e009ef3..000000000 --- a/pkg/sentry/context/context.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package context defines an internal context type. -// -// The given Context conforms to the standard Go context, but mandates -// additional methods that are specific to the kernel internals. Note however, -// that the Context described by this package carries additional constraints -// regarding concurrent access and retaining beyond the scope of a call. -// -// See the Context type for complete details. -package context - -import ( - "context" - "time" - - "gvisor.dev/gvisor/pkg/amutex" - "gvisor.dev/gvisor/pkg/log" -) - -type contextID int - -// Globally accessible values from a context. These keys are defined in the -// context package to resolve dependency cycles by not requiring the caller to -// import packages usually required to get these information. -const ( - // CtxThreadGroupID is the current thread group ID when a context represents - // a task context. The value is represented as an int32. - CtxThreadGroupID contextID = iota -) - -// ThreadGroupIDFromContext returns the current thread group ID when ctx -// represents a task context. -func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) { - if tgid := ctx.Value(CtxThreadGroupID); tgid != nil { - return tgid.(int32), true - } - return 0, false -} - -// A Context represents a thread of execution (hereafter "goroutine" to reflect -// Go idiosyncrasy). It carries state associated with the goroutine across API -// boundaries. -// -// While Context exists for essentially the same reasons as Go's standard -// context.Context, the standard type represents the state of an operation -// rather than that of a goroutine. This is a critical distinction: -// -// - Unlike context.Context, which "may be passed to functions running in -// different goroutines", it is *not safe* to use the same Context in multiple -// concurrent goroutines. -// -// - It is *not safe* to retain a Context passed to a function beyond the scope -// of that function call. -// -// In both cases, values extracted from the Context should be used instead. -type Context interface { - log.Logger - amutex.Sleeper - context.Context - - // UninterruptibleSleepStart indicates the beginning of an uninterruptible - // sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate - // is true and the Context represents a Task, the Task's AddressSpace is - // deactivated. - UninterruptibleSleepStart(deactivate bool) - - // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep - // state that was begun by a previous call to UninterruptibleSleepStart. If - // activate is true and the Context represents a Task, the Task's - // AddressSpace is activated. Normally activate is the same value as the - // deactivate parameter passed to UninterruptibleSleepStart. - UninterruptibleSleepFinish(activate bool) -} - -// NoopSleeper is a noop implementation of amutex.Sleeper and UninterruptibleSleep -// methods for anonymous embedding in other types that do not implement sleeps. -type NoopSleeper struct { - amutex.NoopSleeper -} - -// UninterruptibleSleepStart does nothing. -func (NoopSleeper) UninterruptibleSleepStart(bool) {} - -// UninterruptibleSleepFinish does nothing. -func (NoopSleeper) UninterruptibleSleepFinish(bool) {} - -// Deadline returns zero values, meaning no deadline. -func (NoopSleeper) Deadline() (time.Time, bool) { - return time.Time{}, false -} - -// Done returns nil. -func (NoopSleeper) Done() <-chan struct{} { - return nil -} - -// Err returns nil. -func (NoopSleeper) Err() error { - return nil -} - -// logContext implements basic logging. -type logContext struct { - log.Logger - NoopSleeper -} - -// Value implements Context.Value. -func (logContext) Value(key interface{}) interface{} { - return nil -} - -// bgContext is the context returned by context.Background. -var bgContext = &logContext{Logger: log.Log()} - -// Background returns an empty context using the default logger. -// -// Users should be wary of using a Background context. Please tag any use with -// FIXME(b/38173783) and a note to remove this use. -// -// Generally, one should use the Task as their context when available, or avoid -// having to use a context in places where a Task is unavailable. -// -// Using a Background context for tests is fine, as long as no values are -// needed from the context in the tested code paths. -func Background() Context { - return bgContext -} diff --git a/pkg/sentry/context/contexttest/BUILD b/pkg/sentry/context/contexttest/BUILD deleted file mode 100644 index f91a6d4ed..000000000 --- a/pkg/sentry/context/contexttest/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "contexttest", - testonly = 1, - srcs = ["contexttest.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/memutil", - "//pkg/sentry/context", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/time", - "//pkg/sentry/limits", - "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", - "//pkg/sentry/platform/ptrace", - "//pkg/sentry/uniqueid", - ], -) diff --git a/pkg/sentry/context/contexttest/contexttest.go b/pkg/sentry/context/contexttest/contexttest.go deleted file mode 100644 index 15cf086a9..000000000 --- a/pkg/sentry/context/contexttest/contexttest.go +++ /dev/null @@ -1,188 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package contexttest builds a test context.Context. -package contexttest - -import ( - "os" - "sync/atomic" - "testing" - "time" - - "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" - "gvisor.dev/gvisor/pkg/sentry/uniqueid" -) - -// Context returns a Context that may be used in tests. Uses ptrace as the -// platform.Platform. -// -// Note that some filesystems may require a minimal kernel for testing, which -// this test context does not provide. For such tests, see kernel/contexttest. -func Context(tb testing.TB) context.Context { - const memfileName = "contexttest-memory" - memfd, err := memutil.CreateMemFD(memfileName, 0) - if err != nil { - tb.Fatalf("error creating application memory file: %v", err) - } - memfile := os.NewFile(uintptr(memfd), memfileName) - mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) - if err != nil { - memfile.Close() - tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) - } - p, err := ptrace.New() - if err != nil { - tb.Fatal(err) - } - // Test usage of context.Background is fine. - return &TestContext{ - Context: context.Background(), - l: limits.NewLimitSet(), - mf: mf, - platform: p, - creds: auth.NewAnonymousCredentials(), - otherValues: make(map[interface{}]interface{}), - } -} - -// TestContext represents a context with minimal functionality suitable for -// running tests. -type TestContext struct { - context.Context - l *limits.LimitSet - mf *pgalloc.MemoryFile - platform platform.Platform - creds *auth.Credentials - otherValues map[interface{}]interface{} -} - -// globalUniqueID tracks incremental unique identifiers for tests. -var globalUniqueID uint64 - -// globalUniqueIDProvider implements unix.UniqueIDProvider. -type globalUniqueIDProvider struct{} - -// UniqueID implements unix.UniqueIDProvider.UniqueID. -func (*globalUniqueIDProvider) UniqueID() uint64 { - return atomic.AddUint64(&globalUniqueID, 1) -} - -// lastInotifyCookie is a monotonically increasing counter for generating unique -// inotify cookies. Must be accessed using atomic ops. -var lastInotifyCookie uint32 - -// hostClock implements ktime.Clock. -type hostClock struct { - ktime.WallRateClock - ktime.NoClockEvents -} - -// Now implements ktime.Clock.Now. -func (hostClock) Now() ktime.Time { - return ktime.FromNanoseconds(time.Now().UnixNano()) -} - -// RegisterValue registers additional values with this test context. Useful for -// providing values from external packages that contexttest can't depend on. -func (t *TestContext) RegisterValue(key, value interface{}) { - t.otherValues[key] = value -} - -// Value implements context.Context. -func (t *TestContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return t.creds - case limits.CtxLimits: - return t.l - case pgalloc.CtxMemoryFile: - return t.mf - case pgalloc.CtxMemoryFileProvider: - return t - case platform.CtxPlatform: - return t.platform - case uniqueid.CtxGlobalUniqueID: - return (*globalUniqueIDProvider).UniqueID(nil) - case uniqueid.CtxGlobalUniqueIDProvider: - return &globalUniqueIDProvider{} - case uniqueid.CtxInotifyCookie: - return atomic.AddUint32(&lastInotifyCookie, 1) - case ktime.CtxRealtimeClock: - return hostClock{} - default: - if val, ok := t.otherValues[key]; ok { - return val - } - return t.Context.Value(key) - } -} - -// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. -func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { - return t.mf -} - -// RootContext returns a Context that may be used in tests that need root -// credentials. Uses ptrace as the platform.Platform. -func RootContext(tb testing.TB) context.Context { - return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) -} - -// WithCreds returns a copy of ctx carrying creds. -func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { - return &authContext{ctx, creds} -} - -type authContext struct { - context.Context - creds *auth.Credentials -} - -// Value implements context.Context. -func (ac *authContext) Value(key interface{}) interface{} { - switch key { - case auth.CtxCredentials: - return ac.creds - default: - return ac.Context.Value(key) - } -} - -// WithLimitSet returns a copy of ctx carrying l. -func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context { - return limitContext{ctx, l} -} - -type limitContext struct { - context.Context - l *limits.LimitSet -} - -// Value implements context.Context. -func (lc limitContext) Value(key interface{}) interface{} { - switch key { - case limits.CtxLimits: - return lc.l - default: - return lc.Context.Value(key) - } -} diff --git a/pkg/sentry/contexttest/BUILD b/pkg/sentry/contexttest/BUILD new file mode 100644 index 000000000..6f4c86684 --- /dev/null +++ b/pkg/sentry/contexttest/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "contexttest", + testonly = 1, + srcs = ["contexttest.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/memutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/platform/ptrace", + "//pkg/sentry/uniqueid", + ], +) diff --git a/pkg/sentry/contexttest/contexttest.go b/pkg/sentry/contexttest/contexttest.go new file mode 100644 index 000000000..031fc64ec --- /dev/null +++ b/pkg/sentry/contexttest/contexttest.go @@ -0,0 +1,188 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package contexttest builds a test context.Context. +package contexttest + +import ( + "os" + "sync/atomic" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/platform/ptrace" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" +) + +// Context returns a Context that may be used in tests. Uses ptrace as the +// platform.Platform. +// +// Note that some filesystems may require a minimal kernel for testing, which +// this test context does not provide. For such tests, see kernel/contexttest. +func Context(tb testing.TB) context.Context { + const memfileName = "contexttest-memory" + memfd, err := memutil.CreateMemFD(memfileName, 0) + if err != nil { + tb.Fatalf("error creating application memory file: %v", err) + } + memfile := os.NewFile(uintptr(memfd), memfileName) + mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{}) + if err != nil { + memfile.Close() + tb.Fatalf("error creating pgalloc.MemoryFile: %v", err) + } + p, err := ptrace.New() + if err != nil { + tb.Fatal(err) + } + // Test usage of context.Background is fine. + return &TestContext{ + Context: context.Background(), + l: limits.NewLimitSet(), + mf: mf, + platform: p, + creds: auth.NewAnonymousCredentials(), + otherValues: make(map[interface{}]interface{}), + } +} + +// TestContext represents a context with minimal functionality suitable for +// running tests. +type TestContext struct { + context.Context + l *limits.LimitSet + mf *pgalloc.MemoryFile + platform platform.Platform + creds *auth.Credentials + otherValues map[interface{}]interface{} +} + +// globalUniqueID tracks incremental unique identifiers for tests. +var globalUniqueID uint64 + +// globalUniqueIDProvider implements unix.UniqueIDProvider. +type globalUniqueIDProvider struct{} + +// UniqueID implements unix.UniqueIDProvider.UniqueID. +func (*globalUniqueIDProvider) UniqueID() uint64 { + return atomic.AddUint64(&globalUniqueID, 1) +} + +// lastInotifyCookie is a monotonically increasing counter for generating unique +// inotify cookies. Must be accessed using atomic ops. +var lastInotifyCookie uint32 + +// hostClock implements ktime.Clock. +type hostClock struct { + ktime.WallRateClock + ktime.NoClockEvents +} + +// Now implements ktime.Clock.Now. +func (hostClock) Now() ktime.Time { + return ktime.FromNanoseconds(time.Now().UnixNano()) +} + +// RegisterValue registers additional values with this test context. Useful for +// providing values from external packages that contexttest can't depend on. +func (t *TestContext) RegisterValue(key, value interface{}) { + t.otherValues[key] = value +} + +// Value implements context.Context. +func (t *TestContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return t.creds + case limits.CtxLimits: + return t.l + case pgalloc.CtxMemoryFile: + return t.mf + case pgalloc.CtxMemoryFileProvider: + return t + case platform.CtxPlatform: + return t.platform + case uniqueid.CtxGlobalUniqueID: + return (*globalUniqueIDProvider).UniqueID(nil) + case uniqueid.CtxGlobalUniqueIDProvider: + return &globalUniqueIDProvider{} + case uniqueid.CtxInotifyCookie: + return atomic.AddUint32(&lastInotifyCookie, 1) + case ktime.CtxRealtimeClock: + return hostClock{} + default: + if val, ok := t.otherValues[key]; ok { + return val + } + return t.Context.Value(key) + } +} + +// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. +func (t *TestContext) MemoryFile() *pgalloc.MemoryFile { + return t.mf +} + +// RootContext returns a Context that may be used in tests that need root +// credentials. Uses ptrace as the platform.Platform. +func RootContext(tb testing.TB) context.Context { + return WithCreds(Context(tb), auth.NewRootCredentials(auth.NewRootUserNamespace())) +} + +// WithCreds returns a copy of ctx carrying creds. +func WithCreds(ctx context.Context, creds *auth.Credentials) context.Context { + return &authContext{ctx, creds} +} + +type authContext struct { + context.Context + creds *auth.Credentials +} + +// Value implements context.Context. +func (ac *authContext) Value(key interface{}) interface{} { + switch key { + case auth.CtxCredentials: + return ac.creds + default: + return ac.Context.Value(key) + } +} + +// WithLimitSet returns a copy of ctx carrying l. +func WithLimitSet(ctx context.Context, l *limits.LimitSet) context.Context { + return limitContext{ctx, l} +} + +type limitContext struct { + context.Context + l *limits.LimitSet +} + +// Value implements context.Context. +func (lc limitContext) Value(key interface{}) interface{} { + switch key { + case limits.CtxLimits: + return lc.l + default: + return lc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 605d61dbe..ea85ab33c 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -47,13 +47,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", "//pkg/log", "//pkg/metric", "//pkg/p9", "//pkg/refs", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", @@ -64,10 +64,10 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -107,14 +107,14 @@ go_test( ], deps = [ ":fs", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/contexttest", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -129,7 +129,7 @@ go_test( ], library = ":fs", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", ], ) diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD index c14e5405e..aedcecfa1 100644 --- a/pkg/sentry/fs/anon/BUILD +++ b/pkg/sentry/fs/anon/BUILD @@ -11,10 +11,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go index 7323c7222..5c421f5fb 100644 --- a/pkg/sentry/fs/anon/anon.go +++ b/pkg/sentry/fs/anon/anon.go @@ -18,10 +18,10 @@ package anon import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NewInode constructs an anonymous Inode that is not associated diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go index 4f3d6410e..fa9e7d517 100644 --- a/pkg/sentry/fs/attr.go +++ b/pkg/sentry/fs/attr.go @@ -20,8 +20,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index dd427de5d..0fbd60056 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -16,7 +16,7 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index e03e3e417..f6c79e51b 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyUp copies a file in an overlay from a lower filesystem to an diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 738580c5f..91792d9fe 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD index 0c7247bd7..4c4b7d5cc 100644 --- a/pkg/sentry/fs/dev/BUILD +++ b/pkg/sentry/fs/dev/BUILD @@ -16,8 +16,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/rand", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -26,9 +27,8 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go index f739c476c..35bd23991 100644 --- a/pkg/sentry/fs/dev/dev.go +++ b/pkg/sentry/fs/dev/dev.go @@ -18,11 +18,11 @@ package dev import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Memory device numbers are from Linux's drivers/char/mem.c diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go index 55f8af704..5e518fb63 100644 --- a/pkg/sentry/fs/dev/fs.go +++ b/pkg/sentry/fs/dev/fs.go @@ -15,7 +15,7 @@ package dev import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go index 07e0ea010..deb9c6ad8 100644 --- a/pkg/sentry/fs/dev/full.go +++ b/pkg/sentry/fs/dev/full.go @@ -16,11 +16,11 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go index 4404b97ef..aec33d0d9 100644 --- a/pkg/sentry/fs/dev/null.go +++ b/pkg/sentry/fs/dev/null.go @@ -16,7 +16,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go index 49cb92f6e..2a9bbeb18 100644 --- a/pkg/sentry/fs/dev/random.go +++ b/pkg/sentry/fs/dev/random.go @@ -16,12 +16,12 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/rand" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/dev/tty.go b/pkg/sentry/fs/dev/tty.go index 87d80e292..760ca563d 100644 --- a/pkg/sentry/fs/dev/tty.go +++ b/pkg/sentry/fs/dev/tty.go @@ -16,7 +16,7 @@ package dev import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 31fc4d87b..acab0411a 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -22,8 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/uniqueid" diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 47bc72a88..98d69c6f2 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -18,8 +18,8 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ) func newMockDirInode(ctx context.Context, cache *DirentCache) *Inode { diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD index 25ef96299..1d09e983c 100644 --- a/pkg/sentry/fs/fdpipe/BUILD +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -12,17 +12,17 @@ go_library( imports = ["gvisor.dev/gvisor/pkg/sentry/fs"], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/log", + "//pkg/safemem", "//pkg/secio", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -36,13 +36,13 @@ go_test( ], library = ":fdpipe", deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "@com_github_google_uuid//:go_default_library", ], ) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 5b6cfeb0a..9fce177ad 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -19,17 +19,17 @@ import ( "os" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go index 64b558975..0c3595998 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -20,8 +20,8 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index 577445148..e556da48a 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -26,12 +26,12 @@ import ( "github.com/google/uuid" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type hostOpener struct { diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go index cee87f726..af8230a7d 100644 --- a/pkg/sentry/fs/fdpipe/pipe_state.go +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -18,7 +18,7 @@ import ( "fmt" "io/ioutil" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 69abc1e71..5aff0cc95 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -23,10 +23,10 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func singlePipeFD() (int, error) { diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 7c4586296..ca3466f4f 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -20,16 +20,16 @@ import ( "time" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index b88303f17..beba0f771 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -17,10 +17,10 @@ package fs import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index 8991207b4..dcc1df38f 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -17,13 +17,13 @@ package fs import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go index 2fb824d5c..02538bb4f 100644 --- a/pkg/sentry/fs/file_overlay_test.go +++ b/pkg/sentry/fs/file_overlay_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index c5b51620a..084da2a8d 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -19,7 +19,7 @@ import ( "sort" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD index 9a7608cae..a8000e010 100644 --- a/pkg/sentry/fs/filetest/BUILD +++ b/pkg/sentry/fs/filetest/BUILD @@ -8,12 +8,12 @@ go_library( srcs = ["filetest.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go index 22270a494..8049538f2 100644 --- a/pkg/sentry/fs/filetest/filetest.go +++ b/pkg/sentry/fs/filetest/filetest.go @@ -19,12 +19,12 @@ import ( "fmt" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go index 26abf49e2..bdba6efe5 100644 --- a/pkg/sentry/fs/fs.go +++ b/pkg/sentry/fs/fs.go @@ -54,8 +54,8 @@ package fs import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 9142f5bdf..4ab2a384f 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -77,22 +77,22 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -106,13 +106,13 @@ go_test( ], library = ":fsutil", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index 12132680b..c6cd45087 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -17,11 +17,11 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go index 75575d994..e3579c23c 100644 --- a/pkg/sentry/fs/fsutil/dirty_set_test.go +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -19,7 +19,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestDirtySet(t *testing.T) { diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index fc5b3b1a1..08695391c 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -17,12 +17,12 @@ package fsutil import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index f52d712e3..5643cdac9 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -19,13 +19,13 @@ import ( "io" "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 837fc70b5..67278aa86 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -19,11 +19,11 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // HostFileMapper caches mappings of an arbitrary host file descriptor. It is diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go index ad11a0573..2d4778d64 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -17,7 +17,7 @@ package fsutil import ( "unsafe" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/safemem" ) func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index a625f0e26..78fec553e 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -17,13 +17,13 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // HostMappable implements memmap.Mappable and platform.File over a diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go index df7b74855..252830572 100644 --- a/pkg/sentry/fs/fsutil/inode.go +++ b/pkg/sentry/fs/fsutil/inode.go @@ -16,7 +16,7 @@ package fsutil import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 20a014402..573b8586e 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -18,18 +18,18 @@ import ( "fmt" "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/time" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // Lock order (compare the lock order model in mm/mm.go): diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go index 129f314c8..1547584c5 100644 --- a/pkg/sentry/fs/fsutil/inode_cached_test.go +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -19,14 +19,14 @@ import ( "io" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type noopBackingFile struct{} diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD index cf48e7c03..971d3718e 100644 --- a/pkg/sentry/fs/gofer/BUILD +++ b/pkg/sentry/fs/gofer/BUILD @@ -24,13 +24,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", "//pkg/log", "//pkg/metric", "//pkg/p9", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fdpipe", @@ -39,13 +40,12 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -56,10 +56,10 @@ go_test( srcs = ["gofer_test.go"], library = ":gofer", deps = [ + "//pkg/context", "//pkg/p9", "//pkg/p9/p9test", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", ], ) diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 4848e2374..71cccdc34 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -17,12 +17,12 @@ package gofer import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // getattr returns the 9p attributes of the p9.File. On success, Mode, Size, and RDev diff --git a/pkg/sentry/fs/gofer/cache_policy.go b/pkg/sentry/fs/gofer/cache_policy.go index cc11c6339..ebea03c42 100644 --- a/pkg/sentry/fs/gofer/cache_policy.go +++ b/pkg/sentry/fs/gofer/cache_policy.go @@ -17,7 +17,7 @@ package gofer import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go index 2125dafef..3da818aed 100644 --- a/pkg/sentry/fs/gofer/context_file.go +++ b/pkg/sentry/fs/gofer/context_file.go @@ -15,9 +15,9 @@ package gofer import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextFile is a wrapper around p9.File that notifies the context that diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index 7960b9c7b..23296f246 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -19,16 +19,16 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go index bb8312849..ff96b28ba 100644 --- a/pkg/sentry/fs/gofer/file_state.go +++ b/pkg/sentry/fs/gofer/file_state.go @@ -17,7 +17,7 @@ package gofer import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index cf96dd9fa..9d41fcbdb 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -20,8 +20,8 @@ import ( "fmt" "strconv" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 7fc3c32ae..0c2f89ae8 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -20,10 +20,10 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/p9/p9test" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index b86c49b39..9f7c3e89f 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -17,14 +17,14 @@ package gofer import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/safemem" ) // handles are the open handles of a gofer file. They are reference counted to diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 98d1a8a48..ac28174d2 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -19,17 +19,17 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fdpipe" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go index 0b2eedb7c..238f7804c 100644 --- a/pkg/sentry/fs/gofer/inode_state.go +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -20,8 +20,8 @@ import ( "path/filepath" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/time" diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index c09f3b71c..0c1be05ef 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -18,9 +18,9 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index edc796ce0..498c4645a 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -17,9 +17,9 @@ package gofer import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index d045e04ff..0285c5361 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -17,8 +17,8 @@ package gofer import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/unet" ) diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index a45a8f36c..376cfce2c 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -16,9 +16,9 @@ package gofer import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go index 848e6812b..2d8d3a2ea 100644 --- a/pkg/sentry/fs/gofer/util.go +++ b/pkg/sentry/fs/gofer/util.go @@ -17,8 +17,8 @@ package gofer import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index f586f47c1..21003ea45 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -27,13 +27,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", "//pkg/log", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -41,18 +42,17 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", "//pkg/unet", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -69,17 +69,17 @@ go_test( ], library = ":host", deps = [ + "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 5532ff5a0..1658979fc 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -17,7 +17,7 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index f6c626f2c..e08f56d04 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -18,17 +18,17 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go index 68d2697c0..d3e8e3a36 100644 --- a/pkg/sentry/fs/host/fs.go +++ b/pkg/sentry/fs/host/fs.go @@ -23,8 +23,8 @@ import ( "strconv" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go index c6852ee30..3111d2df9 100644 --- a/pkg/sentry/fs/host/fs_test.go +++ b/pkg/sentry/fs/host/fs_test.go @@ -23,8 +23,8 @@ import ( "sort" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go index 873a1c52d..6fa39caab 100644 --- a/pkg/sentry/fs/host/inode.go +++ b/pkg/sentry/fs/host/inode.go @@ -18,14 +18,14 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/secio" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go index b267ec305..299e0e0b0 100644 --- a/pkg/sentry/fs/host/inode_state.go +++ b/pkg/sentry/fs/host/inode_state.go @@ -18,7 +18,7 @@ import ( "fmt" "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 2d959f10d..7221bc825 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -21,7 +21,7 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index c076d5bdd..06fc2d80a 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -19,11 +19,11 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/socket/control" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index 68b38fd1c..eb4afe520 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -21,13 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 753ef8cd6..3f218b4a7 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -16,14 +16,14 @@ package host import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // TTYFileOperations implements fs.FileOperations for a host file descriptor diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index 88d24d693..d49c3a635 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -19,7 +19,7 @@ import ( "testing" "time" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index e4cf5a570..b66c091ab 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -16,10 +16,10 @@ package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go index 13261cb81..70f2eae96 100644 --- a/pkg/sentry/fs/inode_operations.go +++ b/pkg/sentry/fs/inode_operations.go @@ -17,7 +17,7 @@ package fs import ( "errors" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index c477de837..4729b4aac 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -19,8 +19,8 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 493d98c36..389c219d6 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -17,7 +17,7 @@ package fs_test import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index cc7dd1c92..928c90aa0 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -19,13 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/uniqueid" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go index 9f70a3e82..686e1b1cd 100644 --- a/pkg/sentry/fs/inotify_event.go +++ b/pkg/sentry/fs/inotify_event.go @@ -18,8 +18,8 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/usermem" ) // inotifyEventBaseSize is the base size of linux's struct inotify_event. This diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go index 7a24c6f1b..1d6ea5736 100644 --- a/pkg/sentry/fs/mock.go +++ b/pkg/sentry/fs/mock.go @@ -15,7 +15,7 @@ package fs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 7a9692800..37bae6810 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -19,8 +19,8 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" ) // DirentOperations provide file systems greater control over how long a Dirent diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 299712cd7..78e35b1e6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -15,7 +15,7 @@ package fs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // overlayMountSourceOperations implements MountSourceOperations for an overlay diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index 0b84732aa..e672a438c 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,7 +18,7 @@ import ( "fmt" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" ) // cacheReallyContains iterates through the dirent cache to determine whether diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index a9627a9d1..574a2cc91 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -22,9 +22,9 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index c4c771f2c..a69b41468 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -17,7 +17,7 @@ package fs_test import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go index f7d844ce7..53b5df175 100644 --- a/pkg/sentry/fs/offset.go +++ b/pkg/sentry/fs/offset.go @@ -17,7 +17,7 @@ package fs import ( "math" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // OffsetPageEnd returns the file offset rounded up to the nearest diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index f7702f8f4..a8ae7d81d 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -18,12 +18,12 @@ import ( "fmt" "strings" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // The virtual filesystem implements an overlay configuration. For a high-level diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index b06bead41..280093c5e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -29,8 +29,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", @@ -46,10 +46,10 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -64,8 +64,8 @@ go_test( library = ":proc", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/inet", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/proc/cgroup.go b/pkg/sentry/fs/proc/cgroup.go index c4abe319d..7c1d9e7e9 100644 --- a/pkg/sentry/fs/proc/cgroup.go +++ b/pkg/sentry/fs/proc/cgroup.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go index df0c4e3a7..c96533401 100644 --- a/pkg/sentry/fs/proc/cpuinfo.go +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -17,7 +17,7 @@ package proc import ( "bytes" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go index 9aaeb780b..8fe626e1c 100644 --- a/pkg/sentry/fs/proc/exec_args.go +++ b/pkg/sentry/fs/proc/exec_args.go @@ -20,12 +20,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 2fa3cfa7d..35972e23c 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -19,7 +19,7 @@ import ( "sort" "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go index 7b3b974ab..0a58ac34c 100644 --- a/pkg/sentry/fs/proc/filesystems.go +++ b/pkg/sentry/fs/proc/filesystems.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go index 761d24462..daf1ba781 100644 --- a/pkg/sentry/fs/proc/fs.go +++ b/pkg/sentry/fs/proc/fs.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/proc/inode.go b/pkg/sentry/fs/proc/inode.go index 723f6b661..d2859a4c2 100644 --- a/pkg/sentry/fs/proc/inode.go +++ b/pkg/sentry/fs/proc/inode.go @@ -16,14 +16,14 @@ package proc import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go index d7d2afcb7..139d49c34 100644 --- a/pkg/sentry/fs/proc/loadavg.go +++ b/pkg/sentry/fs/proc/loadavg.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" ) diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 313c6a32b..465b47da9 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -18,11 +18,11 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index d4efc86e0..c10888100 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -20,7 +20,7 @@ import ( "sort" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bad445f3f..6f2775344 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" @@ -33,9 +33,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" ) // LINT.IfChange diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index 29867dc3a..c8abb5052 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -20,7 +20,7 @@ import ( "sort" "strconv" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD index 310d8dd52..21338d912 100644 --- a/pkg/sentry/fs/proc/seqfile/BUILD +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -8,14 +8,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/proc/device", "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -26,10 +26,10 @@ go_test( srcs = ["seqfile_test.go"], library = ":seqfile", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/ramfs", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go b/pkg/sentry/fs/proc/seqfile/seqfile.go index f9af191d5..6121f0e95 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -19,14 +19,14 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go index ebfeee835..98e394569 100644 --- a/pkg/sentry/fs/proc/seqfile/seqfile_test.go +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -20,11 +20,11 @@ import ( "io" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type seqTest struct { diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go index bc5b2bc7b..d4fbd76ac 100644 --- a/pkg/sentry/fs/proc/stat.go +++ b/pkg/sentry/fs/proc/stat.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go index 2bdcf5f70..f8aad2dbd 100644 --- a/pkg/sentry/fs/proc/sys.go +++ b/pkg/sentry/fs/proc/sys.go @@ -20,13 +20,13 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index b9e8ef35f..0772d4ae4 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -19,14 +19,14 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go index 6abae7a60..355e83d47 100644 --- a/pkg/sentry/fs/proc/sys_net_test.go +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -17,9 +17,9 @@ package proc import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestQuerySendBufferSize(t *testing.T) { diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 7358d6ef9..ca020e11e 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -22,7 +22,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" @@ -32,8 +32,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go index 3eacc9265..8d9517b95 100644 --- a/pkg/sentry/fs/proc/uid_gid_map.go +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -20,13 +20,13 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go index adfe58adb..c0f6fb802 100644 --- a/pkg/sentry/fs/proc/uptime.go +++ b/pkg/sentry/fs/proc/uptime.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go index 27fd5b1cb..35e258ff6 100644 --- a/pkg/sentry/fs/proc/version.go +++ b/pkg/sentry/fs/proc/version.go @@ -17,7 +17,7 @@ package proc import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/kernel" ) diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD index 39c4b84f8..8ca823fb3 100644 --- a/pkg/sentry/fs/ramfs/BUILD +++ b/pkg/sentry/fs/ramfs/BUILD @@ -13,14 +13,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -31,7 +31,7 @@ go_test( srcs = ["tree_test.go"], library = ":ramfs", deps = [ - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", ], ) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index dcbb8eb2e..bfa304552 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -20,7 +20,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go index a24fe2ea2..29ff004f2 100644 --- a/pkg/sentry/fs/ramfs/socket.go +++ b/pkg/sentry/fs/ramfs/socket.go @@ -16,7 +16,7 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go index fcfaa29aa..d988349aa 100644 --- a/pkg/sentry/fs/ramfs/symlink.go +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -16,7 +16,7 @@ package ramfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go index 702cc4a1e..dfc9d3453 100644 --- a/pkg/sentry/fs/ramfs/tree.go +++ b/pkg/sentry/fs/ramfs/tree.go @@ -19,10 +19,10 @@ import ( "path" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // MakeDirectoryTree constructs a ramfs tree of all directories containing diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index 61a7e2900..a6ed8b2c5 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -17,7 +17,7 @@ package ramfs import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/splice.go b/pkg/sentry/fs/splice.go index 389c330a0..791d1526c 100644 --- a/pkg/sentry/fs/splice.go +++ b/pkg/sentry/fs/splice.go @@ -18,7 +18,7 @@ import ( "io" "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD index cc6b3bfbf..f2e8b9932 100644 --- a/pkg/sentry/fs/sys/BUILD +++ b/pkg/sentry/fs/sys/BUILD @@ -13,12 +13,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/ramfs", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/sys/devices.go b/pkg/sentry/fs/sys/devices.go index 4f78ca8d2..b67065956 100644 --- a/pkg/sentry/fs/sys/devices.go +++ b/pkg/sentry/fs/sys/devices.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go index e60b63e75..fd03a4e38 100644 --- a/pkg/sentry/fs/sys/fs.go +++ b/pkg/sentry/fs/sys/fs.go @@ -15,7 +15,7 @@ package sys import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" ) diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go index b14bf3f55..0891645e4 100644 --- a/pkg/sentry/fs/sys/sys.go +++ b/pkg/sentry/fs/sys/sys.go @@ -16,10 +16,10 @@ package sys import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func newFile(ctx context.Context, node fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD index 092668e8d..d16cdb4df 100644 --- a/pkg/sentry/fs/timerfd/BUILD +++ b/pkg/sentry/fs/timerfd/BUILD @@ -7,13 +7,13 @@ go_library( srcs = ["timerfd.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel/time", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index f8bf663bb..88c344089 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -19,13 +19,13 @@ package timerfd import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD index 04776555f..aa7199014 100644 --- a/pkg/sentry/fs/tmpfs/BUILD +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -14,8 +14,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/metric", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -25,12 +26,11 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -41,10 +41,10 @@ go_test( srcs = ["file_test.go"], library = ":tmpfs", deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/usage", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go index 9a6943fe4..614f8f8a1 100644 --- a/pkg/sentry/fs/tmpfs/file_regular.go +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -15,11 +15,11 @@ package tmpfs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index 0075ef023..aaba35502 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -18,11 +18,11 @@ import ( "bytes" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func newFileInode(ctx context.Context) *fs.Inode { diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index be98ad751..d5be56c3f 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -19,7 +19,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go index f1c87fe41..dabc10662 100644 --- a/pkg/sentry/fs/tmpfs/inode_file.go +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -20,18 +20,18 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/metric" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go index 0f718e236..c00cef0a5 100644 --- a/pkg/sentry/fs/tmpfs/tmpfs.go +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -17,7 +17,7 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var fsInfo = fs.Info{ diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD index 29f804c6c..5cb0e0417 100644 --- a/pkg/sentry/fs/tty/BUILD +++ b/pkg/sentry/fs/tty/BUILD @@ -16,20 +16,20 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/safemem", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -41,7 +41,7 @@ go_test( library = ":tty", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context/contexttest", - "//pkg/sentry/usermem", + "//pkg/sentry/contexttest", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 88aa66b24..108654827 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -21,14 +21,14 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index edee56c12..8fe05ebe5 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -15,7 +15,7 @@ package tty import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 9fe02657e..12b1c6097 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -19,11 +19,11 @@ import ( "unicode/utf8" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index 6b07f6bf2..f62da49bd 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -16,13 +16,13 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 21ccc6f32..1ca79c0b2 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -16,12 +16,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 2a51e6bab..db55cdc48 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -16,12 +16,12 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 917f90cc0..5883f26db 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -16,11 +16,11 @@ package tty import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Terminal is a pseudoterminal. diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go index 59f07ff8e..2cbc05678 100644 --- a/pkg/sentry/fs/tty/tty_test.go +++ b/pkg/sentry/fs/tty/tty_test.go @@ -18,8 +18,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" ) func TestSimpleMasterToSlave(t *testing.T) { diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index a718920d5..6f78f478f 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -35,21 +35,21 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/safemem", "//pkg/sentry/syscalls/linux", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -73,14 +73,14 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", "//runsc/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD index 12f3990c1..6c5a559fd 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/BUILD +++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD @@ -7,9 +7,9 @@ go_test( size = "small", srcs = ["benchmark_test.go"], deps = [ + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/ext", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index a56b03711..d1436b943 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -24,9 +24,9 @@ import ( "strings" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 8944171c8..ebb72b75e 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -17,8 +17,8 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/memmap" diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 4b7d17dc6..373d23b74 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -21,9 +21,9 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 6c14a1e2d..05f992826 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -25,14 +25,14 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/runsc/testutil" ) diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 841274daf..92f7da40d 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -16,7 +16,7 @@ package ext import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 9afb1a84c..07bf58953 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -19,8 +19,8 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index d11153c90..30135ddb0 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -18,13 +18,13 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // regularFile represents a regular file's inode. This too follows the diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index bdf8705c1..1447a4dc1 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -15,11 +15,11 @@ package ext import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // symlink represents a symlink inode. diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 7bf83ccba..e73f1f857 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -29,16 +29,16 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/log", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -49,13 +49,13 @@ go_test( deps = [ ":kernfs", "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 75624e0b1..373f801ff 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -18,11 +18,11 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // DynamicBytesFile implements kernfs.Inode and represents a read-only diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5fa1fa67b..6104751c8 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -16,11 +16,11 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a4600ad47..9d65d0179 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -20,8 +20,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 1700fffd9..adca2313f 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -19,8 +19,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 85bcdcc57..79ebea8a5 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -56,8 +56,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index fade59491..ee65cf491 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -21,14 +21,14 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const defaultMode linux.FileMode = 01777 diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index f19f12854..0ee7eb9b7 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -16,7 +16,7 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 3768f55b2..12aac2e6a 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -16,8 +16,9 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", + "//pkg/safemem", "//pkg/sentry/fs", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", @@ -26,15 +27,14 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/mm", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip/header", + "//pkg/usermem", ], ) @@ -48,15 +48,15 @@ go_test( library = ":proc", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index f49819187..11477b6a9 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 91eded415..353e37195 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -19,7 +19,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index a0580f20d..eb5bc62c0 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 7bc352ae9..efd3b3453 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -20,17 +20,17 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // mm gets the kernel task's MemoryManager. No additional reference is taken on diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 51f634716..e0cb9c47b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -20,7 +20,7 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index ad3760e39..434998910 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -20,14 +20,14 @@ import ( "strconv" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type selfSymlink struct { diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 4aaf23e97..608fec017 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -22,8 +22,8 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -32,9 +32,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" ) func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index aabf2bf0c..ad963870b 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -19,7 +19,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go index 0a1d3f34b..be54897bb 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go @@ -20,7 +20,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/inet" ) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 2c1635f33..6fc3524db 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -22,14 +22,14 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index beda141f1..66c0d8bc8 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -9,7 +9,7 @@ go_library( ], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 1305ad01d..e35d52d17 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -20,7 +20,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index 12053a5b6..efd5974c4 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -12,10 +12,10 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -27,9 +27,9 @@ go_library( "//pkg/sentry/platform/kvm", "//pkg/sentry/platform/ptrace", "//pkg/sentry/time", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 295da2d52..89f8c4915 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -21,9 +21,9 @@ import ( "runtime" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 2a723a89f..1c98335c1 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -24,12 +24,12 @@ import ( "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // System represents the context for a single test. diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 857e98bc5..fb436860c 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -30,10 +30,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", @@ -43,12 +44,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -59,10 +59,10 @@ go_test( deps = [ ":tmpfs", "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/refs", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/tmpfs", "//pkg/sentry/kernel/auth", @@ -82,13 +82,13 @@ go_test( library = ":tmpfs", deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/contexttest", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index d88c83499..54241c8e8 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -21,10 +21,10 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 887ca2619..dc0d27cf9 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -16,7 +16,7 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index d726f03c5..5ee9cf1e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -19,8 +19,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 482aabd52..0c57fdca3 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,11 +16,11 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" ) type namedPipe struct { diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 70b42a6ec..5ee7f2a72 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -19,13 +19,13 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const fileName = "mypipe" diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 7c633c1b0..e9e6faf67 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -20,17 +20,17 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type regularFile struct { diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 034a29fdb..32552e261 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -22,12 +22,12 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" ) // nextFileID is used to generate unique file names. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 515f033f2..88dbd6e35 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -29,7 +29,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" diff --git a/pkg/sentry/hostmm/BUILD b/pkg/sentry/hostmm/BUILD index a145a5ca3..61c78569d 100644 --- a/pkg/sentry/hostmm/BUILD +++ b/pkg/sentry/hostmm/BUILD @@ -12,6 +12,6 @@ go_library( deps = [ "//pkg/fd", "//pkg/log", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/hostmm/hostmm.go b/pkg/sentry/hostmm/hostmm.go index 19335ca73..506c7864a 100644 --- a/pkg/sentry/hostmm/hostmm.go +++ b/pkg/sentry/hostmm/hostmm.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NotifyCurrentMemcgPressureCallback requests that f is called whenever the diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index aa621b724..334432abf 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -13,7 +13,7 @@ go_library( "test_stack.go", ], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/tcpip/stack", ], ) diff --git a/pkg/sentry/inet/context.go b/pkg/sentry/inet/context.go index 4eda7dd1f..e8cc1bffd 100644 --- a/pkg/sentry/inet/context.go +++ b/pkg/sentry/inet/context.go @@ -15,7 +15,7 @@ package inet import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the inet package's type for context.Context.Value keys. diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index cebaccd92..0738946d9 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -153,14 +153,15 @@ go_library( "//pkg/binary", "//pkg/bits", "//pkg/bpf", + "//pkg/context", "//pkg/cpuid", "//pkg/eventchannel", "//pkg/log", "//pkg/metric", "//pkg/refs", + "//pkg/safemem", "//pkg/secio", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/lock", @@ -180,7 +181,6 @@ go_library( "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/time", @@ -188,7 +188,6 @@ go_library( "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/state/statefile", "//pkg/sync", @@ -196,6 +195,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip", "//pkg/tcpip/stack", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -212,9 +212,9 @@ go_test( library = ":kernel", deps = [ "//pkg/abi", + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/fs/filetest", "//pkg/sentry/kernel/sched", @@ -222,8 +222,8 @@ go_test( "//pkg/sentry/pgalloc", "//pkg/sentry/time", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/auth/BUILD b/pkg/sentry/kernel/auth/BUILD index 64537c9be..2bc49483a 100644 --- a/pkg/sentry/kernel/auth/BUILD +++ b/pkg/sentry/kernel/auth/BUILD @@ -61,8 +61,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/bits", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sync", "//pkg/syserror", ], diff --git a/pkg/sentry/kernel/auth/context.go b/pkg/sentry/kernel/auth/context.go index 5c0e7d6b6..ef5723127 100644 --- a/pkg/sentry/kernel/auth/context.go +++ b/pkg/sentry/kernel/auth/context.go @@ -15,7 +15,7 @@ package auth import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. diff --git a/pkg/sentry/kernel/auth/id_map.go b/pkg/sentry/kernel/auth/id_map.go index 3d74bc610..28cbe159d 100644 --- a/pkg/sentry/kernel/auth/id_map.go +++ b/pkg/sentry/kernel/auth/id_map.go @@ -16,7 +16,7 @@ package auth import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index 3c9dceaba..0c40bf315 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -17,8 +17,8 @@ package kernel import ( "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextID is the kernel package's type for context.Context.Value keys. diff --git a/pkg/sentry/kernel/contexttest/BUILD b/pkg/sentry/kernel/contexttest/BUILD index daff608d7..9d26392c0 100644 --- a/pkg/sentry/kernel/contexttest/BUILD +++ b/pkg/sentry/kernel/contexttest/BUILD @@ -8,8 +8,8 @@ go_library( srcs = ["contexttest.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", diff --git a/pkg/sentry/kernel/contexttest/contexttest.go b/pkg/sentry/kernel/contexttest/contexttest.go index 82f9d8922..22c340e56 100644 --- a/pkg/sentry/kernel/contexttest/contexttest.go +++ b/pkg/sentry/kernel/contexttest/contexttest.go @@ -19,8 +19,8 @@ package contexttest import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" diff --git a/pkg/sentry/kernel/epoll/BUILD b/pkg/sentry/kernel/epoll/BUILD index 19e16ab3a..dedf0fa15 100644 --- a/pkg/sentry/kernel/epoll/BUILD +++ b/pkg/sentry/kernel/epoll/BUILD @@ -24,13 +24,13 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -43,7 +43,7 @@ go_test( ], library = ":epoll", deps = [ - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs/filetest", "//pkg/waiter", ], diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index e84742993..8bffb78fc 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -20,13 +20,13 @@ import ( "fmt" "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 4a20d4c82..22630e9c5 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -17,7 +17,7 @@ package epoll import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/BUILD b/pkg/sentry/kernel/eventfd/BUILD index ee2d74864..9983a32e5 100644 --- a/pkg/sentry/kernel/eventfd/BUILD +++ b/pkg/sentry/kernel/eventfd/BUILD @@ -8,14 +8,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fdnotifier", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -26,8 +26,8 @@ go_test( srcs = ["eventfd_test.go"], library = ":eventfd", deps = [ - "//pkg/sentry/context/contexttest", - "//pkg/sentry/usermem", + "//pkg/sentry/contexttest", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 687690679..87951adeb 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -21,14 +21,14 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/eventfd/eventfd_test.go b/pkg/sentry/kernel/eventfd/eventfd_test.go index 018c7f3ef..9b4892f74 100644 --- a/pkg/sentry/kernel/eventfd/eventfd_test.go +++ b/pkg/sentry/kernel/eventfd/eventfd_test.go @@ -17,8 +17,8 @@ package eventfd import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 0ad4135b3..9460bb235 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -22,8 +22,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/limits" diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 86164df49..261b815f2 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -18,8 +18,8 @@ import ( "runtime" "testing" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/filetest" "gvisor.dev/gvisor/pkg/sentry/limits" diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index f413d8ae2..c5021f2db 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -36,12 +36,12 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -51,7 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index d1931c8f4..732e66da4 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -20,9 +20,9 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // KeyKind indicates the type of a Key. diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index c23126ca5..7c5c7665b 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,8 +22,8 @@ import ( "testing" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // testData implements the Target interface, and allows us to diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c85e97fef..7b90fac5a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -40,12 +40,12 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 2c7b6206f..4c049d5b4 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -33,16 +33,16 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/amutex", + "//pkg/context", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/safemem", - "//pkg/sentry/usermem", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -57,11 +57,11 @@ go_test( ], library = ":pipe", deps = [ - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/pipe/buffer.go b/pkg/sentry/kernel/pipe/buffer.go index 1c0f34269..fe3be5dbd 100644 --- a/pkg/sentry/kernel/pipe/buffer.go +++ b/pkg/sentry/kernel/pipe/buffer.go @@ -17,7 +17,7 @@ package pipe import ( "io" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sync" ) diff --git a/pkg/sentry/kernel/pipe/buffer_test.go b/pkg/sentry/kernel/pipe/buffer_test.go index ee1b90115..4d54b8b8f 100644 --- a/pkg/sentry/kernel/pipe/buffer_test.go +++ b/pkg/sentry/kernel/pipe/buffer_test.go @@ -18,7 +18,7 @@ import ( "testing" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func TestBufferSize(t *testing.T) { diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 716f589af..4b688c627 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -16,7 +16,7 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sync" diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index 16fa80abe..ab75a87ff 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -18,11 +18,11 @@ import ( "testing" "time" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) type sleeper struct { diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index e4fd7d420..08410283f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -20,7 +20,7 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index e3a14b665..bda739dbe 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -18,9 +18,9 @@ import ( "bytes" "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index 8394eb78b..80158239e 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -21,10 +21,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/pipe/reader_writer.go b/pkg/sentry/kernel/pipe/reader_writer.go index b4d29fc77..b2b5691ee 100644 --- a/pkg/sentry/kernel/pipe/reader_writer.go +++ b/pkg/sentry/kernel/pipe/reader_writer.go @@ -17,11 +17,11 @@ package pipe import ( "io" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // ReaderWriter satisfies the FileOperations interface and services both diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 6f83e3cee..a5675bd70 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,12 +16,12 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 3be171cdc..35ad97d5d 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceOptions are the subset of options controlling a task's ptrace behavior diff --git a/pkg/sentry/kernel/ptrace_amd64.go b/pkg/sentry/kernel/ptrace_amd64.go index 5514cf432..cef1276ec 100644 --- a/pkg/sentry/kernel/ptrace_amd64.go +++ b/pkg/sentry/kernel/ptrace_amd64.go @@ -18,8 +18,8 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. diff --git a/pkg/sentry/kernel/ptrace_arm64.go b/pkg/sentry/kernel/ptrace_arm64.go index 61e412911..d971b96b3 100644 --- a/pkg/sentry/kernel/ptrace_arm64.go +++ b/pkg/sentry/kernel/ptrace_arm64.go @@ -17,8 +17,8 @@ package kernel import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ptraceArch implements arch-specific ptrace commands. diff --git a/pkg/sentry/kernel/rseq.go b/pkg/sentry/kernel/rseq.go index b14429854..efebfd872 100644 --- a/pkg/sentry/kernel/rseq.go +++ b/pkg/sentry/kernel/rseq.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/hostcpu" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Restartable sequences. diff --git a/pkg/sentry/kernel/seccomp.go b/pkg/sentry/kernel/seccomp.go index 2347dcf36..c38c5a40c 100644 --- a/pkg/sentry/kernel/seccomp.go +++ b/pkg/sentry/kernel/seccomp.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const maxSyscallFilterInstructions = 1 << 15 diff --git a/pkg/sentry/kernel/semaphore/BUILD b/pkg/sentry/kernel/semaphore/BUILD index 76e19b551..65e5427c1 100644 --- a/pkg/sentry/kernel/semaphore/BUILD +++ b/pkg/sentry/kernel/semaphore/BUILD @@ -24,8 +24,8 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -41,8 +41,8 @@ go_test( library = ":semaphore", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", "//pkg/syserror", ], diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go index 18299814e..1000f3287 100644 --- a/pkg/sentry/kernel/semaphore/semaphore.go +++ b/pkg/sentry/kernel/semaphore/semaphore.go @@ -19,8 +19,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" diff --git a/pkg/sentry/kernel/semaphore/semaphore_test.go b/pkg/sentry/kernel/semaphore/semaphore_test.go index c235f6ca4..e47acefdf 100644 --- a/pkg/sentry/kernel/semaphore/semaphore_test.go +++ b/pkg/sentry/kernel/semaphore/semaphore_test.go @@ -18,8 +18,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index 5547c5abf..bfd779837 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -11,9 +11,9 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/log", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", @@ -22,8 +22,8 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 8ddef7eb8..208569057 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -37,9 +37,9 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -47,9 +47,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Key represents a shm segment key. Analogous to a file name. diff --git a/pkg/sentry/kernel/signalfd/BUILD b/pkg/sentry/kernel/signalfd/BUILD index 5d44773d4..3eb78e91b 100644 --- a/pkg/sentry/kernel/signalfd/BUILD +++ b/pkg/sentry/kernel/signalfd/BUILD @@ -9,14 +9,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 28be4a939..8243bb93e 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -18,14 +18,14 @@ package signalfd import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/syscalls.go b/pkg/sentry/kernel/syscalls.go index d2d01add4..93c4fe969 100644 --- a/pkg/sentry/kernel/syscalls.go +++ b/pkg/sentry/kernel/syscalls.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // maxSyscallNum is the highest supported syscall number. diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 978d66da8..95adf2778 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -35,8 +35,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 247bd4aba..53d4d211b 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,8 +17,8 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SharingOptions controls what resources are shared by a new task created by diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index bb5560acf..2d6e7733c 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -18,13 +18,13 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/usermem" ) var errNoSyscalls = syserr.New("no syscall table found", linux.ENOEXEC) diff --git a/pkg/sentry/kernel/task_futex.go b/pkg/sentry/kernel/task_futex.go index c211b5b74..a53e77c9f 100644 --- a/pkg/sentry/kernel/task_futex.go +++ b/pkg/sentry/kernel/task_futex.go @@ -16,7 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Futex returns t's futex manager. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 0fb3661de..41259210c 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -20,7 +20,7 @@ import ( "sort" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/kernel/task_run.go b/pkg/sentry/kernel/task_run.go index 6357273d3..5568c91bc 100644 --- a/pkg/sentry/kernel/task_run.go +++ b/pkg/sentry/kernel/task_run.go @@ -26,7 +26,7 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // A taskRunState is a reified state in the task state machine. See README.md diff --git a/pkg/sentry/kernel/task_signals.go b/pkg/sentry/kernel/task_signals.go index 39cd1340d..8802db142 100644 --- a/pkg/sentry/kernel/task_signals.go +++ b/pkg/sentry/kernel/task_signals.go @@ -26,8 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 58af16ee2..de838beef 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // TaskConfig defines the configuration of a new Task (see below). diff --git a/pkg/sentry/kernel/task_syscall.go b/pkg/sentry/kernel/task_syscall.go index 3180f5560..d555d69a8 100644 --- a/pkg/sentry/kernel/task_syscall.go +++ b/pkg/sentry/kernel/task_syscall.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 518bfe1bd..2bf3ce8a8 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -18,8 +18,8 @@ import ( "math" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // MAX_RW_COUNT is the maximum size in bytes of a single read or write. diff --git a/pkg/sentry/kernel/time/BUILD b/pkg/sentry/kernel/time/BUILD index d49594d9f..7ba7dc50c 100644 --- a/pkg/sentry/kernel/time/BUILD +++ b/pkg/sentry/kernel/time/BUILD @@ -11,7 +11,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sync", "//pkg/syserror", "//pkg/waiter", diff --git a/pkg/sentry/kernel/time/context.go b/pkg/sentry/kernel/time/context.go index 8ef483dd3..00b729d88 100644 --- a/pkg/sentry/kernel/time/context.go +++ b/pkg/sentry/kernel/time/context.go @@ -15,7 +15,7 @@ package time import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the time package's type for context.Context.Value keys. diff --git a/pkg/sentry/kernel/timekeeper_test.go b/pkg/sentry/kernel/timekeeper_test.go index 849c5b646..cf2f7ca72 100644 --- a/pkg/sentry/kernel/timekeeper_test.go +++ b/pkg/sentry/kernel/timekeeper_test.go @@ -17,12 +17,12 @@ package kernel import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // mockClocks is a sentrytime.Clocks that simply returns the times in the diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index fdd10c56c..f1b3c212c 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -18,10 +18,10 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // vdsoParams are the parameters exposed to the VDSO. diff --git a/pkg/sentry/limits/BUILD b/pkg/sentry/limits/BUILD index 67869757f..cf591c4c1 100644 --- a/pkg/sentry/limits/BUILD +++ b/pkg/sentry/limits/BUILD @@ -12,7 +12,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sync", ], ) diff --git a/pkg/sentry/limits/context.go b/pkg/sentry/limits/context.go index 6972749ed..77e1fe217 100644 --- a/pkg/sentry/limits/context.go +++ b/pkg/sentry/limits/context.go @@ -15,7 +15,7 @@ package limits import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the limit package's type for context.Context.Value keys. diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index d4ad2bd6c..23790378a 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -24,11 +24,12 @@ go_library( "//pkg/abi", "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/cpuid", "//pkg/log", "//pkg/rand", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", @@ -37,12 +38,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/mm", "//pkg/sentry/pgalloc", - "//pkg/sentry/safemem", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 6299a3e2f..122ed05c2 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -23,16 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index ccf909cac..098a45d36 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -18,10 +18,10 @@ import ( "bytes" "io" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index b03eeb005..9a613d6b7 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -24,16 +24,16 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // LoadArgs holds specifications for an executable file to be loaded. diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index df8a81907..52f446ed7 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -20,20 +20,20 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index f9a65f086..a98b66de1 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -38,11 +38,11 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", "//pkg/sentry/platform", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -51,5 +51,5 @@ go_test( size = "small", srcs = ["mapping_set_test.go"], library = ":memmap", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/memmap/mapping_set.go b/pkg/sentry/memmap/mapping_set.go index 0a5b7ce45..d609c1ae0 100644 --- a/pkg/sentry/memmap/mapping_set.go +++ b/pkg/sentry/memmap/mapping_set.go @@ -18,7 +18,7 @@ import ( "fmt" "math" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // MappingSet maps offsets into a Mappable to mappings of those offsets. It is diff --git a/pkg/sentry/memmap/mapping_set_test.go b/pkg/sentry/memmap/mapping_set_test.go index f9b11a59c..d39efe38f 100644 --- a/pkg/sentry/memmap/mapping_set_test.go +++ b/pkg/sentry/memmap/mapping_set_test.go @@ -18,7 +18,7 @@ import ( "reflect" "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type testMappingSpace struct { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index 16a722a13..c6db9fc8f 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -18,9 +18,9 @@ package memmap import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index bd6399fa2..e5729ced5 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -27,7 +27,7 @@ go_template_instance( "minDegree": "8", }, imports = { - "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem", + "usermem": "gvisor.dev/gvisor/pkg/usermem", }, package = "mm", prefix = "vma", @@ -47,7 +47,7 @@ go_template_instance( "minDegree": "8", }, imports = { - "usermem": "gvisor.dev/gvisor/pkg/sentry/usermem", + "usermem": "gvisor.dev/gvisor/pkg/usermem", }, package = "mm", prefix = "pma", @@ -99,10 +99,12 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/safecopy", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/kernel/auth", @@ -112,13 +114,11 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", "//pkg/tcpip/buffer", + "//pkg/usermem", ], ) @@ -128,14 +128,14 @@ go_test( srcs = ["mm_test.go"], library = ":mm", deps = [ + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/mm/address_space.go b/pkg/sentry/mm/address_space.go index cfebcfd42..e58a63deb 100644 --- a/pkg/sentry/mm/address_space.go +++ b/pkg/sentry/mm/address_space.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // AddressSpace returns the platform.AddressSpace bound to mm. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 4b48866ad..cb29d94b0 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -16,15 +16,15 @@ package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // aioManager creates and manages asynchronous I/O contexts. diff --git a/pkg/sentry/mm/debug.go b/pkg/sentry/mm/debug.go index df9adf708..c273c982e 100644 --- a/pkg/sentry/mm/debug.go +++ b/pkg/sentry/mm/debug.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) const ( diff --git a/pkg/sentry/mm/io.go b/pkg/sentry/mm/io.go index b03e7d020..fa776f9c6 100644 --- a/pkg/sentry/mm/io.go +++ b/pkg/sentry/mm/io.go @@ -15,11 +15,11 @@ package mm import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // There are two supported ways to copy data to/from application virtual diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index 4e9ca1de6..47b8fbf43 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -19,13 +19,13 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // NewMemoryManager returns a new MemoryManager with no mappings and 1 user. diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index d2a01d48a..f550acae0 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -17,7 +17,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Dumpability describes if and how core dumps should be created. diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 78cc9e6e4..09e582dd3 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -35,14 +35,14 @@ package mm import ( + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // MemoryManager implements a virtual address space. diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go index 4d2bfaaed..edacca741 100644 --- a/pkg/sentry/mm/mm_test.go +++ b/pkg/sentry/mm/mm_test.go @@ -17,15 +17,15 @@ package mm import ( "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func testMemoryManager(ctx context.Context) *MemoryManager { diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index c976c6f45..62e4c20af 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -17,14 +17,14 @@ package mm import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // existingPMAsLocked checks that pmas exist for all addresses in ar, and diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 79610acb7..1ab92f046 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -19,10 +19,10 @@ import ( "fmt" "strings" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/mm/save_restore.go b/pkg/sentry/mm/save_restore.go index 93259c5a3..f56215d9a 100644 --- a/pkg/sentry/mm/save_restore.go +++ b/pkg/sentry/mm/save_restore.go @@ -17,7 +17,7 @@ package mm import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // InvalidateUnsavable invokes memmap.Mappable.InvalidateUnsavable on all diff --git a/pkg/sentry/mm/shm.go b/pkg/sentry/mm/shm.go index b9f2d23e5..6432731d4 100644 --- a/pkg/sentry/mm/shm.go +++ b/pkg/sentry/mm/shm.go @@ -15,10 +15,10 @@ package mm import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // DetachShm unmaps a sysv shared memory segment. diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index ea2d7af74..9ad52082d 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -15,14 +15,14 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index c2466c988..c5dfa5972 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -19,14 +19,14 @@ import ( mrand "math/rand" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // HandleUserFault handles an application page fault. sp is the faulting diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index f2fd70799..9a14e69e6 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -18,13 +18,13 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Preconditions: mm.mappingMu must be locked for writing. opts must be valid diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 02385a3ce..1eeb9f317 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -61,18 +61,18 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/memutil", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/hostmm", "//pkg/sentry/platform", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/state", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) @@ -81,5 +81,5 @@ go_test( size = "small", srcs = ["pgalloc_test.go"], library = ":pgalloc", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/pgalloc/context.go b/pkg/sentry/pgalloc/context.go index 11ccf897b..d25215418 100644 --- a/pkg/sentry/pgalloc/context.go +++ b/pkg/sentry/pgalloc/context.go @@ -15,7 +15,7 @@ package pgalloc import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is this package's type for context.Context.Value keys. diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index c99e023d9..577e9306a 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -29,15 +29,15 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // MemoryFile is a platform.File whose pages may be allocated to arbitrary diff --git a/pkg/sentry/pgalloc/pgalloc_test.go b/pkg/sentry/pgalloc/pgalloc_test.go index 428e6a859..293f22c6b 100644 --- a/pkg/sentry/pgalloc/pgalloc_test.go +++ b/pkg/sentry/pgalloc/pgalloc_test.go @@ -17,7 +17,7 @@ package pgalloc import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index aafce1d00..f8385c146 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/usage" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/state" + "gvisor.dev/gvisor/pkg/usermem" ) // SaveTo writes f's state to the given stream. diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 006450b2d..453241eca 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -26,14 +26,14 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/atomicbitops", + "//pkg/context", "//pkg/log", + "//pkg/safecopy", + "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/context", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/safemem", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/context.go b/pkg/sentry/platform/context.go index e29bc4485..6759cda65 100644 --- a/pkg/sentry/platform/context.go +++ b/pkg/sentry/platform/context.go @@ -15,7 +15,7 @@ package platform import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is the auth package's type for context.Context.Value keys. diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index a4532a766..159f7eafd 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -44,16 +44,16 @@ go_library( "//pkg/cpuid", "//pkg/log", "//pkg/procid", + "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/platform/safecopy", "//pkg/sentry/time", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) @@ -75,6 +75,6 @@ go_test( "//pkg/sentry/platform/kvm/testutil", "//pkg/sentry/platform/ring0", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index a25f3c449..be213bfe8 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // dirtySet tracks vCPUs for invalidation. diff --git a/pkg/sentry/platform/kvm/bluepill.go b/pkg/sentry/platform/kvm/bluepill.go index 30dbb74d6..35cd55fef 100644 --- a/pkg/sentry/platform/kvm/bluepill.go +++ b/pkg/sentry/platform/kvm/bluepill.go @@ -19,9 +19,9 @@ import ( "reflect" "syscall" + "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" ) // bluepill enters guest mode. diff --git a/pkg/sentry/platform/kvm/bluepill_fault.go b/pkg/sentry/platform/kvm/bluepill_fault.go index f6459cda9..e34f46aeb 100644 --- a/pkg/sentry/platform/kvm/bluepill_fault.go +++ b/pkg/sentry/platform/kvm/bluepill_fault.go @@ -18,7 +18,7 @@ import ( "sync/atomic" "syscall" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/platform/kvm/context.go b/pkg/sentry/platform/kvm/context.go index 99450d22d..c769ac7b4 100644 --- a/pkg/sentry/platform/kvm/context.go +++ b/pkg/sentry/platform/kvm/context.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // context is an implementation of the platform context. diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go index d337c5c7c..972ba85c3 100644 --- a/pkg/sentry/platform/kvm/kvm.go +++ b/pkg/sentry/platform/kvm/kvm.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // KVM represents a lightweight VM context. diff --git a/pkg/sentry/platform/kvm/kvm_test.go b/pkg/sentry/platform/kvm/kvm_test.go index 30df725d4..c42752d50 100644 --- a/pkg/sentry/platform/kvm/kvm_test.go +++ b/pkg/sentry/platform/kvm/kvm_test.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform/kvm/testutil" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var dummyFPState = (*byte)(arch.NewFloatingPointData()) diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go index e6d912168..8076c7529 100644 --- a/pkg/sentry/platform/kvm/machine.go +++ b/pkg/sentry/platform/kvm/machine.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // machine contains state associated with the VM as a whole. diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go index 873e39dc7..923ce3909 100644 --- a/pkg/sentry/platform/kvm/machine_amd64.go +++ b/pkg/sentry/platform/kvm/machine_amd64.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // initArchState initializes architecture-specific state. diff --git a/pkg/sentry/platform/kvm/machine_arm64.go b/pkg/sentry/platform/kvm/machine_arm64.go index 3b1f20219..09552837a 100644 --- a/pkg/sentry/platform/kvm/machine_arm64.go +++ b/pkg/sentry/platform/kvm/machine_arm64.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type vCPUArchState struct { diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go index 3f2f97a6b..1c8384e6b 100644 --- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go +++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // setMemoryRegion initializes a region. diff --git a/pkg/sentry/platform/kvm/physical_map.go b/pkg/sentry/platform/kvm/physical_map.go index 91de5dab1..f7fa2f98d 100644 --- a/pkg/sentry/platform/kvm/physical_map.go +++ b/pkg/sentry/platform/kvm/physical_map.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/platform/ring0" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type region struct { diff --git a/pkg/sentry/platform/kvm/virtual_map.go b/pkg/sentry/platform/kvm/virtual_map.go index 2d68855ef..c8897d34f 100644 --- a/pkg/sentry/platform/kvm/virtual_map.go +++ b/pkg/sentry/platform/kvm/virtual_map.go @@ -22,7 +22,7 @@ import ( "regexp" "strconv" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type virtualRegion struct { diff --git a/pkg/sentry/platform/kvm/virtual_map_test.go b/pkg/sentry/platform/kvm/virtual_map_test.go index 6a2f145be..327e2be4f 100644 --- a/pkg/sentry/platform/kvm/virtual_map_test.go +++ b/pkg/sentry/platform/kvm/virtual_map_test.go @@ -18,7 +18,7 @@ import ( "syscall" "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type checker struct { diff --git a/pkg/sentry/platform/mmap_min_addr.go b/pkg/sentry/platform/mmap_min_addr.go index 999787462..091c2e365 100644 --- a/pkg/sentry/platform/mmap_min_addr.go +++ b/pkg/sentry/platform/mmap_min_addr.go @@ -20,7 +20,7 @@ import ( "strconv" "strings" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // systemMMapMinAddrSource is the source file. diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index ec22dbf87..2ca696382 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,10 +22,10 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // Platform provides abstractions for execution contexts (Context, diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 3bcc5e040..95abd321e 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -25,14 +25,14 @@ go_library( "//pkg/abi/linux", "//pkg/log", "//pkg/procid", + "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/hostcpu", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go index bb0e03880..03adb624b 100644 --- a/pkg/sentry/platform/ptrace/ptrace.go +++ b/pkg/sentry/platform/ptrace/ptrace.go @@ -51,8 +51,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/interrupt" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go index 72c7ec564..6c0ed7b3e 100644 --- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go +++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // getRegs gets the general purpose register set. diff --git a/pkg/sentry/platform/ptrace/stub_unsafe.go b/pkg/sentry/platform/ptrace/stub_unsafe.go index aa1b87237..341dde143 100644 --- a/pkg/sentry/platform/ptrace/stub_unsafe.go +++ b/pkg/sentry/platform/ptrace/stub_unsafe.go @@ -19,8 +19,8 @@ import ( "syscall" "unsafe" - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/safecopy" + "gvisor.dev/gvisor/pkg/usermem" ) // stub is defined in arch-specific assembly. diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 15dc46a5b..31b7cec53 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -25,8 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/platform" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" ) // Linux kernel errnos which "should never be seen by user programs", but will diff --git a/pkg/sentry/platform/ring0/BUILD b/pkg/sentry/platform/ring0/BUILD index 6dee8fcc5..934b6fbcd 100644 --- a/pkg/sentry/platform/ring0/BUILD +++ b/pkg/sentry/platform/ring0/BUILD @@ -78,6 +78,6 @@ go_library( deps = [ "//pkg/cpuid", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go index 9dae0dccb..9c6c2cf5c 100644 --- a/pkg/sentry/platform/ring0/defs_amd64.go +++ b/pkg/sentry/platform/ring0/defs_amd64.go @@ -18,7 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go index a850ce6cf..1583dda12 100644 --- a/pkg/sentry/platform/ring0/defs_arm64.go +++ b/pkg/sentry/platform/ring0/defs_arm64.go @@ -18,7 +18,7 @@ package ring0 import ( "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) var ( diff --git a/pkg/sentry/platform/ring0/gen_offsets/BUILD b/pkg/sentry/platform/ring0/gen_offsets/BUILD index 147311ed3..4cae10459 100644 --- a/pkg/sentry/platform/ring0/gen_offsets/BUILD +++ b/pkg/sentry/platform/ring0/gen_offsets/BUILD @@ -28,6 +28,6 @@ go_binary( deps = [ "//pkg/cpuid", "//pkg/sentry/platform/ring0/pagetables", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/ring0/pagetables/BUILD b/pkg/sentry/platform/ring0/pagetables/BUILD index 8b5cdd6c1..971eed7fa 100644 --- a/pkg/sentry/platform/ring0/pagetables/BUILD +++ b/pkg/sentry/platform/ring0/pagetables/BUILD @@ -93,8 +93,8 @@ go_library( "//pkg/sentry/platform/ring0:__subpackages__", ], deps = [ - "//pkg/sentry/usermem", "//pkg/sync", + "//pkg/usermem", ], ) @@ -108,5 +108,5 @@ go_test( "walker_check.go", ], library = ":pagetables", - deps = ["//pkg/sentry/usermem"], + deps = ["//pkg/usermem"], ) diff --git a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go index a90394a33..d08bfdeb3 100644 --- a/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go +++ b/pkg/sentry/platform/ring0/pagetables/allocator_unsafe.go @@ -17,7 +17,7 @@ package pagetables import ( "unsafe" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // newAlignedPTEs returns a set of aligned PTEs. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go index 30c64a372..87e88e97d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go @@ -21,7 +21,7 @@ package pagetables import ( - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // PageTables is a set of page tables. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go index e78424766..78510ebed 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go @@ -19,7 +19,7 @@ package pagetables import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // archPageTables is architecture-specific data. diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go index 35e917526..54e8e554f 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64_test.go @@ -19,7 +19,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func Test2MAnd4K(t *testing.T) { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go index 254116233..2f73d424f 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64_test.go @@ -19,7 +19,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func Test2MAnd4K(t *testing.T) { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go index 6e95ad2b9..5c88d087d 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_test.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_test.go @@ -17,7 +17,7 @@ package pagetables import ( "testing" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type mapping struct { diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go index 3e2383c5e..dcf061df9 100644 --- a/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go +++ b/pkg/sentry/platform/ring0/pagetables/pagetables_x86.go @@ -19,7 +19,7 @@ package pagetables import ( "sync/atomic" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // archPageTables is architecture-specific data. diff --git a/pkg/sentry/platform/safecopy/BUILD b/pkg/sentry/platform/safecopy/BUILD deleted file mode 100644 index b8747585b..000000000 --- a/pkg/sentry/platform/safecopy/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "safecopy", - srcs = [ - "atomic_amd64.s", - "atomic_arm64.s", - "memclr_amd64.s", - "memclr_arm64.s", - "memcpy_amd64.s", - "memcpy_arm64.s", - "safecopy.go", - "safecopy_unsafe.go", - "sighandler_amd64.s", - "sighandler_arm64.s", - ], - visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/syserror"], -) - -go_test( - name = "safecopy_test", - srcs = [ - "safecopy_test.go", - ], - library = ":safecopy", -) diff --git a/pkg/sentry/platform/safecopy/LICENSE b/pkg/sentry/platform/safecopy/LICENSE deleted file mode 100644 index 6a66aea5e..000000000 --- a/pkg/sentry/platform/safecopy/LICENSE +++ /dev/null @@ -1,27 +0,0 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pkg/sentry/platform/safecopy/atomic_amd64.s b/pkg/sentry/platform/safecopy/atomic_amd64.s deleted file mode 100644 index a0cd78f33..000000000 --- a/pkg/sentry/platform/safecopy/atomic_amd64.s +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// handleSwapUint32Fault returns the value stored in DI. Control is transferred -// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as swapUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVL DI, sig+20(FP) - RET - -// swapUint32 atomically stores new into *addr and returns (the previous *addr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) -TEXT ·swapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint32Fault will store a different value in this address. - MOVL $0, sig+20(FP) - - MOVQ addr+0(FP), DI - MOVL new+8(FP), AX - XCHGL AX, 0(DI) - MOVL AX, old+16(FP) - RET - -// handleSwapUint64Fault returns the value stored in DI. Control is transferred -// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as swapUint64 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 - MOVL DI, sig+24(FP) - RET - -// swapUint64 atomically stores new into *addr and returns (the previous *addr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: addr must be aligned to a 8-byte boundary. -// -//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) -TEXT ·swapUint64(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint64Fault will store a different value in this address. - MOVL $0, sig+24(FP) - - MOVQ addr+0(FP), DI - MOVQ new+8(FP), AX - XCHGQ AX, 0(DI) - MOVQ AX, old+16(FP) - RET - -// handleCompareAndSwapUint32Fault returns the value stored in DI. Control is -// transferred to it when swapUint64 below receives SIGSEGV or SIGBUS, with the -// signal number stored in DI. -// -// It must have the same frame configuration as compareAndSwapUint32 so that it -// can undo any potential call frame set up by the assembler. -TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVL DI, sig+20(FP) - RET - -// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns -// (the value previously stored at addr, 0). If a SIGSEGV or SIGBUS signal is -// received during the operation, the value of prev is unspecified, and sig is -// the number of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) -TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, this is - // the value the caller will see; if a signal is received, - // handleCompareAndSwapUint32Fault will store a different value in this - // address. - MOVL $0, sig+20(FP) - - MOVQ addr+0(FP), DI - MOVL old+8(FP), AX - MOVL new+12(FP), DX - LOCK - CMPXCHGL DX, 0(DI) - MOVL AX, prev+16(FP) - RET - -// handleLoadUint32Fault returns the value stored in DI. Control is transferred -// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as loadUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 - MOVL DI, sig+12(FP) - RET - -// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS -// signal is received, the value returned is unspecified, and sig is the number -// of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) -TEXT ·loadUint32(SB), NOSPLIT, $0-16 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleLoadUint32Fault will store a different value in this address. - MOVL $0, sig+12(FP) - - MOVQ addr+0(FP), AX - MOVL (AX), BX - MOVL BX, val+8(FP) - RET diff --git a/pkg/sentry/platform/safecopy/atomic_arm64.s b/pkg/sentry/platform/safecopy/atomic_arm64.s deleted file mode 100644 index d58ed71f7..000000000 --- a/pkg/sentry/platform/safecopy/atomic_arm64.s +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleSwapUint32Fault returns the value stored in R1. Control is transferred -// to it when swapUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in R1. -// -// It must have the same frame configuration as swapUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVW R1, sig+20(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Xchg. -// -//func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) -TEXT ·swapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint32Fault will store a different value in this address. - MOVW $0, sig+20(FP) -again: - MOVD addr+0(FP), R0 - MOVW new+8(FP), R1 - LDAXRW (R0), R2 - STLXRW R1, (R0), R3 - CBNZ R3, again - MOVW R2, old+16(FP) - RET - -// handleSwapUint64Fault returns the value stored in R1. Control is transferred -// to it when swapUint64 below receives SIGSEGV or SIGBUS, with the signal -// number stored in R1. -// -// It must have the same frame configuration as swapUint64 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleSwapUint64Fault(SB), NOSPLIT, $0-28 - MOVW R1, sig+24(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Xchg64. -// -//func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) -TEXT ·swapUint64(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleSwapUint64Fault will store a different value in this address. - MOVW $0, sig+24(FP) -again: - MOVD addr+0(FP), R0 - MOVD new+8(FP), R1 - LDAXR (R0), R2 - STLXR R1, (R0), R3 - CBNZ R3, again - MOVD R2, old+16(FP) - RET - -// handleCompareAndSwapUint32Fault returns the value stored in R1. Control is -// transferred to it when compareAndSwapUint32 below receives SIGSEGV or SIGBUS, -// with the signal number stored in R1. -// -// It must have the same frame configuration as compareAndSwapUint32 so that it -// can undo any potential call frame set up by the assembler. -TEXT handleCompareAndSwapUint32Fault(SB), NOSPLIT, $0-24 - MOVW R1, sig+20(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from Go source runtime/internal/atomic.Cas. -// -//func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) -TEXT ·compareAndSwapUint32(SB), NOSPLIT, $0-24 - // Store 0 as the returned signal number. If we run to completion, this is - // the value the caller will see; if a signal is received, - // handleCompareAndSwapUint32Fault will store a different value in this - // address. - MOVW $0, sig+20(FP) - - MOVD addr+0(FP), R0 - MOVW old+8(FP), R1 - MOVW new+12(FP), R2 -again: - LDAXRW (R0), R3 - CMPW R1, R3 - BNE done - STLXRW R2, (R0), R4 - CBNZ R4, again -done: - MOVW R3, prev+16(FP) - RET - -// handleLoadUint32Fault returns the value stored in DI. Control is transferred -// to it when LoadUint32 below receives SIGSEGV or SIGBUS, with the signal -// number stored in DI. -// -// It must have the same frame configuration as loadUint32 so that it can undo -// any potential call frame set up by the assembler. -TEXT handleLoadUint32Fault(SB), NOSPLIT, $0-16 - MOVW R1, sig+12(FP) - RET - -// loadUint32 atomically loads *addr and returns it. If a SIGSEGV or SIGBUS -// signal is received, the value returned is unspecified, and sig is the number -// of the signal that was received. -// -// Preconditions: addr must be aligned to a 4-byte boundary. -// -//func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) -TEXT ·loadUint32(SB), NOSPLIT, $0-16 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleLoadUint32Fault will store a different value in this address. - MOVW $0, sig+12(FP) - - MOVD addr+0(FP), R0 - LDARW (R0), R1 - MOVW R1, val+8(FP) - RET diff --git a/pkg/sentry/platform/safecopy/memclr_amd64.s b/pkg/sentry/platform/safecopy/memclr_amd64.s deleted file mode 100644 index 64cf32f05..000000000 --- a/pkg/sentry/platform/safecopy/memclr_amd64.s +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemclrFault returns (the value stored in AX, the value stored in DI). -// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, -// with the faulting address stored in AX and the signal number stored in DI. -// -// It must have the same frame configuration as memclr so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemclrFault(SB), NOSPLIT, $0-28 - MOVQ AX, addr+16(FP) - MOVL DI, sig+24(FP) - RET - -// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS -// signal is received during the write, it returns the address that caused the -// fault and the number of the signal that was received. Otherwise, it returns -// an unspecified address and a signal number of 0. -// -// Data is written in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully written. -// -// The code is derived from runtime.memclrNoHeapPointers. -// -// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memclr(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemclrFault will store a different value in this address. - MOVL $0, sig+24(FP) - - MOVQ ptr+0(FP), DI - MOVQ n+8(FP), BX - XORQ AX, AX - - // MOVOU seems always faster than REP STOSQ. -tail: - TESTQ BX, BX - JEQ _0 - CMPQ BX, $2 - JBE _1or2 - CMPQ BX, $4 - JBE _3or4 - CMPQ BX, $8 - JB _5through7 - JE _8 - CMPQ BX, $16 - JBE _9through16 - PXOR X0, X0 - CMPQ BX, $32 - JBE _17through32 - CMPQ BX, $64 - JBE _33through64 - CMPQ BX, $128 - JBE _65through128 - CMPQ BX, $256 - JBE _129through256 - // TODO: use branch table and BSR to make this just a single dispatch - // TODO: for really big clears, use MOVNTDQ, even without AVX2. - -loop: - MOVOU X0, 0(DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, 128(DI) - MOVOU X0, 144(DI) - MOVOU X0, 160(DI) - MOVOU X0, 176(DI) - MOVOU X0, 192(DI) - MOVOU X0, 208(DI) - MOVOU X0, 224(DI) - MOVOU X0, 240(DI) - SUBQ $256, BX - ADDQ $256, DI - CMPQ BX, $256 - JAE loop - JMP tail - -_1or2: - MOVB AX, (DI) - MOVB AX, -1(DI)(BX*1) - RET -_0: - RET -_3or4: - MOVW AX, (DI) - MOVW AX, -2(DI)(BX*1) - RET -_5through7: - MOVL AX, (DI) - MOVL AX, -4(DI)(BX*1) - RET -_8: - // We need a separate case for 8 to make sure we clear pointers atomically. - MOVQ AX, (DI) - RET -_9through16: - MOVQ AX, (DI) - MOVQ AX, -8(DI)(BX*1) - RET -_17through32: - MOVOU X0, (DI) - MOVOU X0, -16(DI)(BX*1) - RET -_33through64: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET -_65through128: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET -_129through256: - MOVOU X0, (DI) - MOVOU X0, 16(DI) - MOVOU X0, 32(DI) - MOVOU X0, 48(DI) - MOVOU X0, 64(DI) - MOVOU X0, 80(DI) - MOVOU X0, 96(DI) - MOVOU X0, 112(DI) - MOVOU X0, -128(DI)(BX*1) - MOVOU X0, -112(DI)(BX*1) - MOVOU X0, -96(DI)(BX*1) - MOVOU X0, -80(DI)(BX*1) - MOVOU X0, -64(DI)(BX*1) - MOVOU X0, -48(DI)(BX*1) - MOVOU X0, -32(DI)(BX*1) - MOVOU X0, -16(DI)(BX*1) - RET diff --git a/pkg/sentry/platform/safecopy/memclr_arm64.s b/pkg/sentry/platform/safecopy/memclr_arm64.s deleted file mode 100644 index 7361b9067..000000000 --- a/pkg/sentry/platform/safecopy/memclr_arm64.s +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemclrFault returns (the value stored in R0, the value stored in R1). -// Control is transferred to it when memclr below receives SIGSEGV or SIGBUS, -// with the faulting address stored in R0 and the signal number stored in R1. -// -// It must have the same frame configuration as memclr so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemclrFault(SB), NOSPLIT, $0-28 - MOVD R0, addr+16(FP) - MOVW R1, sig+24(FP) - RET - -// See the corresponding doc in safecopy_unsafe.go -// -// The code is derived from runtime.memclrNoHeapPointers. -// -// func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memclr(SB), NOSPLIT, $0-28 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemclrFault will store a different value in this address. - MOVW $0, sig+24(FP) - MOVD ptr+0(FP), R0 - MOVD n+8(FP), R1 - - // If size is less than 16 bytes, use tail_zero to zero what remains - CMP $16, R1 - BLT tail_zero - // Get buffer offset into 16 byte aligned address for better performance - ANDS $15, R0, ZR - BNE unaligned_to_16 -aligned_to_16: - LSR $4, R1, R2 -zero_by_16: - STP.P (ZR, ZR), 16(R0) // Store pair with post index. - SUBS $1, R2, R2 - BNE zero_by_16 - ANDS $15, R1, R1 - BEQ end - - // Zero buffer with size=R1 < 16 -tail_zero: - TBZ $3, R1, tail_zero_4 - MOVD.P ZR, 8(R0) -tail_zero_4: - TBZ $2, R1, tail_zero_2 - MOVW.P ZR, 4(R0) -tail_zero_2: - TBZ $1, R1, tail_zero_1 - MOVH.P ZR, 2(R0) -tail_zero_1: - TBZ $0, R1, end - MOVB ZR, (R0) -end: - RET - -unaligned_to_16: - MOVD R0, R2 -head_loop: - MOVBU.P ZR, 1(R0) - ANDS $15, R0, ZR - BNE head_loop - // Adjust length for what remains - SUB R2, R0, R3 - SUB R3, R1 - // If size is less than 16 bytes, use tail_zero to zero what remains - CMP $16, R1 - BLT tail_zero - B aligned_to_16 diff --git a/pkg/sentry/platform/safecopy/memcpy_amd64.s b/pkg/sentry/platform/safecopy/memcpy_amd64.s deleted file mode 100644 index 129691d68..000000000 --- a/pkg/sentry/platform/safecopy/memcpy_amd64.s +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved. -// Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com). All rights reserved. -// Portions Copyright 2009 The Go Authors. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. - -#include "textflag.h" - -// handleMemcpyFault returns (the value stored in AX, the value stored in DI). -// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, -// with the faulting address stored in AX and the signal number stored in DI. -// -// It must have the same frame configuration as memcpy so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 - MOVQ AX, addr+24(FP) - MOVL DI, sig+32(FP) - RET - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. -// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -// The code is derived from the forward copying part of runtime.memmove. -// -// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memcpy(SB), NOSPLIT, $0-36 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemcpyFault will store a different value in this address. - MOVL $0, sig+32(FP) - - MOVQ to+0(FP), DI - MOVQ from+8(FP), SI - MOVQ n+16(FP), BX - - // REP instructions have a high startup cost, so we handle small sizes - // with some straightline code. The REP MOVSQ instruction is really fast - // for large sizes. The cutover is approximately 2K. -tail: - // move_129through256 or smaller work whether or not the source and the - // destination memory regions overlap because they load all data into - // registers before writing it back. move_256through2048 on the other - // hand can be used only when the memory regions don't overlap or the copy - // direction is forward. - TESTQ BX, BX - JEQ move_0 - CMPQ BX, $2 - JBE move_1or2 - CMPQ BX, $4 - JBE move_3or4 - CMPQ BX, $8 - JB move_5through7 - JE move_8 - CMPQ BX, $16 - JBE move_9through16 - CMPQ BX, $32 - JBE move_17through32 - CMPQ BX, $64 - JBE move_33through64 - CMPQ BX, $128 - JBE move_65through128 - CMPQ BX, $256 - JBE move_129through256 - // TODO: use branch table and BSR to make this just a single dispatch - -/* - * forward copy loop - */ - CMPQ BX, $2048 - JLS move_256through2048 - - // Check alignment - MOVL SI, AX - ORL DI, AX - TESTL $7, AX - JEQ fwdBy8 - - // Do 1 byte at a time - MOVQ BX, CX - REP; MOVSB - RET - -fwdBy8: - // Do 8 bytes at a time - MOVQ BX, CX - SHRQ $3, CX - ANDQ $7, BX - REP; MOVSQ - JMP tail - -move_1or2: - MOVB (SI), AX - MOVB AX, (DI) - MOVB -1(SI)(BX*1), CX - MOVB CX, -1(DI)(BX*1) - RET -move_0: - RET -move_3or4: - MOVW (SI), AX - MOVW AX, (DI) - MOVW -2(SI)(BX*1), CX - MOVW CX, -2(DI)(BX*1) - RET -move_5through7: - MOVL (SI), AX - MOVL AX, (DI) - MOVL -4(SI)(BX*1), CX - MOVL CX, -4(DI)(BX*1) - RET -move_8: - // We need a separate case for 8 to make sure we write pointers atomically. - MOVQ (SI), AX - MOVQ AX, (DI) - RET -move_9through16: - MOVQ (SI), AX - MOVQ AX, (DI) - MOVQ -8(SI)(BX*1), CX - MOVQ CX, -8(DI)(BX*1) - RET -move_17through32: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU -16(SI)(BX*1), X1 - MOVOU X1, -16(DI)(BX*1) - RET -move_33through64: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU -32(SI)(BX*1), X2 - MOVOU X2, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X3 - MOVOU X3, -16(DI)(BX*1) - RET -move_65through128: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU -64(SI)(BX*1), X4 - MOVOU X4, -64(DI)(BX*1) - MOVOU -48(SI)(BX*1), X5 - MOVOU X5, -48(DI)(BX*1) - MOVOU -32(SI)(BX*1), X6 - MOVOU X6, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X7 - MOVOU X7, -16(DI)(BX*1) - RET -move_129through256: - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU 64(SI), X4 - MOVOU X4, 64(DI) - MOVOU 80(SI), X5 - MOVOU X5, 80(DI) - MOVOU 96(SI), X6 - MOVOU X6, 96(DI) - MOVOU 112(SI), X7 - MOVOU X7, 112(DI) - MOVOU -128(SI)(BX*1), X8 - MOVOU X8, -128(DI)(BX*1) - MOVOU -112(SI)(BX*1), X9 - MOVOU X9, -112(DI)(BX*1) - MOVOU -96(SI)(BX*1), X10 - MOVOU X10, -96(DI)(BX*1) - MOVOU -80(SI)(BX*1), X11 - MOVOU X11, -80(DI)(BX*1) - MOVOU -64(SI)(BX*1), X12 - MOVOU X12, -64(DI)(BX*1) - MOVOU -48(SI)(BX*1), X13 - MOVOU X13, -48(DI)(BX*1) - MOVOU -32(SI)(BX*1), X14 - MOVOU X14, -32(DI)(BX*1) - MOVOU -16(SI)(BX*1), X15 - MOVOU X15, -16(DI)(BX*1) - RET -move_256through2048: - SUBQ $256, BX - MOVOU (SI), X0 - MOVOU X0, (DI) - MOVOU 16(SI), X1 - MOVOU X1, 16(DI) - MOVOU 32(SI), X2 - MOVOU X2, 32(DI) - MOVOU 48(SI), X3 - MOVOU X3, 48(DI) - MOVOU 64(SI), X4 - MOVOU X4, 64(DI) - MOVOU 80(SI), X5 - MOVOU X5, 80(DI) - MOVOU 96(SI), X6 - MOVOU X6, 96(DI) - MOVOU 112(SI), X7 - MOVOU X7, 112(DI) - MOVOU 128(SI), X8 - MOVOU X8, 128(DI) - MOVOU 144(SI), X9 - MOVOU X9, 144(DI) - MOVOU 160(SI), X10 - MOVOU X10, 160(DI) - MOVOU 176(SI), X11 - MOVOU X11, 176(DI) - MOVOU 192(SI), X12 - MOVOU X12, 192(DI) - MOVOU 208(SI), X13 - MOVOU X13, 208(DI) - MOVOU 224(SI), X14 - MOVOU X14, 224(DI) - MOVOU 240(SI), X15 - MOVOU X15, 240(DI) - CMPQ BX, $256 - LEAQ 256(SI), SI - LEAQ 256(DI), DI - JGE move_256through2048 - JMP tail diff --git a/pkg/sentry/platform/safecopy/memcpy_arm64.s b/pkg/sentry/platform/safecopy/memcpy_arm64.s deleted file mode 100644 index e7e541565..000000000 --- a/pkg/sentry/platform/safecopy/memcpy_arm64.s +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2014 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "textflag.h" - -// handleMemcpyFault returns (the value stored in R0, the value stored in R1). -// Control is transferred to it when memcpy below receives SIGSEGV or SIGBUS, -// with the faulting address stored in R0 and the signal number stored in R1. -// -// It must have the same frame configuration as memcpy so that it can undo any -// potential call frame set up by the assembler. -TEXT handleMemcpyFault(SB), NOSPLIT, $0-36 - MOVD R0, addr+24(FP) - MOVW R1, sig+32(FP) - RET - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. -// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -// The code is derived from the Go source runtime.memmove. -// -// func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) -TEXT ·memcpy(SB), NOSPLIT, $-8-36 - // Store 0 as the returned signal number. If we run to completion, - // this is the value the caller will see; if a signal is received, - // handleMemcpyFault will store a different value in this address. - MOVW $0, sig+32(FP) - - MOVD to+0(FP), R3 - MOVD from+8(FP), R4 - MOVD n+16(FP), R5 - CMP $0, R5 - BNE check - RET - -check: - AND $~7, R5, R7 // R7 is N&~7. - SUB R7, R5, R6 // R6 is N&7. - - // Copying forward proceeds by copying R7/8 words then copying R6 bytes. - // R3 and R4 are advanced as we copy. - - // (There may be implementations of armv8 where copying by bytes until - // at least one of source or dest is word aligned is a worthwhile - // optimization, but the on the one tested so far (xgene) it did not - // make a significance difference.) - - CMP $0, R7 // Do we need to do any word-by-word copying? - BEQ noforwardlarge - ADD R3, R7, R9 // R9 points just past where we copy by word. - -forwardlargeloop: - MOVD.P 8(R4), R8 // R8 is just a scratch register. - MOVD.P R8, 8(R3) - CMP R3, R9 - BNE forwardlargeloop - -noforwardlarge: - CMP $0, R6 // Do we need to do any byte-by-byte copying? - BNE forwardtail - RET - -forwardtail: - ADD R3, R6, R9 // R9 points just past the destination memory. - -forwardtailloop: - MOVBU.P 1(R4), R8 - MOVBU.P R8, 1(R3) - CMP R3, R9 - BNE forwardtailloop - RET diff --git a/pkg/sentry/platform/safecopy/safecopy.go b/pkg/sentry/platform/safecopy/safecopy.go deleted file mode 100644 index 2fb7e5809..000000000 --- a/pkg/sentry/platform/safecopy/safecopy.go +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package safecopy provides an efficient implementation of functions to access -// memory that may result in SIGSEGV or SIGBUS being sent to the accessor. -package safecopy - -import ( - "fmt" - "reflect" - "runtime" - "syscall" - - "gvisor.dev/gvisor/pkg/syserror" -) - -// SegvError is returned when a safecopy function receives SIGSEGV. -type SegvError struct { - // Addr is the address at which the SIGSEGV occurred. - Addr uintptr -} - -// Error implements error.Error. -func (e SegvError) Error() string { - return fmt.Sprintf("SIGSEGV at %#x", e.Addr) -} - -// BusError is returned when a safecopy function receives SIGBUS. -type BusError struct { - // Addr is the address at which the SIGBUS occurred. - Addr uintptr -} - -// Error implements error.Error. -func (e BusError) Error() string { - return fmt.Sprintf("SIGBUS at %#x", e.Addr) -} - -// AlignmentError is returned when a safecopy function is passed an address -// that does not meet alignment requirements. -type AlignmentError struct { - // Addr is the invalid address. - Addr uintptr - - // Alignment is the required alignment. - Alignment uintptr -} - -// Error implements error.Error. -func (e AlignmentError) Error() string { - return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment) -} - -var ( - // The begin and end addresses below are for the functions that are - // checked by the signal handler. - memcpyBegin uintptr - memcpyEnd uintptr - memclrBegin uintptr - memclrEnd uintptr - swapUint32Begin uintptr - swapUint32End uintptr - swapUint64Begin uintptr - swapUint64End uintptr - compareAndSwapUint32Begin uintptr - compareAndSwapUint32End uintptr - loadUint32Begin uintptr - loadUint32End uintptr - - // savedSigSegVHandler is a pointer to the SIGSEGV handler that was - // configured before we replaced it with our own. We still call into it - // when we get a SIGSEGV that is not interesting to us. - savedSigSegVHandler uintptr - - // same a above, but for SIGBUS signals. - savedSigBusHandler uintptr -) - -// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS -// signals. -func signalHandler() - -// FindEndAddress returns the end address (one byte beyond the last) of the -// function that contains the specified address (begin). -func FindEndAddress(begin uintptr) uintptr { - f := runtime.FuncForPC(begin) - if f != nil { - for p := begin; ; p++ { - g := runtime.FuncForPC(p) - if f != g { - return p - } - } - } - return begin -} - -// initializeAddresses initializes the addresses used by the signal handler. -func initializeAddresses() { - // The following functions are written in assembly language, so they won't - // be inlined by the existing compiler/linker. Tests will fail if this - // assumption is violated. - memcpyBegin = reflect.ValueOf(memcpy).Pointer() - memcpyEnd = FindEndAddress(memcpyBegin) - memclrBegin = reflect.ValueOf(memclr).Pointer() - memclrEnd = FindEndAddress(memclrBegin) - swapUint32Begin = reflect.ValueOf(swapUint32).Pointer() - swapUint32End = FindEndAddress(swapUint32Begin) - swapUint64Begin = reflect.ValueOf(swapUint64).Pointer() - swapUint64End = FindEndAddress(swapUint64Begin) - compareAndSwapUint32Begin = reflect.ValueOf(compareAndSwapUint32).Pointer() - compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin) - loadUint32Begin = reflect.ValueOf(loadUint32).Pointer() - loadUint32End = FindEndAddress(loadUint32Begin) -} - -func init() { - initializeAddresses() - if err := ReplaceSignalHandler(syscall.SIGSEGV, reflect.ValueOf(signalHandler).Pointer(), &savedSigSegVHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err)) - } - if err := ReplaceSignalHandler(syscall.SIGBUS, reflect.ValueOf(signalHandler).Pointer(), &savedSigBusHandler); err != nil { - panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err)) - } - syserror.AddErrorUnwrapper(func(e error) (syscall.Errno, bool) { - switch e.(type) { - case SegvError, BusError, AlignmentError: - return syscall.EFAULT, true - default: - return 0, false - } - }) -} diff --git a/pkg/sentry/platform/safecopy/safecopy_test.go b/pkg/sentry/platform/safecopy/safecopy_test.go deleted file mode 100644 index 5818f7f9b..000000000 --- a/pkg/sentry/platform/safecopy/safecopy_test.go +++ /dev/null @@ -1,617 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safecopy - -import ( - "bytes" - "fmt" - "io/ioutil" - "math/rand" - "os" - "runtime/debug" - "syscall" - "testing" - "unsafe" -) - -// Size of a page in bytes. Cloned from usermem.PageSize to avoid a circular -// dependency. -const pageSize = 4096 - -func initRandom(b []byte) { - for i := range b { - b[i] = byte(rand.Intn(256)) - } -} - -func randBuf(size int) []byte { - b := make([]byte, size) - initRandom(b) - return b -} - -func TestCopyInSuccess(t *testing.T) { - // Test that CopyIn does not return an error when all pages are accessible. - const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := CopyIn(b, unsafe.Pointer(&a[0])) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestCopyOutSuccess(t *testing.T) { - // Test that CopyOut does not return an error when all pages are - // accessible. - const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := CopyOut(unsafe.Pointer(&b[0]), a) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestCopySuccess(t *testing.T) { - // Test that Copy does not return an error when all pages are accessible. - const bufLen = 8192 - a := randBuf(bufLen) - b := make([]byte, bufLen) - - n, err := Copy(unsafe.Pointer(&b[0]), unsafe.Pointer(&a[0]), bufLen) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestZeroOutSuccess(t *testing.T) { - // Test that ZeroOut does not return an error when all pages are - // accessible. - const bufLen = 8192 - a := make([]byte, bufLen) - b := randBuf(bufLen) - - n, err := ZeroOut(unsafe.Pointer(&b[0]), bufLen) - if n != bufLen { - t.Errorf("Unexpected copy length, got %v, want %v", n, bufLen) - } - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if !bytes.Equal(a, b) { - t.Errorf("Buffers are not equal when they should be: %v %v", a, b) - } -} - -func TestSwapUint32Success(t *testing.T) { - // Test that SwapUint32 does not return an error when the page is - // accessible. - before := uint32(rand.Int31()) - after := uint32(rand.Int31()) - val := before - - old, err := SwapUint32(unsafe.Pointer(&val), after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if val != after { - t.Errorf("Unexpected new value: got %v, want %v", val, after) - } -} - -func TestSwapUint32AlignmentError(t *testing.T) { - // Test that SwapUint32 returns an AlignmentError when passed an unaligned - // address. - data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := SwapUint32(unsafe.Pointer(addr), 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -func TestSwapUint64Success(t *testing.T) { - // Test that SwapUint64 does not return an error when the page is - // accessible. - before := uint64(rand.Int63()) - after := uint64(rand.Int63()) - // "The first word in ... an allocated struct or slice can be relied upon - // to be 64-bit aligned." - sync/atomic docs - data := new(struct{ val uint64 }) - data.val = before - - old, err := SwapUint64(unsafe.Pointer(&data.val), after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if data.val != after { - t.Errorf("Unexpected new value: got %v, want %v", data.val, after) - } -} - -func TestSwapUint64AlignmentError(t *testing.T) { - // Test that SwapUint64 returns an AlignmentError when passed an unaligned - // address. - data := new(struct{ val1, val2 uint64 }) - addr := uintptr(unsafe.Pointer(&data.val1)) + 1 - want := AlignmentError{Addr: addr, Alignment: 8} - if _, err := SwapUint64(unsafe.Pointer(addr), 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -func TestCompareAndSwapUint32Success(t *testing.T) { - // Test that CompareAndSwapUint32 does not return an error when the page is - // accessible. - before := uint32(rand.Int31()) - after := uint32(rand.Int31()) - val := before - - old, err := CompareAndSwapUint32(unsafe.Pointer(&val), before, after) - if err != nil { - t.Errorf("Unexpected error: %v", err) - } - if old != before { - t.Errorf("Unexpected old value: got %v, want %v", old, before) - } - if val != after { - t.Errorf("Unexpected new value: got %v, want %v", val, after) - } -} - -func TestCompareAndSwapUint32AlignmentError(t *testing.T) { - // Test that CompareAndSwapUint32 returns an AlignmentError when passed an - // unaligned address. - data := new(struct{ val uint64 }) - addr := uintptr(unsafe.Pointer(&data.val)) + 1 - want := AlignmentError{Addr: addr, Alignment: 4} - if _, err := CompareAndSwapUint32(unsafe.Pointer(addr), 0, 1); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } -} - -// withSegvErrorTestMapping calls fn with a two-page mapping. The first page -// contains random data, and the second page generates SIGSEGV when accessed. -func withSegvErrorTestMapping(t *testing.T, fn func(m []byte)) { - mapping, err := syscall.Mmap(-1, 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANONYMOUS|syscall.MAP_PRIVATE) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - } - defer syscall.Munmap(mapping) - if err := syscall.Mprotect(mapping[pageSize:], syscall.PROT_NONE); err != nil { - t.Fatalf("Mprotect failed: %v", err) - } - initRandom(mapping[:pageSize]) - - fn(mapping) -} - -// withBusErrorTestMapping calls fn with a two-page mapping. The first page -// contains random data, and the second page generates SIGBUS when accessed. -func withBusErrorTestMapping(t *testing.T, fn func(m []byte)) { - f, err := ioutil.TempFile("", "sigbus_test") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - defer f.Close() - if err := f.Truncate(pageSize); err != nil { - t.Fatalf("Truncate failed: %v", err) - } - mapping, err := syscall.Mmap(int(f.Fd()), 0, 2*pageSize, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - } - defer syscall.Munmap(mapping) - initRandom(mapping[:pageSize]) - - fn(mapping) -} - -func TestCopyInSegvError(t *testing.T) { - // Test that CopyIn returns a SegvError when reaching a page that signals - // SIGSEGV. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := CopyIn(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyInBusError(t *testing.T) { - // Test that CopyIn returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := CopyIn(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyOutSegvError(t *testing.T) { - // Test that CopyOut returns a SegvError when reaching a page that signals - // SIGSEGV. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := CopyOut(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyOutBusError(t *testing.T) { - // Test that CopyOut returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := CopyOut(dst, src) - if n != bytesBeforeFault { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopySourceSegvError(t *testing.T) { - // Test that Copy returns a SegvError when copying from a page that signals - // SIGSEGV. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopySourceBusError(t *testing.T) { - // Test that Copy returns a BusError when copying from a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - src := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - dst := randBuf(pageSize) - n, err := Copy(unsafe.Pointer(&dst[0]), src, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := dst[:bytesBeforeFault], mapping[pageSize-bytesBeforeFault:pageSize]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyDestinationSegvError(t *testing.T) { - // Test that Copy returns a SegvError when copying to a page that signals - // SIGSEGV. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestCopyDestinationBusError(t *testing.T) { - // Test that Copy returns a BusError when copying to a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting copy %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - src := randBuf(pageSize) - n, err := Copy(dst, unsafe.Pointer(&src[0]), pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected copy length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], src[:bytesBeforeFault]; !bytes.Equal(got, want) { - t.Errorf("Buffers are not equal when they should be: %v %v", got, want) - } - }) - }) - } -} - -func TestZeroOutSegvError(t *testing.T) { - // Test that ZeroOut returns a SegvError when reaching a page that signals - // SIGSEGV. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting write %d bytes before SIGSEGV", bytesBeforeFault), func(t *testing.T) { - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - n, err := ZeroOut(dst, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) - } - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { - t.Errorf("Non-zero bytes in written part of mapping: %v", got) - } - }) - }) - } -} - -func TestZeroOutBusError(t *testing.T) { - // Test that ZeroOut returns a BusError when reaching a page that signals - // SIGBUS. - for bytesBeforeFault := 0; bytesBeforeFault <= 2*maxRegisterSize; bytesBeforeFault++ { - t.Run(fmt.Sprintf("starting write %d bytes before SIGBUS", bytesBeforeFault), func(t *testing.T) { - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - dst := unsafe.Pointer(secondPage - uintptr(bytesBeforeFault)) - n, err := ZeroOut(dst, pageSize) - if n != uintptr(bytesBeforeFault) { - t.Errorf("Unexpected write length: got %v, want %v", n, bytesBeforeFault) - } - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - if got, want := mapping[pageSize-bytesBeforeFault:pageSize], make([]byte, bytesBeforeFault); !bytes.Equal(got, want) { - t.Errorf("Non-zero bytes in written part of mapping: %v", got) - } - }) - }) - } -} - -func TestSwapUint32SegvError(t *testing.T) { - // Test that SwapUint32 returns a SegvError when reaching a page that - // signals SIGSEGV. - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint32(unsafe.Pointer(secondPage), 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint32BusError(t *testing.T) { - // Test that SwapUint32 returns a BusError when reaching a page that - // signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint32(unsafe.Pointer(secondPage), 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint64SegvError(t *testing.T) { - // Test that SwapUint64 returns a SegvError when reaching a page that - // signals SIGSEGV. - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint64(unsafe.Pointer(secondPage), 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestSwapUint64BusError(t *testing.T) { - // Test that SwapUint64 returns a BusError when reaching a page that - // signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := SwapUint64(unsafe.Pointer(secondPage), 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestCompareAndSwapUint32SegvError(t *testing.T) { - // Test that CompareAndSwapUint32 returns a SegvError when reaching a page - // that signals SIGSEGV. - withSegvErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) - if want := (SegvError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func TestCompareAndSwapUint32BusError(t *testing.T) { - // Test that CompareAndSwapUint32 returns a BusError when reaching a page - // that signals SIGBUS. - withBusErrorTestMapping(t, func(mapping []byte) { - secondPage := uintptr(unsafe.Pointer(&mapping[0])) + pageSize - _, err := CompareAndSwapUint32(unsafe.Pointer(secondPage), 0, 1) - if want := (BusError{secondPage}); err != want { - t.Errorf("Unexpected error: got %v, want %v", err, want) - } - }) -} - -func testCopy(dst, src []byte) (panicked bool) { - defer func() { - if r := recover(); r != nil { - panicked = true - } - }() - debug.SetPanicOnFault(true) - copy(dst, src) - return -} - -func TestSegVOnMemmove(t *testing.T) { - // Test that SIGSEGVs received by runtime.memmove when *not* doing - // CopyIn or CopyOut work gets propagated to the runtime. - const bufLen = pageSize - a, err := syscall.Mmap(-1, 0, bufLen, syscall.PROT_NONE, syscall.MAP_ANON|syscall.MAP_PRIVATE) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - - } - defer syscall.Munmap(a) - b := randBuf(bufLen) - - if !testCopy(b, a) { - t.Fatalf("testCopy didn't panic when it should have") - } - - if !testCopy(a, b) { - t.Fatalf("testCopy didn't panic when it should have") - } -} - -func TestSigbusOnMemmove(t *testing.T) { - // Test that SIGBUS received by runtime.memmove when *not* doing - // CopyIn or CopyOut work gets propagated to the runtime. - const bufLen = pageSize - f, err := ioutil.TempFile("", "sigbus_test") - if err != nil { - t.Fatalf("TempFile failed: %v", err) - } - os.Remove(f.Name()) - defer f.Close() - - a, err := syscall.Mmap(int(f.Fd()), 0, bufLen, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) - if err != nil { - t.Fatalf("Mmap failed: %v", err) - - } - defer syscall.Munmap(a) - b := randBuf(bufLen) - - if !testCopy(b, a) { - t.Fatalf("testCopy didn't panic when it should have") - } - - if !testCopy(a, b) { - t.Fatalf("testCopy didn't panic when it should have") - } -} diff --git a/pkg/sentry/platform/safecopy/safecopy_unsafe.go b/pkg/sentry/platform/safecopy/safecopy_unsafe.go deleted file mode 100644 index eef028e68..000000000 --- a/pkg/sentry/platform/safecopy/safecopy_unsafe.go +++ /dev/null @@ -1,335 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safecopy - -import ( - "fmt" - "syscall" - "unsafe" -) - -// maxRegisterSize is the maximum register size used in memcpy and memclr. It -// is used to decide by how much to rewind the copy (for memcpy) or zeroing -// (for memclr) before proceeding. -const maxRegisterSize = 16 - -// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is received -// during the copy, it returns the address that caused the fault and the number -// of the signal that was received. Otherwise, it returns an unspecified address -// and a signal number of 0. -// -// Data is copied in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully copied. -// -//go:noescape -func memcpy(dst, src unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) - -// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS -// signal is received during the write, it returns the address that caused the -// fault and the number of the signal that was received. Otherwise, it returns -// an unspecified address and a signal number of 0. -// -// Data is written in order, such that if a fault happens at address p, it is -// safe to assume that all data before p-maxRegisterSize has already been -// successfully written. -// -//go:noescape -func memclr(ptr unsafe.Pointer, n uintptr) (fault unsafe.Pointer, sig int32) - -// swapUint32 atomically stores new into *ptr and returns (the previous *ptr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32) - -// swapUint64 atomically stores new into *ptr and returns (the previous *ptr -// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the -// value of old is unspecified, and sig is the number of the signal that was -// received. -// -// Preconditions: ptr must be aligned to a 8-byte boundary. -// -//go:noescape -func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32) - -// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns -// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is -// received during the operation, the value of prev is unspecified, and sig is -// the number of the signal that was received. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32) - -// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It -// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -// -//go:noescape -func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32) - -// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes -// copied and an error if SIGSEGV or SIGBUS is received while reading from src. -func CopyIn(dst []byte, src unsafe.Pointer) (int, error) { - toCopy := uintptr(len(dst)) - if len(dst) == 0 { - return 0, nil - } - - fault, sig := memcpy(unsafe.Pointer(&dst[0]), src, toCopy) - if sig == 0 { - return len(dst), nil - } - - faultN, srcN := uintptr(fault), uintptr(src) - if faultN < srcN || faultN >= srcN+toCopy { - panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, faultN, srcN, srcN+toCopy)) - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done int - if faultN-srcN > maxRegisterSize { - done = int(faultN - srcN - maxRegisterSize) - } - n, err := CopyIn(dst[done:int(faultN-srcN)], unsafe.Pointer(srcN+uintptr(done))) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// CopyOut copies len(src) bytes from src to dst. If returns the number of -// bytes done and an error if SIGSEGV or SIGBUS is received while writing to -// dst. -func CopyOut(dst unsafe.Pointer, src []byte) (int, error) { - toCopy := uintptr(len(src)) - if toCopy == 0 { - return 0, nil - } - - fault, sig := memcpy(dst, unsafe.Pointer(&src[0]), toCopy) - if sig == 0 { - return len(src), nil - } - - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toCopy { - panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toCopy)) - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done int - if faultN-dstN > maxRegisterSize { - done = int(faultN - dstN - maxRegisterSize) - } - n, err := CopyOut(unsafe.Pointer(dstN+uintptr(done)), src[done:int(faultN-dstN)]) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// Copy copies toCopy bytes from src to dst. It returns the number of bytes -// copied and an error if SIGSEGV or SIGBUS is received while reading from src -// or writing to dst. -// -// Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, -// the resulting contents of dst are unspecified. -func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { - if toCopy == 0 { - return 0, nil - } - - fault, sig := memcpy(dst, src, toCopy) - if sig == 0 { - return toCopy, nil - } - - // Did the fault occur while reading from src or writing to dst? - faultN, srcN, dstN := uintptr(fault), uintptr(src), uintptr(dst) - faultAfterSrc := ^uintptr(0) - if faultN >= srcN { - faultAfterSrc = faultN - srcN - } - faultAfterDst := ^uintptr(0) - if faultN >= dstN { - faultAfterDst = faultN - dstN - } - if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { - panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, faultN, srcN, srcN+toCopy, dstN, dstN+toCopy)) - } - faultedAfter := faultAfterSrc - if faultedAfter > faultAfterDst { - faultedAfter = faultAfterDst - } - - // memcpy might have ended the copy up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to copy up to the fault. - var done uintptr - if faultedAfter > maxRegisterSize { - done = faultedAfter - maxRegisterSize - } - n, err := Copy(unsafe.Pointer(dstN+done), unsafe.Pointer(srcN+done), faultedAfter-done) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// ZeroOut writes toZero zero bytes to dst. It returns the number of bytes -// written and an error if SIGSEGV or SIGBUS is received while writing to dst. -func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { - if toZero == 0 { - return 0, nil - } - - fault, sig := memclr(dst, toZero) - if sig == 0 { - return toZero, nil - } - - faultN, dstN := uintptr(fault), uintptr(dst) - if faultN < dstN || faultN >= dstN+toZero { - panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, faultN, dstN, dstN+toZero)) - } - - // memclr might have ended the write up to maxRegisterSize bytes before - // fault, if an instruction caused a memory access that straddled two - // pages, and the second one faulted. Try to write up to the fault. - var done uintptr - if faultN-dstN > maxRegisterSize { - done = faultN - dstN - maxRegisterSize - } - n, err := ZeroOut(unsafe.Pointer(dstN+done), faultN-dstN-done) - done += n - if err != nil { - return done, err - } - return done, errorFromFaultSignal(fault, sig) -} - -// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns -// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is -// not aligned to a 4-byte boundary. -func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - old, sig := swapUint32(ptr, new) - return old, errorFromFaultSignal(ptr, sig) -} - -// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns -// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is -// not aligned to an 8-byte boundary. -func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) { - if addr := uintptr(ptr); addr&7 != 0 { - return 0, AlignmentError{addr, 8} - } - old, sig := swapUint64(ptr, new) - return old, errorFromFaultSignal(ptr, sig) -} - -// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32, -// except that it returns an error if SIGSEGV or SIGBUS is received while -// accessing ptr, or if ptr is not aligned to a 4-byte boundary. -func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - prev, sig := compareAndSwapUint32(ptr, old, new) - return prev, errorFromFaultSignal(ptr, sig) -} - -// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory. It -// may fail with SIGSEGV or SIGBUS if it is received while reading from ptr. -// -// Preconditions: ptr must be aligned to a 4-byte boundary. -func LoadUint32(ptr unsafe.Pointer) (uint32, error) { - if addr := uintptr(ptr); addr&3 != 0 { - return 0, AlignmentError{addr, 4} - } - val, sig := loadUint32(ptr) - return val, errorFromFaultSignal(ptr, sig) -} - -func errorFromFaultSignal(addr unsafe.Pointer, sig int32) error { - switch sig { - case 0: - return nil - case int32(syscall.SIGSEGV): - return SegvError{uintptr(addr)} - case int32(syscall.SIGBUS): - return BusError{uintptr(addr)} - default: - panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr)) - } -} - -// ReplaceSignalHandler replaces the existing signal handler for the provided -// signal with the one that handles faults in safecopy-protected functions. -// -// It stores the value of the previously set handler in previous. -// -// This function will be called on initialization in order to install safecopy -// handlers for appropriate signals. These handlers will call the previous -// handler however, and if this is function is being used externally then the -// same courtesy is expected. -func ReplaceSignalHandler(sig syscall.Signal, handler uintptr, previous *uintptr) error { - var sa struct { - handler uintptr - flags uint64 - restorer uintptr - mask uint64 - } - const maskLen = 8 - - // Get the existing signal handler information, and save the current - // handler. Once we replace it, we will use this pointer to fall back to - // it when we receive other signals. - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 { - return e - } - - // Fail if there isn't a previous handler. - if sa.handler == 0 { - return fmt.Errorf("previous handler for signal %x isn't set", sig) - } - - *previous = sa.handler - - // Install our own handler. - sa.handler = handler - if _, _, e := syscall.RawSyscall6(syscall.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { - return e - } - - return nil -} diff --git a/pkg/sentry/platform/safecopy/sighandler_amd64.s b/pkg/sentry/platform/safecopy/sighandler_amd64.s deleted file mode 100644 index 475ae48e9..000000000 --- a/pkg/sentry/platform/safecopy/sighandler_amd64.s +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// The signals handled by sigHandler. -#define SIGBUS 7 -#define SIGSEGV 11 - -// Offsets to the registers in context->uc_mcontext.gregs[]. -#define REG_RDI 0x68 -#define REG_RAX 0x90 -#define REG_IP 0xa8 - -// Offset to the si_addr field of siginfo. -#define SI_CODE 0x08 -#define SI_ADDR 0x10 - -// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must -// not be set up as a handler to any other signals. -// -// If the instruction causing the signal is within a safecopy-protected -// function, the signal is handled such that execution resumes in the -// appropriate fault handling stub with AX containing the faulting address and -// DI containing the signal number. Otherwise control is transferred to the -// previously configured signal handler (savedSigSegvHandler or -// savedSigBusHandler). -// -// This function cannot be written in go because it runs whenever a signal is -// received by the thread (preempting whatever was running), which includes when -// garbage collector has stopped or isn't expecting any interactions (like -// barriers). -// -// The arguments are the following: -// DI - The signal number. -// SI - Pointer to siginfo_t structure. -// DX - Pointer to ucontext structure. -TEXT ·signalHandler(SB),NOSPLIT,$0 - // Check if the signal is from the kernel. - MOVQ $0x0, CX - CMPL CX, SI_CODE(SI) - JGE original_handler - - // Check if RIP is within the area we care about. - MOVQ REG_IP(DX), CX - CMPQ CX, ·memcpyBegin(SB) - JB not_memcpy - CMPQ CX, ·memcpyEnd(SB) - JAE not_memcpy - - // Modify the context such that execution will resume in the fault - // handler. - LEAQ handleMemcpyFault(SB), CX - JMP handle_fault - -not_memcpy: - CMPQ CX, ·memclrBegin(SB) - JB not_memclr - CMPQ CX, ·memclrEnd(SB) - JAE not_memclr - - LEAQ handleMemclrFault(SB), CX - JMP handle_fault - -not_memclr: - CMPQ CX, ·swapUint32Begin(SB) - JB not_swapuint32 - CMPQ CX, ·swapUint32End(SB) - JAE not_swapuint32 - - LEAQ handleSwapUint32Fault(SB), CX - JMP handle_fault - -not_swapuint32: - CMPQ CX, ·swapUint64Begin(SB) - JB not_swapuint64 - CMPQ CX, ·swapUint64End(SB) - JAE not_swapuint64 - - LEAQ handleSwapUint64Fault(SB), CX - JMP handle_fault - -not_swapuint64: - CMPQ CX, ·compareAndSwapUint32Begin(SB) - JB not_casuint32 - CMPQ CX, ·compareAndSwapUint32End(SB) - JAE not_casuint32 - - LEAQ handleCompareAndSwapUint32Fault(SB), CX - JMP handle_fault - -not_casuint32: - CMPQ CX, ·loadUint32Begin(SB) - JB not_loaduint32 - CMPQ CX, ·loadUint32End(SB) - JAE not_loaduint32 - - LEAQ handleLoadUint32Fault(SB), CX - JMP handle_fault - -not_loaduint32: -original_handler: - // Jump to the previous signal handler, which is likely the golang one. - XORQ CX, CX - MOVQ ·savedSigBusHandler(SB), AX - CMPL DI, $SIGSEGV - CMOVQEQ ·savedSigSegVHandler(SB), AX - JMP AX - -handle_fault: - // Entered with the address of the fault handler in RCX; store it in - // RIP. - MOVQ CX, REG_IP(DX) - - // Store the faulting address in RAX. - MOVQ SI_ADDR(SI), CX - MOVQ CX, REG_RAX(DX) - - // Store the signal number in EDI. - MOVL DI, REG_RDI(DX) - - RET diff --git a/pkg/sentry/platform/safecopy/sighandler_arm64.s b/pkg/sentry/platform/safecopy/sighandler_arm64.s deleted file mode 100644 index 53e4ac2c1..000000000 --- a/pkg/sentry/platform/safecopy/sighandler_arm64.s +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "textflag.h" - -// The signals handled by sigHandler. -#define SIGBUS 7 -#define SIGSEGV 11 - -// Offsets to the registers in context->uc_mcontext.gregs[]. -#define REG_R0 0xB8 -#define REG_R1 0xC0 -#define REG_PC 0x1B8 - -// Offset to the si_addr field of siginfo. -#define SI_CODE 0x08 -#define SI_ADDR 0x10 - -// signalHandler is the signal handler for SIGSEGV and SIGBUS signals. It must -// not be set up as a handler to any other signals. -// -// If the instruction causing the signal is within a safecopy-protected -// function, the signal is handled such that execution resumes in the -// appropriate fault handling stub with R0 containing the faulting address and -// R1 containing the signal number. Otherwise control is transferred to the -// previously configured signal handler (savedSigSegvHandler or -// savedSigBusHandler). -// -// This function cannot be written in go because it runs whenever a signal is -// received by the thread (preempting whatever was running), which includes when -// garbage collector has stopped or isn't expecting any interactions (like -// barriers). -// -// The arguments are the following: -// R0 - The signal number. -// R1 - Pointer to siginfo_t structure. -// R2 - Pointer to ucontext structure. -TEXT ·signalHandler(SB),NOSPLIT,$0 - // Check if the signal is from the kernel, si_code > 0 means a kernel signal. - MOVD SI_CODE(R1), R7 - CMPW $0x0, R7 - BLE original_handler - - // Check if PC is within the area we care about. - MOVD REG_PC(R2), R7 - MOVD ·memcpyBegin(SB), R8 - CMP R8, R7 - BLO not_memcpy - MOVD ·memcpyEnd(SB), R8 - CMP R8, R7 - BHS not_memcpy - - // Modify the context such that execution will resume in the fault handler. - MOVD $handleMemcpyFault(SB), R7 - B handle_fault - -not_memcpy: - MOVD ·memclrBegin(SB), R8 - CMP R8, R7 - BLO not_memclr - MOVD ·memclrEnd(SB), R8 - CMP R8, R7 - BHS not_memclr - - MOVD $handleMemclrFault(SB), R7 - B handle_fault - -not_memclr: - MOVD ·swapUint32Begin(SB), R8 - CMP R8, R7 - BLO not_swapuint32 - MOVD ·swapUint32End(SB), R8 - CMP R8, R7 - BHS not_swapuint32 - - MOVD $handleSwapUint32Fault(SB), R7 - B handle_fault - -not_swapuint32: - MOVD ·swapUint64Begin(SB), R8 - CMP R8, R7 - BLO not_swapuint64 - MOVD ·swapUint64End(SB), R8 - CMP R8, R7 - BHS not_swapuint64 - - MOVD $handleSwapUint64Fault(SB), R7 - B handle_fault - -not_swapuint64: - MOVD ·compareAndSwapUint32Begin(SB), R8 - CMP R8, R7 - BLO not_casuint32 - MOVD ·compareAndSwapUint32End(SB), R8 - CMP R8, R7 - BHS not_casuint32 - - MOVD $handleCompareAndSwapUint32Fault(SB), R7 - B handle_fault - -not_casuint32: - MOVD ·loadUint32Begin(SB), R8 - CMP R8, R7 - BLO not_loaduint32 - MOVD ·loadUint32End(SB), R8 - CMP R8, R7 - BHS not_loaduint32 - - MOVD $handleLoadUint32Fault(SB), R7 - B handle_fault - -not_loaduint32: -original_handler: - // Jump to the previous signal handler, which is likely the golang one. - MOVD ·savedSigBusHandler(SB), R7 - MOVD ·savedSigSegVHandler(SB), R8 - CMPW $SIGSEGV, R0 - CSEL EQ, R8, R7, R7 - B (R7) - -handle_fault: - // Entered with the address of the fault handler in R7; store it in PC. - MOVD R7, REG_PC(R2) - - // Store the faulting address in R0. - MOVD SI_ADDR(R1), R7 - MOVD R7, REG_R0(R2) - - // Store the signal number in R1. - MOVW R0, REG_R1(R2) - - RET diff --git a/pkg/sentry/safemem/BUILD b/pkg/sentry/safemem/BUILD deleted file mode 100644 index 3ab76da97..000000000 --- a/pkg/sentry/safemem/BUILD +++ /dev/null @@ -1,27 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") - -package(licenses = ["notice"]) - -go_library( - name = "safemem", - srcs = [ - "block_unsafe.go", - "io.go", - "safemem.go", - "seq_unsafe.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/platform/safecopy", - ], -) - -go_test( - name = "safemem_test", - size = "small", - srcs = [ - "io_test.go", - "seq_test.go", - ], - library = ":safemem", -) diff --git a/pkg/sentry/safemem/block_unsafe.go b/pkg/sentry/safemem/block_unsafe.go deleted file mode 100644 index 6f03c94bf..000000000 --- a/pkg/sentry/safemem/block_unsafe.go +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "fmt" - "reflect" - "unsafe" - - "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" -) - -// A Block is a range of contiguous bytes, similar to []byte but with the -// following differences: -// -// - The memory represented by a Block may require the use of safecopy to -// access. -// -// - Block does not carry a capacity and cannot be expanded. -// -// Blocks are immutable and may be copied by value. The zero value of Block -// represents an empty range, analogous to a nil []byte. -type Block struct { - // [start, start+length) is the represented memory. - // - // start is an unsafe.Pointer to ensure that Block prevents the represented - // memory from being garbage-collected. - start unsafe.Pointer - length int - - // needSafecopy is true if accessing the represented memory requires the - // use of safecopy. - needSafecopy bool -} - -// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to -// access without safecopy. -func BlockFromSafeSlice(slice []byte) Block { - return blockFromSlice(slice, false) -} - -// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe to -// access without safecopy. -func BlockFromUnsafeSlice(slice []byte) Block { - return blockFromSlice(slice, true) -} - -func blockFromSlice(slice []byte, needSafecopy bool) Block { - if len(slice) == 0 { - return Block{} - } - return Block{ - start: unsafe.Pointer(&slice[0]), - length: len(slice), - needSafecopy: needSafecopy, - } -} - -// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+len), which is -// safe to access without safecopy. -// -// Preconditions: ptr+len does not overflow. -func BlockFromSafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, false) -} - -// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which -// is not safe to access without safecopy. -// -// Preconditions: ptr+len does not overflow. -func BlockFromUnsafePointer(ptr unsafe.Pointer, len int) Block { - return blockFromPointer(ptr, len, true) -} - -func blockFromPointer(ptr unsafe.Pointer, len int, needSafecopy bool) Block { - if uptr := uintptr(ptr); uptr+uintptr(len) < uptr { - panic(fmt.Sprintf("ptr %#x + len %#x overflows", ptr, len)) - } - return Block{ - start: ptr, - length: len, - needSafecopy: needSafecopy, - } -} - -// DropFirst returns a Block equivalent to b, but with the first n bytes -// omitted. It is analogous to the [n:] operation on a slice, except that if n -// > b.Len(), DropFirst returns an empty Block instead of panicking. -// -// Preconditions: n >= 0. -func (b Block) DropFirst(n int) Block { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return b.DropFirst64(uint64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes a uint64. -func (b Block) DropFirst64(n uint64) Block { - if n >= uint64(b.length) { - return Block{} - } - return Block{ - start: unsafe.Pointer(uintptr(b.start) + uintptr(n)), - length: b.length - int(n), - needSafecopy: b.needSafecopy, - } -} - -// TakeFirst returns a Block equivalent to the first n bytes of b. It is -// analogous to the [:n] operation on a slice, except that if n > b.Len(), -// TakeFirst returns a copy of b instead of panicking. -// -// Preconditions: n >= 0. -func (b Block) TakeFirst(n int) Block { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return b.TakeFirst64(uint64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes a uint64. -func (b Block) TakeFirst64(n uint64) Block { - if n == 0 { - return Block{} - } - if n >= uint64(b.length) { - return b - } - return Block{ - start: b.start, - length: int(n), - needSafecopy: b.needSafecopy, - } -} - -// ToSlice returns a []byte equivalent to b. -func (b Block) ToSlice() []byte { - var bs []byte - hdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) - hdr.Data = uintptr(b.start) - hdr.Len = b.length - hdr.Cap = b.length - return bs -} - -// Addr returns b's start address as a uintptr. It returns uintptr instead of -// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers -// without importing the unsafe package explicitly. -// -// Note that a uintptr is not recognized as a pointer by the garbage collector, -// such that if there are no uses of b after a call to b.Addr() and the address -// is to Go-managed memory, the returned uintptr does not prevent garbage -// collection of the pointee. -func (b Block) Addr() uintptr { - return uintptr(b.start) -} - -// Len returns b's length in bytes. -func (b Block) Len() int { - return b.length -} - -// NeedSafecopy returns true if accessing b.ToSlice() requires the use of safecopy. -func (b Block) NeedSafecopy() bool { - return b.needSafecopy -} - -// String implements fmt.Stringer.String. -func (b Block) String() string { - if uintptr(b.start) == 0 && b.length == 0 { - return "" - } - var suffix string - if b.needSafecopy { - suffix = "*" - } - return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix) -} - -// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src -// to dst and returns the number of bytes copied. -// -// If src and dst overlap, the data stored in dst is unspecified. -func Copy(dst, src Block) (int, error) { - if !dst.needSafecopy && !src.needSafecopy { - return copy(dst.ToSlice(), src.ToSlice()), nil - } - - n := dst.length - if n > src.length { - n = src.length - } - if n == 0 { - return 0, nil - } - - switch { - case dst.needSafecopy && !src.needSafecopy: - return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice()) - case !dst.needSafecopy && src.needSafecopy: - return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start) - case dst.needSafecopy && src.needSafecopy: - n64, err := safecopy.Copy(dst.start, src.start, uintptr(n)) - return int(n64), err - default: - panic("unreachable") - } -} - -// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed. -func Zero(dst Block) (int, error) { - if !dst.needSafecopy { - bs := dst.ToSlice() - for i := range bs { - bs[i] = 0 - } - return len(bs), nil - } - - n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length)) - return int(n64), err -} - -// Safecopy atomics are no slower than non-safecopy atomics, so use the former -// even when !b.needSafecopy to get consistent alignment checking. - -// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b. -// -// Preconditions: b.Len() >= 4. -func SwapUint32(b Block, new uint32) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.SwapUint32(b.start, new) -} - -// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b. -// -// Preconditions: b.Len() >= 8. -func SwapUint64(b Block, new uint64) (uint64, error) { - if b.length < 8 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.SwapUint64(b.start, new) -} - -// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4 -// bytes of b. -// -// Preconditions: b.Len() >= 4. -func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.CompareAndSwapUint32(b.start, old, new) -} - -// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b. -// -// Preconditions: b.Len() >= 4. -func LoadUint32(b Block) (uint32, error) { - if b.length < 4 { - panic(fmt.Sprintf("insufficient length: %d", b.length)) - } - return safecopy.LoadUint32(b.start) -} diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go deleted file mode 100644 index f039a5c34..000000000 --- a/pkg/sentry/safemem/io.go +++ /dev/null @@ -1,392 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "errors" - "io" - "math" -) - -// ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write -// beyond the end of the BlockSeq. -var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") - -// Reader represents a streaming byte source like io.Reader. -type Reader interface { - // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the - // number of bytes read. It may return a partial read without an error - // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a - // full read with an error (i.e. (dsts.NumBytes(), err) where err != nil); - // note that this differs from io.Reader.Read (in particular, io.EOF should - // not be returned if ReadToBlocks successfully reads dsts.NumBytes() - // bytes.) - ReadToBlocks(dsts BlockSeq) (uint64, error) -} - -// Writer represents a streaming byte sink like io.Writer. -type Writer interface { - // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns - // the number of bytes written. It may return a partial write without an - // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not - // return a full write with an error (i.e. srcs.NumBytes(), err) where err - // != nil). - WriteFromBlocks(srcs BlockSeq) (uint64, error) -} - -// ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() -// bytes have been read or ReadToBlocks returns an error. -func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() { - n, err := r.ReadToBlocks(dsts) - done += n - if err != nil { - return done, err - } - dsts = dsts.DropFirst64(n) - } - return done, nil -} - -// WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until -// srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. -func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { - var done uint64 - for !srcs.IsEmpty() { - n, err := w.WriteFromBlocks(srcs) - done += n - if err != nil { - return done, err - } - srcs = srcs.DropFirst64(n) - } - return done, nil -} - -// BlockSeqReader implements Reader by reading from a BlockSeq. -type BlockSeqReader struct { - Blocks BlockSeq -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { - n, err := CopySeq(dsts, r.Blocks) - r.Blocks = r.Blocks.DropFirst64(n) - if err != nil { - return n, err - } - if n < dsts.NumBytes() { - return n, io.EOF - } - return n, nil -} - -// BlockSeqWriter implements Writer by writing to a BlockSeq. -type BlockSeqWriter struct { - Blocks BlockSeq -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - n, err := CopySeq(w.Blocks, srcs) - w.Blocks = w.Blocks.DropFirst64(n) - if err != nil { - return n, err - } - if n < srcs.NumBytes() { - return n, ErrEndOfBlockSeq - } - return n, nil -} - -// ReaderFunc implements Reader for a function with the semantics of -// Reader.ReadToBlocks. -type ReaderFunc func(dsts BlockSeq) (uint64, error) - -// ReadToBlocks implements Reader.ReadToBlocks. -func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { - return f(dsts) -} - -// WriterFunc implements Writer for a function with the semantics of -// Writer.WriteFromBlocks. -type WriterFunc func(srcs BlockSeq) (uint64, error) - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - return f(srcs) -} - -// ToIOReader implements io.Reader for a (safemem.)Reader. -// -// ToIOReader will return a successful partial read iff Reader.ReadToBlocks does -// so. -type ToIOReader struct { - Reader Reader -} - -// Read implements io.Reader.Read. -func (r ToIOReader) Read(dst []byte) (int, error) { - n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) - return int(n), err -} - -// ToIOWriter implements io.Writer for a (safemem.)Writer. -type ToIOWriter struct { - Writer Writer -} - -// Write implements io.Writer.Write. -func (w ToIOWriter) Write(src []byte) (int, error) { - // io.Writer does not permit partial writes. - n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) - return int(n), err -} - -// FromIOReader implements Reader for an io.Reader by repeatedly invoking -// io.Reader.Read until it returns an error or partial read. This is not -// thread-safe. -// -// FromIOReader will return a successful partial read iff Reader.Read does so. -type FromIOReader struct { - Reader io.Reader -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !dsts.IsEmpty() { - dst := dsts.Head() - var n int - var err error - n, buf, err = r.readToBlock(dst, buf) - done += uint64(n) - if n != dst.Len() { - return done, err - } - dsts = dsts.Tail() - if err != nil { - if dsts.IsEmpty() && err == io.EOF { - return done, nil - } - return done, err - } - } - return done, nil -} - -func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { - // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !dst.NeedSafecopy() { - n, err := r.Reader.Read(dst.ToSlice()) - return n, buf, err - } - if len(buf) < dst.Len() { - buf = make([]byte, dst.Len()) - } - rn, rerr := r.Reader.Read(buf[:dst.Len()]) - wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) - if wberr != nil { - return wbn, buf, wberr - } - return wbn, buf, rerr -} - -// FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly -// invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial -// read indicates an error. This is not thread-safe. -type FromIOReaderAt struct { - ReaderAt io.ReaderAt - Offset int64 -} - -// ReadToBlocks implements Reader.ReadToBlocks. -func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !dsts.IsEmpty() { - dst := dsts.Head() - var n int - var err error - n, buf, err = r.readToBlock(dst, buf) - done += uint64(n) - if n != dst.Len() { - return done, err - } - dsts = dsts.Tail() - if err != nil { - if dsts.IsEmpty() && err == io.EOF { - return done, nil - } - return done, err - } - } - return done, nil -} - -func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { - // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !dst.NeedSafecopy() { - n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) - r.Offset += int64(n) - return n, buf, err - } - if len(buf) < dst.Len() { - buf = make([]byte, dst.Len()) - } - rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) - r.Offset += int64(rn) - wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) - if wberr != nil { - return wbn, buf, wberr - } - return wbn, buf, rerr -} - -// FromIOWriter implements Writer for an io.Writer by repeatedly invoking -// io.Writer.Write until it returns an error or partial write. -// -// FromIOWriter will tolerate implementations of io.Writer.Write that return -// partial writes with a nil error in contravention of io.Writer's -// requirements, since Writer is permitted to do so. FromIOWriter will return a -// successful partial write iff Writer.Write does so. -type FromIOWriter struct { - Writer io.Writer -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - var buf []byte - var done uint64 - for !srcs.IsEmpty() { - src := srcs.Head() - var n int - var err error - n, buf, err = w.writeFromBlock(src, buf) - done += uint64(n) - if n != src.Len() || err != nil { - return done, err - } - srcs = srcs.Tail() - } - return done, nil -} - -func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { - // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require - // safecopy. - if !src.NeedSafecopy() { - n, err := w.Writer.Write(src.ToSlice()) - return n, buf, err - } - if len(buf) < src.Len() { - buf = make([]byte, src.Len()) - } - bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) - wn, werr := w.Writer.Write(buf[:bufn]) - if werr != nil { - return wn, buf, werr - } - return wn, buf, buferr -} - -// FromVecReaderFunc implements Reader for a function that reads data into a -// [][]byte and returns the number of bytes read as an int64. -type FromVecReaderFunc struct { - ReadVec func(dsts [][]byte) (int64, error) -} - -// ReadToBlocks implements Reader.ReadToBlocks. -// -// ReadToBlocks calls r.ReadVec at most once. -func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { - if dsts.IsEmpty() { - return 0, nil - } - // Ensure that we don't pass a [][]byte with a total length > MaxInt64. - dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) - dstSlices := make([][]byte, 0, dsts.NumBlocks()) - // Buffer Blocks that require safecopy. - for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { - dst := tmp.Head() - if dst.NeedSafecopy() { - dstSlices = append(dstSlices, make([]byte, dst.Len())) - } else { - dstSlices = append(dstSlices, dst.ToSlice()) - } - } - rn, rerr := r.ReadVec(dstSlices) - dsts = dsts.TakeFirst64(uint64(rn)) - var done uint64 - var i int - for !dsts.IsEmpty() { - dst := dsts.Head() - if dst.NeedSafecopy() { - n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) - done += uint64(n) - if err != nil { - return done, err - } - } else { - done += uint64(dst.Len()) - } - dsts = dsts.Tail() - i++ - } - return done, rerr -} - -// FromVecWriterFunc implements Writer for a function that writes data from a -// [][]byte and returns the number of bytes written. -type FromVecWriterFunc struct { - WriteVec func(srcs [][]byte) (int64, error) -} - -// WriteFromBlocks implements Writer.WriteFromBlocks. -// -// WriteFromBlocks calls w.WriteVec at most once. -func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { - if srcs.IsEmpty() { - return 0, nil - } - // Ensure that we don't pass a [][]byte with a total length > MaxInt64. - srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) - srcSlices := make([][]byte, 0, srcs.NumBlocks()) - // Buffer Blocks that require safecopy. - var buferr error - for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { - src := tmp.Head() - if src.NeedSafecopy() { - slice := make([]byte, src.Len()) - n, err := Copy(BlockFromSafeSlice(slice), src) - srcSlices = append(srcSlices, slice[:n]) - if err != nil { - buferr = err - break - } - } else { - srcSlices = append(srcSlices, src.ToSlice()) - } - } - n, err := w.WriteVec(srcSlices) - if err != nil { - return uint64(n), err - } - return uint64(n), buferr -} diff --git a/pkg/sentry/safemem/io_test.go b/pkg/sentry/safemem/io_test.go deleted file mode 100644 index 629741bee..000000000 --- a/pkg/sentry/safemem/io_test.go +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "bytes" - "io" - "testing" -) - -func makeBlocks(slices ...[]byte) []Block { - blocks := make([]Block, 0, len(slices)) - for _, s := range slices { - blocks = append(blocks, BlockFromSafeSlice(s)) - } - return blocks -} - -func TestFromIOReaderFullRead(t *testing.T) { - r := FromIOReader{bytes.NewBufferString("foobar")} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -type eofHidingReader struct { - Reader io.Reader -} - -func (r eofHidingReader) Read(dst []byte) (int, error) { - n, err := r.Reader.Read(dst) - if err == io.EOF { - return n, nil - } - return n, err -} - -func TestFromIOReaderPartialRead(t *testing.T) { - r := FromIOReader{eofHidingReader{bytes.NewBufferString("foob")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - // FromIOReader should stop after the eofHidingReader returns (1, nil) - // for a 3-byte read. - if wantN := uint64(4); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("b\x00\x00")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -type singleByteReader struct { - Reader io.Reader -} - -func (r singleByteReader) Read(dst []byte) (int, error) { - if len(dst) == 0 { - return r.Reader.Read(dst) - } - return r.Reader.Read(dst[:1]) -} - -func TestSingleByteReader(t *testing.T) { - r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := r.ReadToBlocks(BlockSeqFromSlice(dsts)) - // FromIOReader should stop after the singleByteReader returns (1, nil) - // for a 3-byte read. - if wantN := uint64(1); n != wantN || err != nil { - t.Errorf("ReadToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("f\x00\x00"), []byte("\x00\x00\x00")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -func TestReadFullToBlocks(t *testing.T) { - r := FromIOReader{singleByteReader{bytes.NewBufferString("foobar")}} - dsts := makeBlocks(make([]byte, 3), make([]byte, 3)) - n, err := ReadFullToBlocks(r, BlockSeqFromSlice(dsts)) - // ReadFullToBlocks should call into FromIOReader => singleByteReader - // repeatedly until dsts is exhausted. - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("ReadFullToBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - for i, want := range [][]byte{[]byte("foo"), []byte("bar")} { - if got := dsts[i].ToSlice(); !bytes.Equal(got, want) { - t.Errorf("dsts[%d]: got %q, wanted %q", i, got, want) - } - } -} - -func TestFromIOWriterFullWrite(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{&dst} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -type limitedWriter struct { - Writer io.Writer - Done int - Limit int -} - -func (w *limitedWriter) Write(src []byte) (int, error) { - count := len(src) - if count > (w.Limit - w.Done) { - count = w.Limit - w.Done - } - n, err := w.Writer.Write(src[:count]) - w.Done += n - return n, err -} - -func TestFromIOWriterPartialWrite(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{&limitedWriter{&dst, 0, 4}} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - // FromIOWriter should stop after the limitedWriter returns (1, nil) for a - // 3-byte write. - if wantN := uint64(4); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -type singleByteWriter struct { - Writer io.Writer -} - -func (w singleByteWriter) Write(src []byte) (int, error) { - if len(src) == 0 { - return w.Writer.Write(src) - } - return w.Writer.Write(src[:1]) -} - -func TestSingleByteWriter(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{singleByteWriter{&dst}} - n, err := w.WriteFromBlocks(BlockSeqFromSlice(srcs)) - // FromIOWriter should stop after the singleByteWriter returns (1, nil) - // for a 3-byte write. - if wantN := uint64(1); n != wantN || err != nil { - t.Errorf("WriteFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("f"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestWriteFullToBlocks(t *testing.T) { - srcs := makeBlocks([]byte("foo"), []byte("bar")) - var dst bytes.Buffer - w := FromIOWriter{singleByteWriter{&dst}} - n, err := WriteFullFromBlocks(w, BlockSeqFromSlice(srcs)) - // WriteFullToBlocks should call into FromIOWriter => singleByteWriter - // repeatedly until srcs is exhausted. - if wantN := uint64(6); n != wantN || err != nil { - t.Errorf("WriteFullFromBlocks: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("foobar"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} diff --git a/pkg/sentry/safemem/safemem.go b/pkg/sentry/safemem/safemem.go deleted file mode 100644 index 3e70d33a2..000000000 --- a/pkg/sentry/safemem/safemem.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package safemem provides the Block and BlockSeq types. -package safemem diff --git a/pkg/sentry/safemem/seq_test.go b/pkg/sentry/safemem/seq_test.go deleted file mode 100644 index eba4bb535..000000000 --- a/pkg/sentry/safemem/seq_test.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "bytes" - "reflect" - "testing" -) - -type blockSeqTest struct { - desc string - - pieces []string - haveOffset bool - offset uint64 - haveLimit bool - limit uint64 - - want string -} - -func (t blockSeqTest) NonEmptyByteSlices() [][]byte { - // t is a value, so we can mutate it freely. - slices := make([][]byte, 0, len(t.pieces)) - for _, str := range t.pieces { - if t.haveOffset { - strOff := t.offset - if strOff > uint64(len(str)) { - strOff = uint64(len(str)) - } - str = str[strOff:] - t.offset -= strOff - } - if t.haveLimit { - strLim := t.limit - if strLim > uint64(len(str)) { - strLim = uint64(len(str)) - } - str = str[:strLim] - t.limit -= strLim - } - if len(str) != 0 { - slices = append(slices, []byte(str)) - } - } - return slices -} - -func (t blockSeqTest) BlockSeq() BlockSeq { - blocks := make([]Block, 0, len(t.pieces)) - for _, str := range t.pieces { - blocks = append(blocks, BlockFromSafeSlice([]byte(str))) - } - bs := BlockSeqFromSlice(blocks) - if t.haveOffset { - bs = bs.DropFirst64(t.offset) - } - if t.haveLimit { - bs = bs.TakeFirst64(t.limit) - } - return bs -} - -var blockSeqTests = []blockSeqTest{ - { - desc: "Empty sequence", - }, - { - desc: "Sequence of length 1", - pieces: []string{"foobar"}, - want: "foobar", - }, - { - desc: "Sequence of length 2", - pieces: []string{"foo", "bar"}, - want: "foobar", - }, - { - desc: "Empty Blocks", - pieces: []string{"", "foo", "", "", "bar", ""}, - want: "foobar", - }, - { - desc: "Sequence with non-zero offset", - pieces: []string{"foo", "bar"}, - haveOffset: true, - offset: 2, - want: "obar", - }, - { - desc: "Sequence with non-maximal limit", - pieces: []string{"foo", "bar"}, - haveLimit: true, - limit: 5, - want: "fooba", - }, - { - desc: "Sequence with offset and limit", - pieces: []string{"foo", "bar"}, - haveOffset: true, - offset: 2, - haveLimit: true, - limit: 3, - want: "oba", - }, -} - -func TestBlockSeqNumBytes(t *testing.T) { - for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - if got, want := test.BlockSeq().NumBytes(), uint64(len(test.want)); got != want { - t.Errorf("NumBytes: got %d, wanted %d", got, want) - } - }) - } -} - -func TestBlockSeqIterBlocks(t *testing.T) { - // Tests BlockSeq iteration using Head/Tail. - for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - srcs := test.BlockSeq() - // "Note that a non-nil empty slice and a nil slice ... are not - // deeply equal." - reflect - slices := make([][]byte, 0, 0) - for !srcs.IsEmpty() { - src := srcs.Head() - slices = append(slices, src.ToSlice()) - nextSrcs := srcs.Tail() - if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-uint64(src.Len()); got != want { - t.Fatalf("%v.Tail(): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) - } - srcs = nextSrcs - } - if wantSlices := test.NonEmptyByteSlices(); !reflect.DeepEqual(slices, wantSlices) { - t.Errorf("Accumulated slices: got %v, wanted %v", slices, wantSlices) - } - }) - } -} - -func TestBlockSeqIterBytes(t *testing.T) { - // Tests BlockSeq iteration using Head/DropFirst. - for _, test := range blockSeqTests { - t.Run(test.desc, func(t *testing.T) { - srcs := test.BlockSeq() - var dst bytes.Buffer - for !srcs.IsEmpty() { - src := srcs.Head() - var b [1]byte - n, err := Copy(BlockFromSafeSlice(b[:]), src) - if n != 1 || err != nil { - t.Fatalf("Copy: got (%v, %v), wanted (1, nil)", n, err) - } - dst.WriteByte(b[0]) - nextSrcs := srcs.DropFirst(1) - if got, want := nextSrcs.NumBytes(), srcs.NumBytes()-1; got != want { - t.Fatalf("%v.DropFirst(1): got %v (%d bytes), wanted %d bytes", srcs, nextSrcs, got, want) - } - srcs = nextSrcs - } - if got := string(dst.Bytes()); got != test.want { - t.Errorf("Copied string: got %q, wanted %q", got, test.want) - } - }) - } -} - -func TestBlockSeqDropBeyondLimit(t *testing.T) { - blocks := []Block{BlockFromSafeSlice([]byte("123")), BlockFromSafeSlice([]byte("4"))} - bs := BlockSeqFromSlice(blocks) - if got, want := bs.NumBytes(), uint64(4); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } - bs = bs.TakeFirst(1) - if got, want := bs.NumBytes(), uint64(1); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } - bs = bs.DropFirst(2) - if got, want := bs.NumBytes(), uint64(0); got != want { - t.Errorf("%v.NumBytes(): got %d, wanted %d", bs, got, want) - } -} diff --git a/pkg/sentry/safemem/seq_unsafe.go b/pkg/sentry/safemem/seq_unsafe.go deleted file mode 100644 index 354a95dde..000000000 --- a/pkg/sentry/safemem/seq_unsafe.go +++ /dev/null @@ -1,299 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package safemem - -import ( - "bytes" - "fmt" - "reflect" - "unsafe" -) - -// A BlockSeq represents a sequence of Blocks, each of which has non-zero -// length. -// -// BlockSeqs are immutable and may be copied by value. The zero value of -// BlockSeq represents an empty sequence. -type BlockSeq struct { - // If length is 0, then the BlockSeq is empty. Invariants: data == 0; - // offset == 0; limit == 0. - // - // If length is -1, then the BlockSeq represents the single Block{data, - // limit, false}. Invariants: offset == 0; limit > 0; limit does not - // overflow the range of an int. - // - // If length is -2, then the BlockSeq represents the single Block{data, - // limit, true}. Invariants: offset == 0; limit > 0; limit does not - // overflow the range of an int. - // - // Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks - // in the array of Blocks starting at address `data`, starting at `offset` - // bytes into the first Block and limited to the following `limit` bytes. - // Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <= - // the combined length of all Blocks in the array; the first Block in the - // array has non-zero length. - // - // length is never 1; sequences consisting of a single Block are always - // stored inline (with length < 0). - data unsafe.Pointer - length int - offset int - limit uint64 -} - -// BlockSeqOf returns a BlockSeq representing the single Block b. -func BlockSeqOf(b Block) BlockSeq { - bs := BlockSeq{ - data: b.start, - length: -1, - limit: uint64(b.length), - } - if b.needSafecopy { - bs.length = -2 - } - return bs -} - -// BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. -// If slice contains Blocks with zero length, BlockSeq will skip them during -// iteration. -// -// Whether the returned BlockSeq shares memory with slice is unspecified; -// clients should avoid mutating slices passed to BlockSeqFromSlice. -// -// Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. -func BlockSeqFromSlice(slice []Block) BlockSeq { - slice = skipEmpty(slice) - var limit uint64 - for _, b := range slice { - sum := limit + uint64(b.Len()) - if sum < limit { - panic("BlockSeq length overflows uint64") - } - limit = sum - } - return blockSeqFromSliceLimited(slice, limit) -} - -// Preconditions: The combined length of all Blocks in slice <= limit. If -// len(slice) != 0, the first Block in slice has non-zero length, and limit > -// 0. -func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { - switch len(slice) { - case 0: - return BlockSeq{} - case 1: - return BlockSeqOf(slice[0].TakeFirst64(limit)) - default: - return BlockSeq{ - data: unsafe.Pointer(&slice[0]), - length: len(slice), - limit: limit, - } - } -} - -func skipEmpty(slice []Block) []Block { - for i, b := range slice { - if b.Len() != 0 { - return slice[i:] - } - } - return nil -} - -// IsEmpty returns true if bs contains no Blocks. -// -// Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). -// (Of these, prefer to use bs.IsEmpty().) -func (bs BlockSeq) IsEmpty() bool { - return bs.length == 0 -} - -// NumBlocks returns the number of Blocks in bs. -func (bs BlockSeq) NumBlocks() int { - // In general, we have to count: if bs represents a windowed slice then the - // slice may contain Blocks with zero length, and bs.length may be larger - // than the actual number of Blocks due to bs.limit. - var n int - for !bs.IsEmpty() { - n++ - bs = bs.Tail() - } - return n -} - -// NumBytes returns the sum of Block.Len() for all Blocks in bs. -func (bs BlockSeq) NumBytes() uint64 { - return bs.limit -} - -// Head returns the first Block in bs. -// -// Preconditions: !bs.IsEmpty(). -func (bs BlockSeq) Head() Block { - if bs.length == 0 { - panic("empty BlockSeq") - } - if bs.length < 0 { - return bs.internalBlock() - } - return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) -} - -// Preconditions: bs.length < 0. -func (bs BlockSeq) internalBlock() Block { - return Block{ - start: bs.data, - length: int(bs.limit), - needSafecopy: bs.length == -2, - } -} - -// Tail returns a BlockSeq consisting of all Blocks in bs after the first. -// -// Preconditions: !bs.IsEmpty(). -func (bs BlockSeq) Tail() BlockSeq { - if bs.length == 0 { - panic("empty BlockSeq") - } - if bs.length < 0 { - return BlockSeq{} - } - head := (*Block)(bs.data).DropFirst(bs.offset) - headLen := uint64(head.Len()) - if headLen >= bs.limit { - // The head Block exhausts the limit, so the tail is empty. - return BlockSeq{} - } - var extSlice []Block - extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) - extSliceHdr.Data = uintptr(bs.data) - extSliceHdr.Len = bs.length - extSliceHdr.Cap = bs.length - tailSlice := skipEmpty(extSlice[1:]) - tailLimit := bs.limit - headLen - return blockSeqFromSliceLimited(tailSlice, tailLimit) -} - -// DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes -// omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. -// -// Preconditions: n >= 0. -func (bs BlockSeq) DropFirst(n int) BlockSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return bs.DropFirst64(uint64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes an uint64. -func (bs BlockSeq) DropFirst64(n uint64) BlockSeq { - if n >= bs.limit { - return BlockSeq{} - } - for { - // Calling bs.Head() here is surprisingly expensive, so inline getting - // the head's length. - var headLen uint64 - if bs.length < 0 { - headLen = bs.limit - } else { - headLen = uint64((*Block)(bs.data).Len() - bs.offset) - } - if n < headLen { - // Dropping ends partway through the head Block. - if bs.length < 0 { - return BlockSeqOf(bs.internalBlock().DropFirst64(n)) - } - bs.offset += int(n) - bs.limit -= n - return bs - } - n -= headLen - bs = bs.Tail() - } -} - -// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n > -// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs. -// -// Preconditions: n >= 0. -func (bs BlockSeq) TakeFirst(n int) BlockSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return bs.TakeFirst64(uint64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes a uint64. -func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq { - if n == 0 { - return BlockSeq{} - } - if bs.limit > n { - bs.limit = n - } - return bs -} - -// String implements fmt.Stringer.String. -func (bs BlockSeq) String() string { - var buf bytes.Buffer - buf.WriteByte('[') - var sep string - for !bs.IsEmpty() { - buf.WriteString(sep) - sep = " " - buf.WriteString(bs.Head().String()) - bs = bs.Tail() - } - buf.WriteByte(']') - return buf.String() -} - -// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less, -// from srcs to dsts and returns the number of bytes copied. -// -// If srcs and dsts overlap, the data stored in dsts is unspecified. -func CopySeq(dsts, srcs BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() && !srcs.IsEmpty() { - dst := dsts.Head() - src := srcs.Head() - n, err := Copy(dst, src) - done += uint64(n) - if err != nil { - return done, err - } - dsts = dsts.DropFirst(n) - srcs = srcs.DropFirst(n) - } - return done, nil -} - -// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed. -func ZeroSeq(dsts BlockSeq) (uint64, error) { - var done uint64 - for !dsts.IsEmpty() { - n, err := Zero(dsts.Head()) - done += uint64(n) - if err != nil { - return done, err - } - dsts = dsts.DropFirst(n) - } - return done, nil -} diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD index 8e2b97afb..611fa22c3 100644 --- a/pkg/sentry/socket/BUILD +++ b/pkg/sentry/socket/BUILD @@ -9,15 +9,15 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 3850f6345..79e16d6e8 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -12,13 +12,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/fs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 1684dfc24..00265f15b 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -19,14 +19,14 @@ package control import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const maxInt = int(^uint(0) >> 1) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 42bf7be6a..5a07d5d0e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -16,23 +16,23 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/fdnotifier", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", + "//pkg/usermem", "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c957b0f1d..bde4c7a1e 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -21,19 +21,19 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index e69ec38c2..cd67234d2 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -19,14 +19,14 @@ import ( "unsafe" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func firstBytePtr(bs []byte) unsafe.Pointer { diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index e67b46c9e..034eca676 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -25,13 +25,13 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/inet" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" ) var defaultRecvBufSize = inet.TCPBufferSize{ diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD index ed34a8308..fa2a2cb66 100644 --- a/pkg/sentry/socket/netfilter/BUILD +++ b/pkg/sentry/socket/netfilter/BUILD @@ -15,10 +15,10 @@ go_library( "//pkg/binary", "//pkg/log", "//pkg/sentry/kernel", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/tcpip", "//pkg/tcpip/iptables", "//pkg/tcpip/stack", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go index c65c36081..6ef740463 100644 --- a/pkg/sentry/socket/netfilter/netfilter.go +++ b/pkg/sentry/socket/netfilter/netfilter.go @@ -23,11 +23,11 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/iptables" "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/usermem" ) // errorTargetName is used to mark targets as error targets. Error targets diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index baaac13c6..f8b8e467d 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -13,8 +13,8 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -25,11 +25,11 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/netlink/message.go b/pkg/sentry/socket/netlink/message.go index ce0a1afd0..b21e0ca4b 100644 --- a/pkg/sentry/socket/netlink/message.go +++ b/pkg/sentry/socket/netlink/message.go @@ -20,7 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // alignUp rounds a length up to an alignment. diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index be005df24..07f860a49 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -18,7 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" diff --git a/pkg/sentry/socket/netlink/route/BUILD b/pkg/sentry/socket/netlink/route/BUILD index 2137c7aeb..0234aadde 100644 --- a/pkg/sentry/socket/netlink/route/BUILD +++ b/pkg/sentry/socket/netlink/route/BUILD @@ -8,7 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go index 6b4a0ecf4..80a15d6cb 100644 --- a/pkg/sentry/socket/netlink/route/protocol.go +++ b/pkg/sentry/socket/netlink/route/protocol.go @@ -19,7 +19,7 @@ import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index cea56f4ed..c4b95debb 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" @@ -32,11 +32,11 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/netlink/uevent/BUILD b/pkg/sentry/socket/netlink/uevent/BUILD index 73fbdf1eb..b6434923c 100644 --- a/pkg/sentry/socket/netlink/uevent/BUILD +++ b/pkg/sentry/socket/netlink/uevent/BUILD @@ -8,7 +8,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/kernel", "//pkg/sentry/socket/netlink", "//pkg/syserr", diff --git a/pkg/sentry/socket/netlink/uevent/protocol.go b/pkg/sentry/socket/netlink/uevent/protocol.go index b5d7808d7..1ee4296bc 100644 --- a/pkg/sentry/socket/netlink/uevent/protocol.go +++ b/pkg/sentry/socket/netlink/uevent/protocol.go @@ -20,7 +20,7 @@ package uevent import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index e3d1f90cb..ab01cb4fa 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -17,10 +17,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/binary", + "//pkg/context", "//pkg/log", "//pkg/metric", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", @@ -28,11 +29,9 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", @@ -45,6 +44,7 @@ go_library( "//pkg/tcpip/stack", "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 318acbeff..8619cc506 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -34,20 +34,19 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/safemem" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" @@ -57,6 +56,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index 2d2c1ba2a..5afff2564 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -18,7 +18,7 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index 2389a9cdb..50d9744e6 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -24,16 +24,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" ) // ControlMessages represents the union of unix control messages and tcpip diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index bade18686..08743deba 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -12,23 +12,23 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", - "//pkg/sentry/usermem", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go index 2447f24ef..129949990 100644 --- a/pkg/sentry/socket/unix/io.go +++ b/pkg/sentry/socket/unix/io.go @@ -15,8 +15,8 @@ package unix import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/tcpip" ) diff --git a/pkg/sentry/socket/unix/transport/BUILD b/pkg/sentry/socket/unix/transport/BUILD index 4bdfc9208..74bcd6300 100644 --- a/pkg/sentry/socket/unix/transport/BUILD +++ b/pkg/sentry/socket/unix/transport/BUILD @@ -28,9 +28,9 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/ilist", "//pkg/refs", - "//pkg/sentry/context", "//pkg/sync", "//pkg/syserr", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index 9e6fbc111..ce5b94ee7 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -16,7 +16,7 @@ package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 0322dec0b..4b06d63ac 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -16,7 +16,7 @@ package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/waiter" diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index fcc0da332..dcbafe0e5 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 7f49ba864..4d30aa714 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -22,9 +22,9 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -33,10 +33,10 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/strace/BUILD b/pkg/sentry/strace/BUILD index ff6fafa63..762a946fe 100644 --- a/pkg/sentry/strace/BUILD +++ b/pkg/sentry/strace/BUILD @@ -34,7 +34,7 @@ go_library( "//pkg/sentry/socket/netlink", "//pkg/sentry/socket/netstack", "//pkg/sentry/syscalls/linux", - "//pkg/sentry/usermem", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/strace/poll.go b/pkg/sentry/strace/poll.go index 5187594a7..074e80f9b 100644 --- a/pkg/sentry/strace/poll.go +++ b/pkg/sentry/strace/poll.go @@ -22,7 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // PollEventSet is the set of poll(2) event flags. diff --git a/pkg/sentry/strace/select.go b/pkg/sentry/strace/select.go index c77d418e6..3a4c32aa0 100644 --- a/pkg/sentry/strace/select.go +++ b/pkg/sentry/strace/select.go @@ -19,7 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) func fdsFromSet(t *kernel.Task, set []byte) []int { diff --git a/pkg/sentry/strace/signal.go b/pkg/sentry/strace/signal.go index 5656d53eb..c41f36e3f 100644 --- a/pkg/sentry/strace/signal.go +++ b/pkg/sentry/strace/signal.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // signalNames contains the names of all named signals. diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go index b6d7177f4..d2079c85f 100644 --- a/pkg/sentry/strace/socket.go +++ b/pkg/sentry/strace/socket.go @@ -26,7 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // SocketFamily are the possible socket(2) families. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 629c1f308..3fc4a47fc 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -33,7 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" pb "gvisor.dev/gvisor/pkg/sentry/strace/strace_go_proto" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // DefaultLogMaximumSize is the default LogMaximumSize. diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index 7d74e0f70..8d6c52850 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -63,11 +63,12 @@ go_library( "//pkg/abi/linux", "//pkg/binary", "//pkg/bpf", + "//pkg/context", "//pkg/log", "//pkg/metric", "//pkg/rand", + "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/lock", @@ -87,16 +88,15 @@ go_library( "//pkg/sentry/loader", "//pkg/sentry/memmap", "//pkg/sentry/mm", - "//pkg/sentry/safemem", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) diff --git a/pkg/sentry/syscalls/linux/linux64_amd64.go b/pkg/sentry/syscalls/linux/linux64_amd64.go index c76771a54..7435b50bf 100644 --- a/pkg/sentry/syscalls/linux/linux64_amd64.go +++ b/pkg/sentry/syscalls/linux/linux64_amd64.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // AMD64 is a table of Linux amd64 syscall API with the corresponding syscall diff --git a/pkg/sentry/syscalls/linux/linux64_arm64.go b/pkg/sentry/syscalls/linux/linux64_arm64.go index d3587fda6..03a39fe65 100644 --- a/pkg/sentry/syscalls/linux/linux64_arm64.go +++ b/pkg/sentry/syscalls/linux/linux64_arm64.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // ARM64 is a table of Linux arm64 syscall API with the corresponding syscall diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 333013d8c..2ddb2b146 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -17,8 +17,8 @@ package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index f56411bfe..b401978db 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // I/O commands. diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 65b4a227b..5f11b496c 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/syscalls" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 9bc2445a5..c54735148 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -18,8 +18,8 @@ import ( "syscall" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" @@ -28,8 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // fileOpAt performs an operation on the second last component in the path. diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index bde17a767..b68261f72 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // futexWaitRestartBlock encapsulates the state required to restart futex(2) diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index 912cbe4ff..f66f4ffde 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Getdents implements linux syscall getdents(2) for 64bit systems. diff --git a/pkg/sentry/syscalls/linux/sys_mempolicy.go b/pkg/sentry/syscalls/linux/sys_mempolicy.go index f5a519d8a..ac934dc6f 100644 --- a/pkg/sentry/syscalls/linux/sys_mempolicy.go +++ b/pkg/sentry/syscalls/linux/sys_mempolicy.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // We unconditionally report a single NUMA node. This also means that our diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 58a05b5bb..9959f6e61 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Brk implements linux syscall brk(2). diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index 8c13e2d82..eb5ff48f5 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Mount implements Linux syscall mount(2). diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 418d7fa5f..798344042 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // pipe2 implements the actual system call with flags. diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 2b2df989a..4f8762d7d 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_random.go b/pkg/sentry/syscalls/linux/sys_random.go index bc4c588bf..c0aa0fd60 100644 --- a/pkg/sentry/syscalls/linux/sys_random.go +++ b/pkg/sentry/syscalls/linux/sys_random.go @@ -19,11 +19,11 @@ import ( "math" "gvisor.dev/gvisor/pkg/rand" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index cd31e0649..f9f594190 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_rlimit.go b/pkg/sentry/syscalls/linux/sys_rlimit.go index 51e3f836b..e08c333d6 100644 --- a/pkg/sentry/syscalls/linux/sys_rlimit.go +++ b/pkg/sentry/syscalls/linux/sys_rlimit.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // rlimit describes an implementation of 'struct rlimit', which may vary from diff --git a/pkg/sentry/syscalls/linux/sys_seccomp.go b/pkg/sentry/syscalls/linux/sys_seccomp.go index 18510ead8..5b7a66f4d 100644 --- a/pkg/sentry/syscalls/linux/sys_seccomp.go +++ b/pkg/sentry/syscalls/linux/sys_seccomp.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // userSockFprog is equivalent to Linux's struct sock_fprog on amd64. diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go index cde3b54e7..5f54f2456 100644 --- a/pkg/sentry/syscalls/linux/sys_sem.go +++ b/pkg/sentry/syscalls/linux/sys_sem.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const opsMax = 500 // SEMOPM diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index fb6efd5d8..209be2990 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // "For a process to have permission to send a signal it must diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index cda517a81..2919228d0 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -26,9 +26,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // minListenBacklog is the minimum reasonable backlog for listening sockets. diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 69b17b799..c841abccb 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Stat implements linux syscall stat(2). diff --git a/pkg/sentry/syscalls/linux/sys_stat_amd64.go b/pkg/sentry/syscalls/linux/sys_stat_amd64.go index 58afb4a9a..75a567bd4 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_amd64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_amd64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // copyOutStat copies the attributes (sattr, uattr) to the struct stat at diff --git a/pkg/sentry/syscalls/linux/sys_stat_arm64.go b/pkg/sentry/syscalls/linux/sys_stat_arm64.go index 3e1251e0b..80c98d05c 100644 --- a/pkg/sentry/syscalls/linux/sys_stat_arm64.go +++ b/pkg/sentry/syscalls/linux/sys_stat_arm64.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) // copyOutStat copies the attributes (sattr, uattr) to the struct stat at diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index b47c3b5c4..0c9e2255d 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const ( diff --git a/pkg/sentry/syscalls/linux/sys_time.go b/pkg/sentry/syscalls/linux/sys_time.go index b887fa9d7..2d2aa0819 100644 --- a/pkg/sentry/syscalls/linux/sys_time.go +++ b/pkg/sentry/syscalls/linux/sys_time.go @@ -22,8 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // The most significant 29 bits hold either a pid or a file descriptor. diff --git a/pkg/sentry/syscalls/linux/sys_timer.go b/pkg/sentry/syscalls/linux/sys_timer.go index d4134207b..432351917 100644 --- a/pkg/sentry/syscalls/linux/sys_timer.go +++ b/pkg/sentry/syscalls/linux/sys_timer.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) const nsecPerSec = int64(time.Second) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index ad4b67806..aba892939 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -23,8 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 77deb8980..efb95555c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // GetXattr implements linux syscall getxattr(2). diff --git a/pkg/sentry/syscalls/linux/timespec.go b/pkg/sentry/syscalls/linux/timespec.go index 4ff8f9234..ddc3ee26e 100644 --- a/pkg/sentry/syscalls/linux/timespec.go +++ b/pkg/sentry/syscalls/linux/timespec.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. diff --git a/pkg/sentry/unimpl/BUILD b/pkg/sentry/unimpl/BUILD index 370fa6ec5..5d4aa3a63 100644 --- a/pkg/sentry/unimpl/BUILD +++ b/pkg/sentry/unimpl/BUILD @@ -14,7 +14,7 @@ go_library( srcs = ["events.go"], visibility = ["//:sandbox"], deps = [ + "//pkg/context", "//pkg/log", - "//pkg/sentry/context", ], ) diff --git a/pkg/sentry/unimpl/events.go b/pkg/sentry/unimpl/events.go index 79b5de9e4..73ed9372f 100644 --- a/pkg/sentry/unimpl/events.go +++ b/pkg/sentry/unimpl/events.go @@ -17,8 +17,8 @@ package unimpl import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" ) // contextID is the events package's type for context.Context.Value keys. diff --git a/pkg/sentry/uniqueid/BUILD b/pkg/sentry/uniqueid/BUILD index e9c18f170..7467e6398 100644 --- a/pkg/sentry/uniqueid/BUILD +++ b/pkg/sentry/uniqueid/BUILD @@ -7,7 +7,7 @@ go_library( srcs = ["context.go"], visibility = ["//pkg/sentry:internal"], deps = [ - "//pkg/sentry/context", + "//pkg/context", "//pkg/sentry/socket/unix/transport", ], ) diff --git a/pkg/sentry/uniqueid/context.go b/pkg/sentry/uniqueid/context.go index 4e466d66d..1fb884a90 100644 --- a/pkg/sentry/uniqueid/context.go +++ b/pkg/sentry/uniqueid/context.go @@ -17,7 +17,7 @@ package uniqueid import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) diff --git a/pkg/sentry/usermem/BUILD b/pkg/sentry/usermem/BUILD deleted file mode 100644 index c8322e29e..000000000 --- a/pkg/sentry/usermem/BUILD +++ /dev/null @@ -1,55 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -package(licenses = ["notice"]) - -go_template_instance( - name = "addr_range", - out = "addr_range.go", - package = "usermem", - prefix = "Addr", - template = "//pkg/segment:generic_range", - types = { - "T": "Addr", - }, -) - -go_library( - name = "usermem", - srcs = [ - "access_type.go", - "addr.go", - "addr_range.go", - "addr_range_seq_unsafe.go", - "bytes_io.go", - "bytes_io_unsafe.go", - "usermem.go", - "usermem_arm64.go", - "usermem_unsafe.go", - "usermem_x86.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/atomicbitops", - "//pkg/binary", - "//pkg/log", - "//pkg/sentry/context", - "//pkg/sentry/safemem", - "//pkg/syserror", - ], -) - -go_test( - name = "usermem_test", - size = "small", - srcs = [ - "addr_range_seq_test.go", - "usermem_test.go", - ], - library = ":usermem", - deps = [ - "//pkg/sentry/context", - "//pkg/sentry/safemem", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/usermem/README.md b/pkg/sentry/usermem/README.md deleted file mode 100644 index f6d2137eb..000000000 --- a/pkg/sentry/usermem/README.md +++ /dev/null @@ -1,31 +0,0 @@ -This package defines primitives for sentry access to application memory. - -Major types: - -- The `IO` interface represents a virtual address space and provides I/O - methods on that address space. `IO` is the lowest-level primitive. The - primary implementation of the `IO` interface is `mm.MemoryManager`. - -- `IOSequence` represents a collection of individually-contiguous address - ranges in a `IO` that is operated on sequentially, analogous to Linux's - `struct iov_iter`. - -Major usage patterns: - -- Access to a task's virtual memory, subject to the application's memory - protections and while running on that task's goroutine, from a context that - is at or above the level of the `kernel` package (e.g. most syscall - implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers - defined in `kernel/task_usermem.go`. - -- Access to a task's virtual memory, from a context that is at or above the - level of the `kernel` package, but where any of the above constraints does - not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory - protections); obtain the task's `mm.MemoryManager` by calling - `kernel.Task.MemoryManager`, and call its `IO` methods directly. - -- Access to a task's virtual memory, from a context that is below the level of - the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments - from higher layers, usually in the form of an `IOSequence`. The - `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions - in `kernel/task_usermem.go` are convenience functions for doing so. diff --git a/pkg/sentry/usermem/access_type.go b/pkg/sentry/usermem/access_type.go deleted file mode 100644 index 9c1742a59..000000000 --- a/pkg/sentry/usermem/access_type.go +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "syscall" -) - -// AccessType specifies memory access types. This is used for -// setting mapping permissions, as well as communicating faults. -// -// +stateify savable -type AccessType struct { - // Read is read access. - Read bool - - // Write is write access. - Write bool - - // Execute is executable access. - Execute bool -} - -// String returns a pretty representation of access. This looks like the -// familiar r-x, rw-, etc. and can be relied on as such. -func (a AccessType) String() string { - bits := [3]byte{'-', '-', '-'} - if a.Read { - bits[0] = 'r' - } - if a.Write { - bits[1] = 'w' - } - if a.Execute { - bits[2] = 'x' - } - return string(bits[:]) -} - -// Any returns true iff at least one of Read, Write or Execute is true. -func (a AccessType) Any() bool { - return a.Read || a.Write || a.Execute -} - -// Prot returns the system prot (syscall.PROT_READ, etc.) for this access. -func (a AccessType) Prot() int { - var prot int - if a.Read { - prot |= syscall.PROT_READ - } - if a.Write { - prot |= syscall.PROT_WRITE - } - if a.Execute { - prot |= syscall.PROT_EXEC - } - return prot -} - -// SupersetOf returns true iff the access types in a are a superset of the -// access types in other. -func (a AccessType) SupersetOf(other AccessType) bool { - if !a.Read && other.Read { - return false - } - if !a.Write && other.Write { - return false - } - if !a.Execute && other.Execute { - return false - } - return true -} - -// Intersect returns the access types set in both a and other. -func (a AccessType) Intersect(other AccessType) AccessType { - return AccessType{ - Read: a.Read && other.Read, - Write: a.Write && other.Write, - Execute: a.Execute && other.Execute, - } -} - -// Union returns the access types set in either a or other. -func (a AccessType) Union(other AccessType) AccessType { - return AccessType{ - Read: a.Read || other.Read, - Write: a.Write || other.Write, - Execute: a.Execute || other.Execute, - } -} - -// Effective returns the set of effective access types allowed by a, even if -// some types are not explicitly allowed. -func (a AccessType) Effective() AccessType { - // In Linux, Write and Execute access generally imply Read access. See - // mm/mmap.c:protection_map. - // - // The notable exception is get_user_pages, which only checks against - // the original vma flags. That said, most user memory accesses do not - // use GUP. - if a.Write || a.Execute { - a.Read = true - } - return a -} - -// Convenient access types. -var ( - NoAccess = AccessType{} - Read = AccessType{Read: true} - Write = AccessType{Write: true} - Execute = AccessType{Execute: true} - ReadWrite = AccessType{Read: true, Write: true} - AnyAccess = AccessType{Read: true, Write: true, Execute: true} -) diff --git a/pkg/sentry/usermem/addr.go b/pkg/sentry/usermem/addr.go deleted file mode 100644 index e79210804..000000000 --- a/pkg/sentry/usermem/addr.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "fmt" -) - -// Addr represents a generic virtual address. -// -// +stateify savable -type Addr uintptr - -// AddLength adds the given length to start and returns the result. ok is true -// iff adding the length did not overflow the range of Addr. -// -// Note: This function is usually used to get the end of an address range -// defined by its start address and length. Since the resulting end is -// exclusive, end == 0 is technically valid, and corresponds to a range that -// extends to the end of the address space, but ok will be false. This isn't -// expected to ever come up in practice. -func (v Addr) AddLength(length uint64) (end Addr, ok bool) { - end = v + Addr(length) - // The second half of the following check is needed in case uintptr is - // smaller than 64 bits. - ok = end >= v && length <= uint64(^Addr(0)) - return -} - -// RoundDown returns the address rounded down to the nearest page boundary. -func (v Addr) RoundDown() Addr { - return v & ^Addr(PageSize-1) -} - -// RoundUp returns the address rounded up to the nearest page boundary. ok is -// true iff rounding up did not wrap around. -func (v Addr) RoundUp() (addr Addr, ok bool) { - addr = Addr(v + PageSize - 1).RoundDown() - ok = addr >= v - return -} - -// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps -// around. -func (v Addr) MustRoundUp() Addr { - addr, ok := v.RoundUp() - if !ok { - panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v)) - } - return addr -} - -// HugeRoundDown returns the address rounded down to the nearest huge page -// boundary. -func (v Addr) HugeRoundDown() Addr { - return v & ^Addr(HugePageSize-1) -} - -// HugeRoundUp returns the address rounded up to the nearest huge page boundary. -// ok is true iff rounding up did not wrap around. -func (v Addr) HugeRoundUp() (addr Addr, ok bool) { - addr = Addr(v + HugePageSize - 1).HugeRoundDown() - ok = addr >= v - return -} - -// PageOffset returns the offset of v into the current page. -func (v Addr) PageOffset() uint64 { - return uint64(v & Addr(PageSize-1)) -} - -// IsPageAligned returns true if v.PageOffset() == 0. -func (v Addr) IsPageAligned() bool { - return v.PageOffset() == 0 -} - -// AddrRange is a range of Addrs. -// -// type AddrRange - -// ToRange returns [v, v+length). -func (v Addr) ToRange(length uint64) (AddrRange, bool) { - end, ok := v.AddLength(length) - return AddrRange{v, end}, ok -} - -// IsPageAligned returns true if ar.Start.IsPageAligned() and -// ar.End.IsPageAligned(). -func (ar AddrRange) IsPageAligned() bool { - return ar.Start.IsPageAligned() && ar.End.IsPageAligned() -} - -// String implements fmt.Stringer.String. -func (ar AddrRange) String() string { - return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) -} diff --git a/pkg/sentry/usermem/addr_range_seq_test.go b/pkg/sentry/usermem/addr_range_seq_test.go deleted file mode 100644 index 82f735026..000000000 --- a/pkg/sentry/usermem/addr_range_seq_test.go +++ /dev/null @@ -1,197 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "testing" -) - -var addrRangeSeqTests = []struct { - desc string - ranges []AddrRange -}{ - { - desc: "Empty sequence", - }, - { - desc: "Single empty AddrRange", - ranges: []AddrRange{ - {0x10, 0x10}, - }, - }, - { - desc: "Single non-empty AddrRange of length 1", - ranges: []AddrRange{ - {0x10, 0x11}, - }, - }, - { - desc: "Single non-empty AddrRange of length 2", - ranges: []AddrRange{ - {0x10, 0x12}, - }, - }, - { - desc: "Multiple non-empty AddrRanges", - ranges: []AddrRange{ - {0x10, 0x11}, - {0x20, 0x22}, - }, - }, - { - desc: "Multiple AddrRanges including empty AddrRanges", - ranges: []AddrRange{ - {0x10, 0x10}, - {0x20, 0x20}, - {0x30, 0x33}, - {0x40, 0x44}, - {0x50, 0x50}, - {0x60, 0x60}, - {0x70, 0x77}, - {0x80, 0x88}, - {0x90, 0x90}, - {0xa0, 0xa0}, - }, - }, -} - -func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) { - var wantLen int64 - for _, ar := range wantRanges { - wantLen += int64(ar.Length()) - } - - var i int - for !ars.IsEmpty() { - if gotLen := ars.NumBytes(); gotLen != wantLen { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) - } - if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN { - t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN) - } - got := ars.Head() - if i >= len(wantRanges) { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) - } else if want := wantRanges[i]; got != want { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) - } - ars = ars.Tail() - wantLen -= int64(got.Length()) - i++ - } - if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) - } - if gotN := ars.NumRanges(); gotN != 0 { - t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN) - } -} - -func TestAddrRangeSeqTailIteration(t *testing.T) { - for _, test := range addrRangeSeqTests { - t.Run(test.desc, func(t *testing.T) { - testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges) - }) - } -} - -func TestAddrRangeSeqDropFirstEmpty(t *testing.T) { - var ars AddrRangeSeq - if got, want := ars.DropFirst(1), ars; got != want { - t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want) - } -} - -func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) { - // Tests AddrRangeSeq iteration using Head/DropFirst, simulating - // I/O-per-AddrRange. - for _, test := range addrRangeSeqTests { - t.Run(test.desc, func(t *testing.T) { - // Figure out what AddrRanges we expect to see. - var wantLen int64 - var wantRanges []AddrRange - for _, ar := range test.ranges { - wantLen += int64(ar.Length()) - wantRanges = append(wantRanges, ar) - if ar.Length() == 0 { - // We "do" 0 bytes of I/O and then call DropFirst(0), - // advancing to the next AddrRange. - continue - } - // Otherwise we "do" 1 byte of I/O and then call DropFirst(1), - // advancing the AddrRange by 1 byte, or to the next AddrRange - // if this one is exhausted. - for ar.Start++; ar.Length() != 0; ar.Start++ { - wantRanges = append(wantRanges, ar) - } - } - t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen) - - ars := AddrRangeSeqFromSlice(test.ranges) - var i int - for !ars.IsEmpty() { - if gotLen := ars.NumBytes(); gotLen != wantLen { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) - } - got := ars.Head() - if i >= len(wantRanges) { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) - } else if want := wantRanges[i]; got != want { - t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) - } - if got.Length() == 0 { - ars = ars.DropFirst(0) - } else { - ars = ars.DropFirst(1) - wantLen-- - } - i++ - } - if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { - t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) - } - }) - } -} - -func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) { - var ars AddrRangeSeq - if got, want := ars.TakeFirst(1), ars; got != want { - t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want) - } -} - -func TestAddrRangeSeqTakeFirst(t *testing.T) { - ranges := []AddrRange{ - {0x10, 0x11}, - {0x20, 0x22}, - {0x30, 0x30}, - {0x40, 0x44}, - {0x50, 0x55}, - {0x60, 0x60}, - {0x70, 0x77}, - } - ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5) - want := []AddrRange{ - {0x10, 0x11}, // +1 byte (total 1 byte), not truncated - {0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated - {0x30, 0x30}, // +0 bytes (total 3 bytes), no change - {0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated - {0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated - {0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change) - {0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated - } - testAddrRangeSeqEqualityWithTailIteration(t, ars, want) -} diff --git a/pkg/sentry/usermem/addr_range_seq_unsafe.go b/pkg/sentry/usermem/addr_range_seq_unsafe.go deleted file mode 100644 index c09337c15..000000000 --- a/pkg/sentry/usermem/addr_range_seq_unsafe.go +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "bytes" - "fmt" - "reflect" - "unsafe" -) - -// An AddrRangeSeq represents a sequence of AddrRanges. -// -// AddrRangeSeqs are immutable and may be copied by value. The zero value of -// AddrRangeSeq represents an empty sequence. -// -// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary -// since zero-length AddrRanges are significant to MM bounds checks. -type AddrRangeSeq struct { - // If length is 0, then the AddrRangeSeq represents no AddrRanges. - // Invariants: data == 0; offset == 0; limit == 0. - // - // If length is 1, then the AddrRangeSeq represents the single - // AddrRange{offset, offset+limit}. Invariants: data == 0. - // - // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` - // AddrRanges in the array of AddrRanges starting at address `data`, - // starting at `offset` bytes into the first AddrRange and limited to the - // following `limit` bytes. (AddrRanges after `limit` are still iterated, - // but are truncated to a length of 0.) Invariants: data != 0; offset <= - // data[0].Length(); limit > 0; offset+limit <= the combined length of all - // AddrRanges in the array. - data unsafe.Pointer - length int - offset Addr - limit Addr -} - -// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. -func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { - return AddrRangeSeq{ - length: 1, - offset: ar.Start, - limit: ar.Length(), - } -} - -// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in -// slice. -// -// Whether the returned AddrRangeSeq shares memory with slice is unspecified; -// clients should avoid mutating slices passed to AddrRangeSeqFromSlice. -// -// Preconditions: The combined length of all AddrRanges in slice <= -// math.MaxInt64. -func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { - var limit int64 - for _, ar := range slice { - len64 := int64(ar.Length()) - if len64 < 0 { - panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) - } - sum := limit + len64 - if sum < limit { - panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) - } - limit = sum - } - return addrRangeSeqFromSliceLimited(slice, limit) -} - -// Preconditions: The combined length of all AddrRanges in slice <= limit. -// limit >= 0. If len(slice) != 0, then limit > 0. -func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { - switch len(slice) { - case 0: - return AddrRangeSeq{} - case 1: - return AddrRangeSeq{ - length: 1, - offset: slice[0].Start, - limit: Addr(limit), - } - default: - return AddrRangeSeq{ - data: unsafe.Pointer(&slice[0]), - length: len(slice), - limit: Addr(limit), - } - } -} - -// IsEmpty returns true if ars.NumRanges() == 0. -// -// Note that since AddrRangeSeq may contain AddrRanges with a length of zero, -// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not -// necessarily empty. -func (ars AddrRangeSeq) IsEmpty() bool { - return ars.length == 0 -} - -// NumRanges returns the number of AddrRanges in ars. -func (ars AddrRangeSeq) NumRanges() int { - return ars.length -} - -// NumBytes returns the number of bytes represented by ars. -func (ars AddrRangeSeq) NumBytes() int64 { - return int64(ars.limit) -} - -// Head returns the first AddrRange in ars. -// -// Preconditions: !ars.IsEmpty(). -func (ars AddrRangeSeq) Head() AddrRange { - if ars.length == 0 { - panic("empty AddrRangeSeq") - } - if ars.length == 1 { - return AddrRange{ars.offset, ars.offset + ars.limit} - } - ar := *(*AddrRange)(ars.data) - ar.Start += ars.offset - if ar.Length() > ars.limit { - ar.End = ar.Start + ars.limit - } - return ar -} - -// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the -// first. -// -// Preconditions: !ars.IsEmpty(). -func (ars AddrRangeSeq) Tail() AddrRangeSeq { - if ars.length == 0 { - panic("empty AddrRangeSeq") - } - if ars.length == 1 { - return AddrRangeSeq{} - } - return ars.externalTail() -} - -// Preconditions: ars.length >= 2. -func (ars AddrRangeSeq) externalTail() AddrRangeSeq { - headLen := (*AddrRange)(ars.data).Length() - ars.offset - var tailLimit int64 - if ars.limit > headLen { - tailLimit = int64(ars.limit - headLen) - } - var extSlice []AddrRange - extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) - extSliceHdr.Data = uintptr(ars.data) - extSliceHdr.Len = ars.length - extSliceHdr.Cap = ars.length - return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) -} - -// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n -// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty -// AddrRangeSeq. -// -// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit -// at least ars.Head(), even if n == 0. This guarantees that the basic pattern -// of: -// -// for !ars.IsEmpty() { -// n, err = doIOWith(ars.Head()) -// if err != nil { -// return err -// } -// ars = ars.DropFirst(n) -// } -// -// works even in the presence of zero-length AddrRanges. -// -// Preconditions: n >= 0. -func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return ars.DropFirst64(int64(n)) -} - -// DropFirst64 is equivalent to DropFirst but takes an int64. -func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - if Addr(n) > ars.limit { - return AddrRangeSeq{} - } - // Handle initial empty AddrRange. - switch ars.length { - case 0: - return AddrRangeSeq{} - case 1: - if ars.limit == 0 { - return AddrRangeSeq{} - } - default: - if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { - ars = ars.externalTail() - } - } - for n != 0 { - // Calling ars.Head() here is surprisingly expensive, so inline getting - // the head's length. - var headLen Addr - if ars.length == 1 { - headLen = ars.limit - } else { - headLen = (*AddrRange)(ars.data).Length() - ars.offset - } - if Addr(n) < headLen { - // Dropping ends partway through the head AddrRange. - ars.offset += Addr(n) - ars.limit -= Addr(n) - return ars - } - n -= int64(headLen) - ars = ars.Tail() - } - return ars -} - -// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n -// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the -// first n bytes are reduced to a length of zero, but will still be iterated. -// -// Preconditions: n >= 0. -func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - return ars.TakeFirst64(int64(n)) -} - -// TakeFirst64 is equivalent to TakeFirst but takes an int64. -func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { - if n < 0 { - panic(fmt.Sprintf("invalid n: %d", n)) - } - if ars.limit > Addr(n) { - ars.limit = Addr(n) - } - return ars -} - -// String implements fmt.Stringer.String. -func (ars AddrRangeSeq) String() string { - // This is deliberately chosen to be the same as fmt's automatic stringer - // for []AddrRange. - var buf bytes.Buffer - buf.WriteByte('[') - var sep string - for !ars.IsEmpty() { - buf.WriteString(sep) - sep = " " - buf.WriteString(ars.Head().String()) - ars = ars.Tail() - } - buf.WriteByte(']') - return buf.String() -} diff --git a/pkg/sentry/usermem/bytes_io.go b/pkg/sentry/usermem/bytes_io.go deleted file mode 100644 index 7898851b3..000000000 --- a/pkg/sentry/usermem/bytes_io.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -const maxInt = int(^uint(0) >> 1) - -// BytesIO implements IO using a byte slice. Addresses are interpreted as -// offsets into the slice. Reads and writes beyond the end of the slice return -// EFAULT. -type BytesIO struct { - Bytes []byte -} - -// CopyOut implements IO.CopyOut. -func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) { - rngN, rngErr := b.rangeCheck(addr, len(src)) - if rngN == 0 { - return 0, rngErr - } - return copy(b.Bytes[int(addr):], src[:rngN]), rngErr -} - -// CopyIn implements IO.CopyIn. -func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) { - rngN, rngErr := b.rangeCheck(addr, len(dst)) - if rngN == 0 { - return 0, rngErr - } - return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr -} - -// ZeroOut implements IO.ZeroOut. -func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) { - if toZero > int64(maxInt) { - return 0, syserror.EINVAL - } - rngN, rngErr := b.rangeCheck(addr, int(toZero)) - if rngN == 0 { - return 0, rngErr - } - zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] - for i := range zeroSlice { - zeroSlice[i] = 0 - } - return int64(rngN), rngErr -} - -// CopyOutFrom implements IO.CopyOutFrom. -func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { - dsts, rngErr := b.blocksFromAddrRanges(ars) - n, err := src.ReadToBlocks(dsts) - if err != nil { - return int64(n), err - } - return int64(n), rngErr -} - -// CopyInTo implements IO.CopyInTo. -func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { - srcs, rngErr := b.blocksFromAddrRanges(ars) - n, err := dst.WriteFromBlocks(srcs) - if err != nil { - return int64(n), err - } - return int64(n), rngErr -} - -func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) { - if length == 0 { - return 0, nil - } - if length < 0 { - return 0, syserror.EINVAL - } - max := Addr(len(b.Bytes)) - if addr >= max { - return 0, syserror.EFAULT - } - end, ok := addr.AddLength(uint64(length)) - if !ok || end > max { - return int(max - addr), syserror.EFAULT - } - return length, nil -} - -func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) { - switch ars.NumRanges() { - case 0: - return safemem.BlockSeq{}, nil - case 1: - block, err := b.blockFromAddrRange(ars.Head()) - return safemem.BlockSeqOf(block), err - default: - blocks := make([]safemem.Block, 0, ars.NumRanges()) - for !ars.IsEmpty() { - block, err := b.blockFromAddrRange(ars.Head()) - if block.Len() != 0 { - blocks = append(blocks, block) - } - if err != nil { - return safemem.BlockSeqFromSlice(blocks), err - } - ars = ars.Tail() - } - return safemem.BlockSeqFromSlice(blocks), nil - } -} - -func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) { - n, err := b.rangeCheck(ar.Start, int(ar.Length())) - if n == 0 { - return safemem.Block{}, err - } - return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err -} - -// BytesIOSequence returns an IOSequence representing the given byte slice. -func BytesIOSequence(buf []byte) IOSequence { - return IOSequence{ - IO: &BytesIO{buf}, - Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}), - } -} diff --git a/pkg/sentry/usermem/bytes_io_unsafe.go b/pkg/sentry/usermem/bytes_io_unsafe.go deleted file mode 100644 index fca5952f4..000000000 --- a/pkg/sentry/usermem/bytes_io_unsafe.go +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "sync/atomic" - "unsafe" - - "gvisor.dev/gvisor/pkg/atomicbitops" - "gvisor.dev/gvisor/pkg/sentry/context" -) - -// SwapUint32 implements IO.SwapUint32. -func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) { - if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { - return 0, rngErr - } - return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil -} - -// CompareAndSwapUint32 implements IO.CompareAndSwapUint32. -func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) { - if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { - return 0, rngErr - } - return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil -} - -// LoadUint32 implements IO.LoadUint32. -func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { - if _, err := b.rangeCheck(addr, 4); err != nil { - return 0, err - } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil -} diff --git a/pkg/sentry/usermem/usermem.go b/pkg/sentry/usermem/usermem.go deleted file mode 100644 index 7b1f312b1..000000000 --- a/pkg/sentry/usermem/usermem.go +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package usermem governs access to user memory. -package usermem - -import ( - "bytes" - "errors" - "io" - "strconv" - - "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// IO provides access to the contents of a virtual memory space. -// -// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any -// meaningful data. -type IO interface { - // CopyOut copies len(src) bytes from src to the memory mapped at addr. It - // returns the number of bytes copied. If the number of bytes copied is < - // len(src), it returns a non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. - // - // Postconditions: CopyOut does not retain src. - CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) - - // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. - // It returns the number of bytes copied. If the number of bytes copied is - // < len(dst), it returns a non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. - // - // Postconditions: CopyIn does not retain dst. - CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) - - // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number - // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a - // non-nil error explaining why. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. toZero >= 0. - ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) - - // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at - // ars. It returns the number of bytes copied, which may be less than the - // number of bytes read from src if copying fails. CopyOutFrom may return a - // partial copy without an error iff src.ReadToBlocks returns a partial - // read without an error. - // - // CopyOutFrom calls src.ReadToBlocks at most once. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. src.ReadToBlocks must not block - // on mm.MemoryManager.activeMu or any preceding locks in the lock order. - CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) - - // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to - // dst. It returns the number of bytes copied. CopyInTo may return a - // partial copy without an error iff dst.WriteFromBlocks returns a partial - // write without an error. - // - // CopyInTo calls dst.WriteFromBlocks at most once. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. dst.WriteFromBlocks must not - // block on mm.MemoryManager.activeMu or any preceding locks in the lock - // order. - CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) - - // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst - // at most once, which is unnecessary in most cases, forces implementations - // to gather safemem.Blocks into a single slice to pass to src/dst. Add - // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid - // this allocation. - - // SwapUint32 atomically sets the uint32 value at addr to new and - // returns the previous value. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) - - // CompareAndSwapUint32 atomically compares the uint32 value at addr to - // old; if they are equal, the value in memory is replaced by new. In - // either case, the previous value stored in memory is returned. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) - - // LoadUint32 atomically loads the uint32 value at addr and returns it. - // - // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or - // any following locks in the lock order. addr must be aligned to a 4-byte - // boundary. - LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) -} - -// IOOpts contains options applicable to all IO methods. -type IOOpts struct { - // If IgnorePermissions is true, application-defined memory protections set - // by mmap(2) or mprotect(2) will be ignored. (Memory protections required - // by the target of the mapping are never ignored.) - IgnorePermissions bool - - // If AddressSpaceActive is true, the IO implementation may assume that it - // has an active AddressSpace and can therefore use AddressSpace copying - // without performing activation. See mm/io.go for details. - AddressSpaceActive bool -} - -// IOReadWriter is an io.ReadWriter that reads from / writes to addresses -// starting at addr in IO. The preconditions that apply to IO.CopyIn and -// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write -// respectively. -type IOReadWriter struct { - Ctx context.Context - IO IO - Addr Addr - Opts IOOpts -} - -// Read implements io.Reader.Read. -// -// Note that an address space does not have an "end of file", so Read can only -// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or -// unreadable memory, or beyond the end of the address space, should return -// EFAULT. -func (rw *IOReadWriter) Read(dst []byte) (int, error) { - n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) - end, ok := rw.Addr.AddLength(uint64(n)) - if ok { - rw.Addr = end - } else { - // Disallow wraparound. - rw.Addr = ^Addr(0) - if err != nil { - err = syserror.EFAULT - } - } - return n, err -} - -// Writer implements io.Writer.Write. -func (rw *IOReadWriter) Write(src []byte) (int, error) { - n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) - end, ok := rw.Addr.AddLength(uint64(n)) - if ok { - rw.Addr = end - } else { - // Disallow wraparound. - rw.Addr = ^Addr(0) - if err != nil { - err = syserror.EFAULT - } - } - return n, err -} - -// CopyObjectOut copies a fixed-size value or slice of fixed-size values from -// src to the memory mapped at addr in uio. It returns the number of bytes -// copied. -// -// CopyObjectOut must use reflection to encode src; performance-sensitive -// clients should do encoding manually and use uio.CopyOut directly. -// -// Preconditions: As for IO.CopyOut. -func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { - w := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - // Allocate a byte slice the size of the object being marshaled. This - // adds an extra reflection call, but avoids needing to grow the slice - // during encoding, which can result in many heap-allocated slices. - b := make([]byte, 0, binary.Size(src)) - return w.Write(binary.Marshal(b, ByteOrder, src)) -} - -// CopyObjectIn copies a fixed-size value or slice of fixed-size values from -// the memory mapped at addr in uio to dst. It returns the number of bytes -// copied. -// -// CopyObjectIn must use reflection to decode dst; performance-sensitive -// clients should use uio.CopyIn directly and do decoding manually. -// -// Preconditions: As for IO.CopyIn. -func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { - r := &IOReadWriter{ - Ctx: ctx, - IO: uio, - Addr: addr, - Opts: opts, - } - buf := make([]byte, binary.Size(dst)) - if _, err := io.ReadFull(r, buf); err != nil { - return 0, err - } - binary.Unmarshal(buf, ByteOrder, dst) - return int(r.Addr - addr), nil -} - -// CopyStringIn tuning parameters, defined outside that function for tests. -const ( - copyStringIncrement = 64 - copyStringMaxInitBufLen = 256 -) - -// CopyStringIn copies a NUL-terminated string of unknown length from the -// memory mapped at addr in uio and returns it as a string (not including the -// trailing NUL). If the length of the string, including the terminating NUL, -// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and -// ENAMETOOLONG. -// -// Preconditions: As for IO.CopyFromUser. maxlen >= 0. -func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { - initLen := maxlen - if initLen > copyStringMaxInitBufLen { - initLen = copyStringMaxInitBufLen - } - buf := make([]byte, initLen) - var done int - for done < maxlen { - // Read up to copyStringIncrement bytes at a time. - readlen := copyStringIncrement - if readlen > maxlen-done { - readlen = maxlen - done - } - end, ok := addr.AddLength(uint64(readlen)) - if !ok { - return stringFromImmutableBytes(buf[:done]), syserror.EFAULT - } - // Shorten the read to avoid crossing page boundaries, since faulting - // in a page unnecessarily is expensive. This also ensures that partial - // copies up to the end of application-mappable memory succeed. - if addr.RoundDown() != end.RoundDown() { - end = end.RoundDown() - readlen = int(end - addr) - } - // Ensure that our buffer is large enough to accommodate the read. - if done+readlen > len(buf) { - newBufLen := len(buf) * 2 - if newBufLen > maxlen { - newBufLen = maxlen - } - buf = append(buf, make([]byte, newBufLen-len(buf))...) - } - n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) - // Look for the terminating zero byte, which may have occurred before - // hitting err. - if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { - return stringFromImmutableBytes(buf[:done+i]), nil - } - - done += n - if err != nil { - return stringFromImmutableBytes(buf[:done]), err - } - addr = end - } - return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG -} - -// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The -// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is -// less. CopyOutVec returns the number of bytes copied; if this is less than -// the maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.CopyOut. -func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { - var done int - for !ars.IsEmpty() && done < len(src) { - ar := ars.Head() - cplen := len(src) - done - if Addr(cplen) >= ar.Length() { - cplen = int(ar.Length()) - } - n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst(n) - } - return done, nil -} - -// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The -// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is -// less. CopyInVec returns the number of bytes copied; if this is less than the -// maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.CopyIn. -func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { - var done int - for !ars.IsEmpty() && done < len(dst) { - ar := ars.Head() - cplen := len(dst) - done - if Addr(cplen) >= ar.Length() { - cplen = int(ar.Length()) - } - n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst(n) - } - return done, nil -} - -// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum -// number of bytes written is ars.NumBytes() or toZero, whichever is less. -// ZeroOutVec returns the number of bytes written; if this is less than the -// maximum, it returns a non-nil error explaining why. -// -// Preconditions: As for IO.ZeroOut. -func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { - var done int64 - for !ars.IsEmpty() && done < toZero { - ar := ars.Head() - cplen := toZero - done - if Addr(cplen) >= ar.Length() { - cplen = int64(ar.Length()) - } - n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts) - done += n - if err != nil { - return done, err - } - ars = ars.DropFirst64(n) - } - return done, nil -} - -func isASCIIWhitespace(b byte) bool { - // Compare Linux include/linux/ctype.h, lib/ctype.c. - // 9 => horizontal tab '\t' - // 10 => line feed '\n' - // 11 => vertical tab '\v' - // 12 => form feed '\c' - // 13 => carriage return '\r' - return b == ' ' || (b >= 9 && b <= 13) -} - -// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal -// strings from the memory mapped at ars in uio and converts them to int32 -// values in dsts. It returns the number of bytes read. -// -// CopyInt32StringsInVec shares the following properties with Linux's -// kernel/sysctl.c:proc_dointvec(write=1): -// -// - If any read value overflows the range of int32, or any invalid characters -// are encountered during the read, CopyInt32StringsInVec returns EINVAL. -// -// - If, upon reaching the end of ars, fewer than len(dsts) values have been -// read, CopyInt32StringsInVec returns no error if at least 1 value was read -// and EINVAL otherwise. -// -// - Trailing whitespace after the last successfully read value is counted in -// the number of bytes read. -// -// Unlike proc_dointvec(): -// -// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to -// PageSize-1; callers that require this must do so explicitly. -// -// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. -// -// Preconditions: As for CopyInVec. -func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { - if len(dsts) == 0 { - return 0, nil - } - - buf := make([]byte, ars.NumBytes()) - n, cperr := CopyInVec(ctx, uio, ars, buf, opts) - buf = buf[:n] - - var i, j int - for ; j < len(dsts); j++ { - // Skip leading whitespace. - for i < len(buf) && isASCIIWhitespace(buf[i]) { - i++ - } - if i == len(buf) { - break - } - - // Find the end of the value to be parsed (next whitespace or end of string). - nextI := i + 1 - for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { - nextI++ - } - - // Parse a single value. - val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) - if err != nil { - return int64(i), syserror.EINVAL - } - dsts[j] = int32(val) - - i = nextI - } - - // Skip trailing whitespace. - for i < len(buf) && isASCIIWhitespace(buf[i]) { - i++ - } - - if cperr != nil { - return int64(i), cperr - } - if j == 0 { - return int64(i), syserror.EINVAL - } - return int64(i), nil -} - -// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at -// most one int32. -func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { - dsts := [1]int32{*dst} - n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) - *dst = dsts[0] - return n, err -} - -// IOSequence holds arguments to IO methods. -type IOSequence struct { - IO IO - Addrs AddrRangeSeq - Opts IOOpts -} - -// NumBytes returns s.Addrs.NumBytes(). -// -// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since -// s.Addrs may contain a non-zero number of zero-length AddrRanges. -// Many clients of -// IOSequence currently do something like: -// -// if ioseq.NumBytes() == 0 { -// return 0, nil -// } -// if f.availableBytes == 0 { -// return 0, syserror.ErrWouldBlock -// } -// return ioseq.CopyOutFrom(..., reader) -// -// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong -// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means -// that we will return success for zero-length I/O in cases where Linux would -// return EFAULT due to a failed access_ok() check, so in the long term we -// should move checks for ErrWouldBlock etc. into the body of -// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. -func (s IOSequence) NumBytes() int64 { - return s.Addrs.NumBytes() -} - -// DropFirst returns a copy of s with s.Addrs.DropFirst(n). -// -// Preconditions: As for AddrRangeSeq.DropFirst. -func (s IOSequence) DropFirst(n int) IOSequence { - return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} -} - -// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). -// -// Preconditions: As for AddrRangeSeq.DropFirst64. -func (s IOSequence) DropFirst64(n int64) IOSequence { - return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} -} - -// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). -// -// Preconditions: As for AddrRangeSeq.TakeFirst. -func (s IOSequence) TakeFirst(n int) IOSequence { - return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} -} - -// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). -// -// Preconditions: As for AddrRangeSeq.TakeFirst64. -func (s IOSequence) TakeFirst64(n int64) IOSequence { - return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} -} - -// CopyOut invokes CopyOutVec over s.Addrs. -// -// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated -// to s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for CopyOutVec. -func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { - return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) -} - -// CopyIn invokes CopyInVec over s.Addrs. -// -// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to -// s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for CopyInVec. -func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { - return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) -} - -// ZeroOut invokes ZeroOutVec over s.Addrs. -// -// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated -// to s.NumBytes(), and a nil error will be returned. -// -// Preconditions: As for ZeroOutVec. -func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { - return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) -} - -// CopyOutFrom invokes s.CopyOutFrom over s.Addrs. -// -// Preconditions: As for IO.CopyOutFrom. -func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { - return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) -} - -// CopyInTo invokes s.CopyInTo over s.Addrs. -// -// Preconditions: As for IO.CopyInTo. -func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { - return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) -} - -// Reader returns an io.Reader that reads from s. Reads beyond the end of s -// return io.EOF. The preconditions that apply to s.CopyIn also apply to the -// returned io.Reader.Read. -func (s IOSequence) Reader(ctx context.Context) io.Reader { - return &ioSequenceReadWriter{ctx, s} -} - -// Writer returns an io.Writer that writes to s. Writes beyond the end of s -// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also -// apply to the returned io.Writer.Write. -func (s IOSequence) Writer(ctx context.Context) io.Writer { - return &ioSequenceReadWriter{ctx, s} -} - -// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when -// attempting to write beyond the end of the IOSequence. -var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") - -type ioSequenceReadWriter struct { - ctx context.Context - s IOSequence -} - -// Read implements io.Reader.Read. -func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { - n, err := rw.s.CopyIn(rw.ctx, dst) - rw.s = rw.s.DropFirst(n) - if err == nil && rw.s.NumBytes() == 0 { - err = io.EOF - } - return n, err -} - -// Write implements io.Writer.Write. -func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { - n, err := rw.s.CopyOut(rw.ctx, src) - rw.s = rw.s.DropFirst(n) - if err == nil && n < len(src) { - err = ErrEndOfIOSequence - } - return n, err -} diff --git a/pkg/sentry/usermem/usermem_arm64.go b/pkg/sentry/usermem/usermem_arm64.go deleted file mode 100644 index fdfc30a66..000000000 --- a/pkg/sentry/usermem/usermem_arm64.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build arm64 - -package usermem - -import ( - "encoding/binary" - "syscall" -) - -const ( - // PageSize is the system page size. - // arm64 support 4K/16K/64K page size, - // which can be get by syscall.Getpagesize(). - // Currently, only 4K page size is supported. - PageSize = 1 << PageShift - - // HugePageSize is the system huge page size. - HugePageSize = 1 << HugePageShift - - // PageShift is the binary log of the system page size. - PageShift = 12 - - // HugePageShift is the binary log of the system huge page size. - // Should be calculated by "PageShift + (PageShift - 3)" - // when multiple page size support is ready. - HugePageShift = 21 -) - -var ( - // ByteOrder is the native byte order (little endian). - ByteOrder = binary.LittleEndian -) - -func init() { - // Make sure the page size is 4K on arm64 platform. - if size := syscall.Getpagesize(); size != PageSize { - panic("Only 4K page size is supported on arm64!") - } -} diff --git a/pkg/sentry/usermem/usermem_test.go b/pkg/sentry/usermem/usermem_test.go deleted file mode 100644 index 299f64754..000000000 --- a/pkg/sentry/usermem/usermem_test.go +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "bytes" - "encoding/binary" - "fmt" - "reflect" - "strings" - "testing" - - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/safemem" - "gvisor.dev/gvisor/pkg/syserror" -) - -// newContext returns a context.Context that we can use in these tests (we -// can't use contexttest because it depends on usermem). -func newContext() context.Context { - return context.Background() -} - -func newBytesIOString(s string) *BytesIO { - return &BytesIO{[]byte(s)} -} - -func TestBytesIOCopyOutSuccess(t *testing.T) { - b := newBytesIOString("ABCDE") - n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFailure(t *testing.T) { - b := newBytesIOString("ABC") - n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInSuccess(t *testing.T) { - b := newBytesIOString("AfooE") - var dst [3]byte - n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInFailure(t *testing.T) { - b := newBytesIOString("Afo") - var dst [3]byte - n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) - if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } -} - -func TestBytesIOZeroOutSuccess(t *testing.T) { - b := newBytesIOString("ABCD") - n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{}) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOZeroOutFailure(t *testing.T) { - b := newBytesIOString("ABC") - n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) - if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFromSuccess(t *testing.T) { - b := newBytesIOString("ABCDEFGH") - n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 4, End: 7}, - {Start: 1, End: 4}, - }), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{}) - if wantN := int64(6); n != wantN || err != nil { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyOutFromFailure(t *testing.T) { - b := newBytesIOString("ABCDE") - n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 1, End: 4}, - {Start: 4, End: 7}, - }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { - t.Errorf("Bytes: got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInToSuccess(t *testing.T) { - b := newBytesIOString("AfoobarH") - var dst bytes.Buffer - n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 4, End: 7}, - {Start: 1, End: 4}, - }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN := int64(6); n != wantN || err != nil { - t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) { - t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) - } -} - -func TestBytesIOCopyInToFailure(t *testing.T) { - b := newBytesIOString("Afoob") - var dst bytes.Buffer - n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ - {Start: 1, End: 4}, - {Start: 4, End: 7}, - }), safemem.FromIOWriter{&dst}, IOOpts{}) - if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { - t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) - } - if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { - t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) - } -} - -type testStruct struct { - Int8 int8 - Uint8 uint8 - Int16 int16 - Uint16 uint16 - Int32 int32 - Uint32 uint32 - Int64 int64 - Uint64 uint64 -} - -func TestCopyObject(t *testing.T) { - wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} - wantN := binary.Size(wantObj) - b := &BytesIO{make([]byte, wantN)} - ctx := newContext() - if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { - t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - var gotObj testStruct - if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { - t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if gotObj != wantObj { - t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj) - } -} - -func TestCopyStringInShort(t *testing.T) { - // Tests for string length <= copyStringIncrement. - want := strings.Repeat("A", copyStringIncrement-2) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInLong(t *testing.T) { - // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen - // (requiring multiple calls to IO.CopyIn()). - want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInVeryLong(t *testing.T) { - // Tests for string length > copyStringMaxInitBufLen (requiring buffer - // reallocation). - want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4) - mem := want + "\x00" - if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) - } -} - -func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { - want := strings.Repeat("A", copyStringIncrement-1) - got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) - if wantErr := syserror.EFAULT; got != want || err != wantErr { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) - } -} - -func TestCopyStringInTruncatedByMaxlen(t *testing.T) { - got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) - if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { - t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) - } -} - -func TestCopyInt32StringsInVec(t *testing.T) { - for _, test := range []struct { - str string - n int - initial []int32 - final []int32 - }{ - { - str: "100 200", - n: len("100 200"), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - { - // Fewer values ok - str: "100", - n: len("100"), - initial: []int32{1, 2}, - final: []int32{100, 2}, - }, - { - // Extra values ok - str: "100 200 300", - n: len("100 200 "), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - { - // Leading and trailing whitespace ok - str: " 100\t200\n", - n: len(" 100\t200\n"), - initial: []int32{1, 2}, - final: []int32{100, 200}, - }, - } { - t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { - src := BytesIOSequence([]byte(test.str)) - dsts := append([]int32(nil), test.initial...) - if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n) - } - if !reflect.DeepEqual(dsts, test.final) { - t.Errorf("dsts: got %v, wanted %v", dsts, test.final) - } - }) - } -} - -func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { - for _, s := range []string{"", "\n", "a123"} { - t.Run(fmt.Sprintf("%q", s), func(t *testing.T) { - src := BytesIOSequence([]byte(s)) - initial := []int32{1, 2} - dsts := append([]int32(nil), initial...) - if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { - t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) - } - if !reflect.DeepEqual(dsts, initial) { - t.Errorf("dsts: got %v, wanted %v", dsts, initial) - } - }) - } -} - -func TestIOSequenceCopyOut(t *testing.T) { - buf := []byte("ABCD") - s := BytesIOSequence(buf) - - // CopyOut limited by len(src). - n, err := s.CopyOut(newContext(), []byte("fo")) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foCD"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // CopyOut limited by s.NumBytes(). - n, err = s.CopyOut(newContext(), []byte("obar")) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foob"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceCopyIn(t *testing.T) { - s := BytesIOSequence([]byte("foob")) - dst := []byte("ABCDEF") - - // CopyIn limited by len(dst). - n, err := s.CopyIn(newContext(), dst[:2]) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foCDEF"); !bytes.Equal(dst, want) { - t.Errorf("dst: got %q, wanted %q", dst, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // CopyIn limited by s.Remaining(). - n, err = s.CopyIn(newContext(), dst[2:]) - if wantN := 2; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("foobEF"); !bytes.Equal(dst, want) { - t.Errorf("dst: got %q, wanted %q", dst, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceZeroOut(t *testing.T) { - buf := []byte("ABCD") - s := BytesIOSequence(buf) - - // ZeroOut limited by toZero. - n, err := s.ZeroOut(newContext(), 2) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(2); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // ZeroOut limited by s.NumBytes(). - n, err = s.ZeroOut(newContext(), 4) - if wantN := int64(2); n != wantN || err != nil { - t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) { - t.Errorf("buf: got %q, wanted %q", buf, want) - } - s = s.DropFirst(2) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} - -func TestIOSequenceTakeFirst(t *testing.T) { - s := BytesIOSequence([]byte("foobar")) - if got, want := s.NumBytes(), int64(6); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - s = s.TakeFirst(3) - if got, want := s.NumBytes(), int64(3); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - // TakeFirst(n) where n > s.NumBytes() is a no-op. - s = s.TakeFirst(9) - if got, want := s.NumBytes(), int64(3); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } - - var dst [3]byte - n, err := s.CopyIn(newContext(), dst[:]) - if wantN := 3; n != wantN || err != nil { - t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) - } - if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { - t.Errorf("dst: got %q, wanted %q", got, want) - } - s = s.DropFirst(3) - if got, want := s.NumBytes(), int64(0); got != want { - t.Errorf("NumBytes: got %v, wanted %v", got, want) - } -} diff --git a/pkg/sentry/usermem/usermem_unsafe.go b/pkg/sentry/usermem/usermem_unsafe.go deleted file mode 100644 index 876783e78..000000000 --- a/pkg/sentry/usermem/usermem_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "unsafe" -) - -// stringFromImmutableBytes is equivalent to string(bs), except that it never -// copies even if escape analysis can't prove that bs does not escape. This is -// only valid if bs is never mutated after stringFromImmutableBytes returns. -func stringFromImmutableBytes(bs []byte) string { - // Compare strings.Builder.String(). - return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/pkg/sentry/usermem/usermem_x86.go b/pkg/sentry/usermem/usermem_x86.go deleted file mode 100644 index 8059b72d2..000000000 --- a/pkg/sentry/usermem/usermem_x86.go +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// +build amd64 i386 - -package usermem - -import "encoding/binary" - -const ( - // PageSize is the system page size. - PageSize = 1 << PageShift - - // HugePageSize is the system huge page size. - HugePageSize = 1 << HugePageShift - - // PageShift is the binary log of the system page size. - PageShift = 12 - - // HugePageShift is the binary log of the system huge page size. - HugePageShift = 21 -) - -var ( - // ByteOrder is the native byte order (little endian). - ByteOrder = binary.LittleEndian -) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 51acdc4e9..6b1009328 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -26,14 +26,14 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fspath", "//pkg/sentry/arch", - "//pkg/sentry/context", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", "//pkg/waiter", ], ) @@ -48,11 +48,11 @@ go_test( library = ":vfs", deps = [ "//pkg/abi/linux", - "//pkg/sentry/context", - "//pkg/sentry/context/contexttest", + "//pkg/context", + "//pkg/sentry/contexttest", "//pkg/sentry/kernel/auth", - "//pkg/sentry/usermem", "//pkg/sync", "//pkg/syserror", + "//pkg/usermem", ], ) diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 705194ebc..d97362b9a 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -15,7 +15,7 @@ package vfs import ( - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" ) // contextID is this package's type for context.Context.Value keys. diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 9f9d6e783..3af2aa58d 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -17,7 +17,7 @@ package vfs import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 51c95c2d9..225024463 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -18,12 +18,12 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index c00b3c84b..fb9b87fdc 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -19,12 +19,12 @@ import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 9ed58512f..1720d325d 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -22,11 +22,11 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // fileDescription is the common fd struct which a filesystem implementation diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index ea78f555b..a06a6caf3 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -18,8 +18,8 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index 023301780..c58b70728 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -18,7 +18,7 @@ import ( "bytes" "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 00177b371..d39528051 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -19,7 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cf80df90e..b318c681a 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -15,8 +15,8 @@ package vfs import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go index ee5c8b9e2..392c7611e 100644 --- a/pkg/sentry/vfs/testutil.go +++ b/pkg/sentry/vfs/testutil.go @@ -18,8 +18,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/syserror" ) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 1f6f56293..b2bf48853 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -31,8 +31,8 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD new file mode 100644 index 000000000..ff8b9e91a --- /dev/null +++ b/pkg/usermem/BUILD @@ -0,0 +1,55 @@ +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +package(licenses = ["notice"]) + +go_template_instance( + name = "addr_range", + out = "addr_range.go", + package = "usermem", + prefix = "Addr", + template = "//pkg/segment:generic_range", + types = { + "T": "Addr", + }, +) + +go_library( + name = "usermem", + srcs = [ + "access_type.go", + "addr.go", + "addr_range.go", + "addr_range_seq_unsafe.go", + "bytes_io.go", + "bytes_io_unsafe.go", + "usermem.go", + "usermem_arm64.go", + "usermem_unsafe.go", + "usermem_x86.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/atomicbitops", + "//pkg/binary", + "//pkg/context", + "//pkg/log", + "//pkg/safemem", + "//pkg/syserror", + ], +) + +go_test( + name = "usermem_test", + size = "small", + srcs = [ + "addr_range_seq_test.go", + "usermem_test.go", + ], + library = ":usermem", + deps = [ + "//pkg/context", + "//pkg/safemem", + "//pkg/syserror", + ], +) diff --git a/pkg/usermem/README.md b/pkg/usermem/README.md new file mode 100644 index 000000000..f6d2137eb --- /dev/null +++ b/pkg/usermem/README.md @@ -0,0 +1,31 @@ +This package defines primitives for sentry access to application memory. + +Major types: + +- The `IO` interface represents a virtual address space and provides I/O + methods on that address space. `IO` is the lowest-level primitive. The + primary implementation of the `IO` interface is `mm.MemoryManager`. + +- `IOSequence` represents a collection of individually-contiguous address + ranges in a `IO` that is operated on sequentially, analogous to Linux's + `struct iov_iter`. + +Major usage patterns: + +- Access to a task's virtual memory, subject to the application's memory + protections and while running on that task's goroutine, from a context that + is at or above the level of the `kernel` package (e.g. most syscall + implementations in `syscalls/linux`); use the `kernel.Task.Copy*` wrappers + defined in `kernel/task_usermem.go`. + +- Access to a task's virtual memory, from a context that is at or above the + level of the `kernel` package, but where any of the above constraints does + not hold (e.g. `PTRACE_POKEDATA`, which ignores application memory + protections); obtain the task's `mm.MemoryManager` by calling + `kernel.Task.MemoryManager`, and call its `IO` methods directly. + +- Access to a task's virtual memory, from a context that is below the level of + the `kernel` package (e.g. filesystem I/O); clients must pass I/O arguments + from higher layers, usually in the form of an `IOSequence`. The + `kernel.Task.SingleIOSequence` and `kernel.Task.IovecsIOSequence` functions + in `kernel/task_usermem.go` are convenience functions for doing so. diff --git a/pkg/usermem/access_type.go b/pkg/usermem/access_type.go new file mode 100644 index 000000000..9c1742a59 --- /dev/null +++ b/pkg/usermem/access_type.go @@ -0,0 +1,128 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "syscall" +) + +// AccessType specifies memory access types. This is used for +// setting mapping permissions, as well as communicating faults. +// +// +stateify savable +type AccessType struct { + // Read is read access. + Read bool + + // Write is write access. + Write bool + + // Execute is executable access. + Execute bool +} + +// String returns a pretty representation of access. This looks like the +// familiar r-x, rw-, etc. and can be relied on as such. +func (a AccessType) String() string { + bits := [3]byte{'-', '-', '-'} + if a.Read { + bits[0] = 'r' + } + if a.Write { + bits[1] = 'w' + } + if a.Execute { + bits[2] = 'x' + } + return string(bits[:]) +} + +// Any returns true iff at least one of Read, Write or Execute is true. +func (a AccessType) Any() bool { + return a.Read || a.Write || a.Execute +} + +// Prot returns the system prot (syscall.PROT_READ, etc.) for this access. +func (a AccessType) Prot() int { + var prot int + if a.Read { + prot |= syscall.PROT_READ + } + if a.Write { + prot |= syscall.PROT_WRITE + } + if a.Execute { + prot |= syscall.PROT_EXEC + } + return prot +} + +// SupersetOf returns true iff the access types in a are a superset of the +// access types in other. +func (a AccessType) SupersetOf(other AccessType) bool { + if !a.Read && other.Read { + return false + } + if !a.Write && other.Write { + return false + } + if !a.Execute && other.Execute { + return false + } + return true +} + +// Intersect returns the access types set in both a and other. +func (a AccessType) Intersect(other AccessType) AccessType { + return AccessType{ + Read: a.Read && other.Read, + Write: a.Write && other.Write, + Execute: a.Execute && other.Execute, + } +} + +// Union returns the access types set in either a or other. +func (a AccessType) Union(other AccessType) AccessType { + return AccessType{ + Read: a.Read || other.Read, + Write: a.Write || other.Write, + Execute: a.Execute || other.Execute, + } +} + +// Effective returns the set of effective access types allowed by a, even if +// some types are not explicitly allowed. +func (a AccessType) Effective() AccessType { + // In Linux, Write and Execute access generally imply Read access. See + // mm/mmap.c:protection_map. + // + // The notable exception is get_user_pages, which only checks against + // the original vma flags. That said, most user memory accesses do not + // use GUP. + if a.Write || a.Execute { + a.Read = true + } + return a +} + +// Convenient access types. +var ( + NoAccess = AccessType{} + Read = AccessType{Read: true} + Write = AccessType{Write: true} + Execute = AccessType{Execute: true} + ReadWrite = AccessType{Read: true, Write: true} + AnyAccess = AccessType{Read: true, Write: true, Execute: true} +) diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go new file mode 100644 index 000000000..e79210804 --- /dev/null +++ b/pkg/usermem/addr.go @@ -0,0 +1,108 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "fmt" +) + +// Addr represents a generic virtual address. +// +// +stateify savable +type Addr uintptr + +// AddLength adds the given length to start and returns the result. ok is true +// iff adding the length did not overflow the range of Addr. +// +// Note: This function is usually used to get the end of an address range +// defined by its start address and length. Since the resulting end is +// exclusive, end == 0 is technically valid, and corresponds to a range that +// extends to the end of the address space, but ok will be false. This isn't +// expected to ever come up in practice. +func (v Addr) AddLength(length uint64) (end Addr, ok bool) { + end = v + Addr(length) + // The second half of the following check is needed in case uintptr is + // smaller than 64 bits. + ok = end >= v && length <= uint64(^Addr(0)) + return +} + +// RoundDown returns the address rounded down to the nearest page boundary. +func (v Addr) RoundDown() Addr { + return v & ^Addr(PageSize-1) +} + +// RoundUp returns the address rounded up to the nearest page boundary. ok is +// true iff rounding up did not wrap around. +func (v Addr) RoundUp() (addr Addr, ok bool) { + addr = Addr(v + PageSize - 1).RoundDown() + ok = addr >= v + return +} + +// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps +// around. +func (v Addr) MustRoundUp() Addr { + addr, ok := v.RoundUp() + if !ok { + panic(fmt.Sprintf("usermem.Addr(%d).RoundUp() wraps", v)) + } + return addr +} + +// HugeRoundDown returns the address rounded down to the nearest huge page +// boundary. +func (v Addr) HugeRoundDown() Addr { + return v & ^Addr(HugePageSize-1) +} + +// HugeRoundUp returns the address rounded up to the nearest huge page boundary. +// ok is true iff rounding up did not wrap around. +func (v Addr) HugeRoundUp() (addr Addr, ok bool) { + addr = Addr(v + HugePageSize - 1).HugeRoundDown() + ok = addr >= v + return +} + +// PageOffset returns the offset of v into the current page. +func (v Addr) PageOffset() uint64 { + return uint64(v & Addr(PageSize-1)) +} + +// IsPageAligned returns true if v.PageOffset() == 0. +func (v Addr) IsPageAligned() bool { + return v.PageOffset() == 0 +} + +// AddrRange is a range of Addrs. +// +// type AddrRange + +// ToRange returns [v, v+length). +func (v Addr) ToRange(length uint64) (AddrRange, bool) { + end, ok := v.AddLength(length) + return AddrRange{v, end}, ok +} + +// IsPageAligned returns true if ar.Start.IsPageAligned() and +// ar.End.IsPageAligned(). +func (ar AddrRange) IsPageAligned() bool { + return ar.Start.IsPageAligned() && ar.End.IsPageAligned() +} + +// String implements fmt.Stringer.String. +func (ar AddrRange) String() string { + return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) +} diff --git a/pkg/usermem/addr_range_seq_test.go b/pkg/usermem/addr_range_seq_test.go new file mode 100644 index 000000000..82f735026 --- /dev/null +++ b/pkg/usermem/addr_range_seq_test.go @@ -0,0 +1,197 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "testing" +) + +var addrRangeSeqTests = []struct { + desc string + ranges []AddrRange +}{ + { + desc: "Empty sequence", + }, + { + desc: "Single empty AddrRange", + ranges: []AddrRange{ + {0x10, 0x10}, + }, + }, + { + desc: "Single non-empty AddrRange of length 1", + ranges: []AddrRange{ + {0x10, 0x11}, + }, + }, + { + desc: "Single non-empty AddrRange of length 2", + ranges: []AddrRange{ + {0x10, 0x12}, + }, + }, + { + desc: "Multiple non-empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + }, + }, + { + desc: "Multiple AddrRanges including empty AddrRanges", + ranges: []AddrRange{ + {0x10, 0x10}, + {0x20, 0x20}, + {0x30, 0x33}, + {0x40, 0x44}, + {0x50, 0x50}, + {0x60, 0x60}, + {0x70, 0x77}, + {0x80, 0x88}, + {0x90, 0x90}, + {0xa0, 0xa0}, + }, + }, +} + +func testAddrRangeSeqEqualityWithTailIteration(t *testing.T, ars AddrRangeSeq, wantRanges []AddrRange) { + var wantLen int64 + for _, ar := range wantRanges { + wantLen += int64(ar.Length()) + } + + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + if gotN, wantN := ars.NumRanges(), len(wantRanges)-i; gotN != wantN { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted %d", i, ars, gotN, wantN) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) + } + ars = ars.Tail() + wantLen -= int64(got.Length()) + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + if gotN := ars.NumRanges(); gotN != 0 { + t.Errorf("Iteration %d: %v.NumRanges(): got %d, wanted 0", i, ars, gotN) + } +} + +func TestAddrRangeSeqTailIteration(t *testing.T) { + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + testAddrRangeSeqEqualityWithTailIteration(t, AddrRangeSeqFromSlice(test.ranges), test.ranges) + }) + } +} + +func TestAddrRangeSeqDropFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.DropFirst(1), ars; got != want { + t.Errorf("%v.DropFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqDropSingleByteIteration(t *testing.T) { + // Tests AddrRangeSeq iteration using Head/DropFirst, simulating + // I/O-per-AddrRange. + for _, test := range addrRangeSeqTests { + t.Run(test.desc, func(t *testing.T) { + // Figure out what AddrRanges we expect to see. + var wantLen int64 + var wantRanges []AddrRange + for _, ar := range test.ranges { + wantLen += int64(ar.Length()) + wantRanges = append(wantRanges, ar) + if ar.Length() == 0 { + // We "do" 0 bytes of I/O and then call DropFirst(0), + // advancing to the next AddrRange. + continue + } + // Otherwise we "do" 1 byte of I/O and then call DropFirst(1), + // advancing the AddrRange by 1 byte, or to the next AddrRange + // if this one is exhausted. + for ar.Start++; ar.Length() != 0; ar.Start++ { + wantRanges = append(wantRanges, ar) + } + } + t.Logf("Expected AddrRanges: %s (%d bytes)", wantRanges, wantLen) + + ars := AddrRangeSeqFromSlice(test.ranges) + var i int + for !ars.IsEmpty() { + if gotLen := ars.NumBytes(); gotLen != wantLen { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d", i, ars, gotLen, wantLen) + } + got := ars.Head() + if i >= len(wantRanges) { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted ", i, ars, got) + } else if want := wantRanges[i]; got != want { + t.Errorf("Iteration %d: %v.Head(): got %s, wanted %s", i, ars, got, want) + } + if got.Length() == 0 { + ars = ars.DropFirst(0) + } else { + ars = ars.DropFirst(1) + wantLen-- + } + i++ + } + if gotLen := ars.NumBytes(); gotLen != 0 || wantLen != 0 { + t.Errorf("Iteration %d: %v.NumBytes(): got %d, wanted %d (which should be 0)", i, ars, gotLen, wantLen) + } + }) + } +} + +func TestAddrRangeSeqTakeFirstEmpty(t *testing.T) { + var ars AddrRangeSeq + if got, want := ars.TakeFirst(1), ars; got != want { + t.Errorf("%v.TakeFirst(1): got %v, wanted %v", ars, got, want) + } +} + +func TestAddrRangeSeqTakeFirst(t *testing.T) { + ranges := []AddrRange{ + {0x10, 0x11}, + {0x20, 0x22}, + {0x30, 0x30}, + {0x40, 0x44}, + {0x50, 0x55}, + {0x60, 0x60}, + {0x70, 0x77}, + } + ars := AddrRangeSeqFromSlice(ranges).TakeFirst(5) + want := []AddrRange{ + {0x10, 0x11}, // +1 byte (total 1 byte), not truncated + {0x20, 0x22}, // +2 bytes (total 3 bytes), not truncated + {0x30, 0x30}, // +0 bytes (total 3 bytes), no change + {0x40, 0x42}, // +2 bytes (total 5 bytes), partially truncated + {0x50, 0x50}, // +0 bytes (total 5 bytes), fully truncated + {0x60, 0x60}, // +0 bytes (total 5 bytes), "fully truncated" (no change) + {0x70, 0x70}, // +0 bytes (total 5 bytes), fully truncated + } + testAddrRangeSeqEqualityWithTailIteration(t, ars, want) +} diff --git a/pkg/usermem/addr_range_seq_unsafe.go b/pkg/usermem/addr_range_seq_unsafe.go new file mode 100644 index 000000000..c09337c15 --- /dev/null +++ b/pkg/usermem/addr_range_seq_unsafe.go @@ -0,0 +1,277 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "bytes" + "fmt" + "reflect" + "unsafe" +) + +// An AddrRangeSeq represents a sequence of AddrRanges. +// +// AddrRangeSeqs are immutable and may be copied by value. The zero value of +// AddrRangeSeq represents an empty sequence. +// +// An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary +// since zero-length AddrRanges are significant to MM bounds checks. +type AddrRangeSeq struct { + // If length is 0, then the AddrRangeSeq represents no AddrRanges. + // Invariants: data == 0; offset == 0; limit == 0. + // + // If length is 1, then the AddrRangeSeq represents the single + // AddrRange{offset, offset+limit}. Invariants: data == 0. + // + // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` + // AddrRanges in the array of AddrRanges starting at address `data`, + // starting at `offset` bytes into the first AddrRange and limited to the + // following `limit` bytes. (AddrRanges after `limit` are still iterated, + // but are truncated to a length of 0.) Invariants: data != 0; offset <= + // data[0].Length(); limit > 0; offset+limit <= the combined length of all + // AddrRanges in the array. + data unsafe.Pointer + length int + offset Addr + limit Addr +} + +// AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. +func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { + return AddrRangeSeq{ + length: 1, + offset: ar.Start, + limit: ar.Length(), + } +} + +// AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in +// slice. +// +// Whether the returned AddrRangeSeq shares memory with slice is unspecified; +// clients should avoid mutating slices passed to AddrRangeSeqFromSlice. +// +// Preconditions: The combined length of all AddrRanges in slice <= +// math.MaxInt64. +func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { + var limit int64 + for _, ar := range slice { + len64 := int64(ar.Length()) + if len64 < 0 { + panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) + } + sum := limit + len64 + if sum < limit { + panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) + } + limit = sum + } + return addrRangeSeqFromSliceLimited(slice, limit) +} + +// Preconditions: The combined length of all AddrRanges in slice <= limit. +// limit >= 0. If len(slice) != 0, then limit > 0. +func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { + switch len(slice) { + case 0: + return AddrRangeSeq{} + case 1: + return AddrRangeSeq{ + length: 1, + offset: slice[0].Start, + limit: Addr(limit), + } + default: + return AddrRangeSeq{ + data: unsafe.Pointer(&slice[0]), + length: len(slice), + limit: Addr(limit), + } + } +} + +// IsEmpty returns true if ars.NumRanges() == 0. +// +// Note that since AddrRangeSeq may contain AddrRanges with a length of zero, +// an AddrRange representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not +// necessarily empty. +func (ars AddrRangeSeq) IsEmpty() bool { + return ars.length == 0 +} + +// NumRanges returns the number of AddrRanges in ars. +func (ars AddrRangeSeq) NumRanges() int { + return ars.length +} + +// NumBytes returns the number of bytes represented by ars. +func (ars AddrRangeSeq) NumBytes() int64 { + return int64(ars.limit) +} + +// Head returns the first AddrRange in ars. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Head() AddrRange { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRange{ars.offset, ars.offset + ars.limit} + } + ar := *(*AddrRange)(ars.data) + ar.Start += ars.offset + if ar.Length() > ars.limit { + ar.End = ar.Start + ars.limit + } + return ar +} + +// Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the +// first. +// +// Preconditions: !ars.IsEmpty(). +func (ars AddrRangeSeq) Tail() AddrRangeSeq { + if ars.length == 0 { + panic("empty AddrRangeSeq") + } + if ars.length == 1 { + return AddrRangeSeq{} + } + return ars.externalTail() +} + +// Preconditions: ars.length >= 2. +func (ars AddrRangeSeq) externalTail() AddrRangeSeq { + headLen := (*AddrRange)(ars.data).Length() - ars.offset + var tailLimit int64 + if ars.limit > headLen { + tailLimit = int64(ars.limit - headLen) + } + var extSlice []AddrRange + extSliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&extSlice)) + extSliceHdr.Data = uintptr(ars.data) + extSliceHdr.Len = ars.length + extSliceHdr.Cap = ars.length + return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) +} + +// DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n +// bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty +// AddrRangeSeq. +// +// If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit +// at least ars.Head(), even if n == 0. This guarantees that the basic pattern +// of: +// +// for !ars.IsEmpty() { +// n, err = doIOWith(ars.Head()) +// if err != nil { +// return err +// } +// ars = ars.DropFirst(n) +// } +// +// works even in the presence of zero-length AddrRanges. +// +// Preconditions: n >= 0. +func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return ars.DropFirst64(int64(n)) +} + +// DropFirst64 is equivalent to DropFirst but takes an int64. +func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + if Addr(n) > ars.limit { + return AddrRangeSeq{} + } + // Handle initial empty AddrRange. + switch ars.length { + case 0: + return AddrRangeSeq{} + case 1: + if ars.limit == 0 { + return AddrRangeSeq{} + } + default: + if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { + ars = ars.externalTail() + } + } + for n != 0 { + // Calling ars.Head() here is surprisingly expensive, so inline getting + // the head's length. + var headLen Addr + if ars.length == 1 { + headLen = ars.limit + } else { + headLen = (*AddrRange)(ars.data).Length() - ars.offset + } + if Addr(n) < headLen { + // Dropping ends partway through the head AddrRange. + ars.offset += Addr(n) + ars.limit -= Addr(n) + return ars + } + n -= int64(headLen) + ars = ars.Tail() + } + return ars +} + +// TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n +// bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the +// first n bytes are reduced to a length of zero, but will still be iterated. +// +// Preconditions: n >= 0. +func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + return ars.TakeFirst64(int64(n)) +} + +// TakeFirst64 is equivalent to TakeFirst but takes an int64. +func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { + if n < 0 { + panic(fmt.Sprintf("invalid n: %d", n)) + } + if ars.limit > Addr(n) { + ars.limit = Addr(n) + } + return ars +} + +// String implements fmt.Stringer.String. +func (ars AddrRangeSeq) String() string { + // This is deliberately chosen to be the same as fmt's automatic stringer + // for []AddrRange. + var buf bytes.Buffer + buf.WriteByte('[') + var sep string + for !ars.IsEmpty() { + buf.WriteString(sep) + sep = " " + buf.WriteString(ars.Head().String()) + ars = ars.Tail() + } + buf.WriteByte(']') + return buf.String() +} diff --git a/pkg/usermem/bytes_io.go b/pkg/usermem/bytes_io.go new file mode 100644 index 000000000..e177d30eb --- /dev/null +++ b/pkg/usermem/bytes_io.go @@ -0,0 +1,141 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +const maxInt = int(^uint(0) >> 1) + +// BytesIO implements IO using a byte slice. Addresses are interpreted as +// offsets into the slice. Reads and writes beyond the end of the slice return +// EFAULT. +type BytesIO struct { + Bytes []byte +} + +// CopyOut implements IO.CopyOut. +func (b *BytesIO) CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(src)) + if rngN == 0 { + return 0, rngErr + } + return copy(b.Bytes[int(addr):], src[:rngN]), rngErr +} + +// CopyIn implements IO.CopyIn. +func (b *BytesIO) CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) { + rngN, rngErr := b.rangeCheck(addr, len(dst)) + if rngN == 0 { + return 0, rngErr + } + return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr +} + +// ZeroOut implements IO.ZeroOut. +func (b *BytesIO) ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) { + if toZero > int64(maxInt) { + return 0, syserror.EINVAL + } + rngN, rngErr := b.rangeCheck(addr, int(toZero)) + if rngN == 0 { + return 0, rngErr + } + zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] + for i := range zeroSlice { + zeroSlice[i] = 0 + } + return int64(rngN), rngErr +} + +// CopyOutFrom implements IO.CopyOutFrom. +func (b *BytesIO) CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { + dsts, rngErr := b.blocksFromAddrRanges(ars) + n, err := src.ReadToBlocks(dsts) + if err != nil { + return int64(n), err + } + return int64(n), rngErr +} + +// CopyInTo implements IO.CopyInTo. +func (b *BytesIO) CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { + srcs, rngErr := b.blocksFromAddrRanges(ars) + n, err := dst.WriteFromBlocks(srcs) + if err != nil { + return int64(n), err + } + return int64(n), rngErr +} + +func (b *BytesIO) rangeCheck(addr Addr, length int) (int, error) { + if length == 0 { + return 0, nil + } + if length < 0 { + return 0, syserror.EINVAL + } + max := Addr(len(b.Bytes)) + if addr >= max { + return 0, syserror.EFAULT + } + end, ok := addr.AddLength(uint64(length)) + if !ok || end > max { + return int(max - addr), syserror.EFAULT + } + return length, nil +} + +func (b *BytesIO) blocksFromAddrRanges(ars AddrRangeSeq) (safemem.BlockSeq, error) { + switch ars.NumRanges() { + case 0: + return safemem.BlockSeq{}, nil + case 1: + block, err := b.blockFromAddrRange(ars.Head()) + return safemem.BlockSeqOf(block), err + default: + blocks := make([]safemem.Block, 0, ars.NumRanges()) + for !ars.IsEmpty() { + block, err := b.blockFromAddrRange(ars.Head()) + if block.Len() != 0 { + blocks = append(blocks, block) + } + if err != nil { + return safemem.BlockSeqFromSlice(blocks), err + } + ars = ars.Tail() + } + return safemem.BlockSeqFromSlice(blocks), nil + } +} + +func (b *BytesIO) blockFromAddrRange(ar AddrRange) (safemem.Block, error) { + n, err := b.rangeCheck(ar.Start, int(ar.Length())) + if n == 0 { + return safemem.Block{}, err + } + return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err +} + +// BytesIOSequence returns an IOSequence representing the given byte slice. +func BytesIOSequence(buf []byte) IOSequence { + return IOSequence{ + IO: &BytesIO{buf}, + Addrs: AddrRangeSeqOf(AddrRange{0, Addr(len(buf))}), + } +} diff --git a/pkg/usermem/bytes_io_unsafe.go b/pkg/usermem/bytes_io_unsafe.go new file mode 100644 index 000000000..20de5037d --- /dev/null +++ b/pkg/usermem/bytes_io_unsafe.go @@ -0,0 +1,47 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/context" +) + +// SwapUint32 implements IO.SwapUint32. +func (b *BytesIO) SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomic.SwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), new), nil +} + +// CompareAndSwapUint32 implements IO.CompareAndSwapUint32. +func (b *BytesIO) CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) { + if _, rngErr := b.rangeCheck(addr, 4); rngErr != nil { + return 0, rngErr + } + return atomicbitops.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)])), old, new), nil +} + +// LoadUint32 implements IO.LoadUint32. +func (b *BytesIO) LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) { + if _, err := b.rangeCheck(addr, 4); err != nil { + return 0, err + } + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&b.Bytes[int(addr)]))), nil +} diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go new file mode 100644 index 000000000..71fd4e155 --- /dev/null +++ b/pkg/usermem/usermem.go @@ -0,0 +1,597 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package usermem governs access to user memory. +package usermem + +import ( + "bytes" + "errors" + "io" + "strconv" + + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// IO provides access to the contents of a virtual memory space. +// +// FIXME(b/38173783): Implementations of IO cannot expect ctx to contain any +// meaningful data. +type IO interface { + // CopyOut copies len(src) bytes from src to the memory mapped at addr. It + // returns the number of bytes copied. If the number of bytes copied is < + // len(src), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + // + // Postconditions: CopyOut does not retain src. + CopyOut(ctx context.Context, addr Addr, src []byte, opts IOOpts) (int, error) + + // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. + // It returns the number of bytes copied. If the number of bytes copied is + // < len(dst), it returns a non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. + // + // Postconditions: CopyIn does not retain dst. + CopyIn(ctx context.Context, addr Addr, dst []byte, opts IOOpts) (int, error) + + // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number + // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a + // non-nil error explaining why. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. toZero >= 0. + ZeroOut(ctx context.Context, addr Addr, toZero int64, opts IOOpts) (int64, error) + + // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at + // ars. It returns the number of bytes copied, which may be less than the + // number of bytes read from src if copying fails. CopyOutFrom may return a + // partial copy without an error iff src.ReadToBlocks returns a partial + // read without an error. + // + // CopyOutFrom calls src.ReadToBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. src.ReadToBlocks must not block + // on mm.MemoryManager.activeMu or any preceding locks in the lock order. + CopyOutFrom(ctx context.Context, ars AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) + + // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to + // dst. It returns the number of bytes copied. CopyInTo may return a + // partial copy without an error iff dst.WriteFromBlocks returns a partial + // write without an error. + // + // CopyInTo calls dst.WriteFromBlocks at most once. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. dst.WriteFromBlocks must not + // block on mm.MemoryManager.activeMu or any preceding locks in the lock + // order. + CopyInTo(ctx context.Context, ars AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) + + // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst + // at most once, which is unnecessary in most cases, forces implementations + // to gather safemem.Blocks into a single slice to pass to src/dst. Add + // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid + // this allocation. + + // SwapUint32 atomically sets the uint32 value at addr to new and + // returns the previous value. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + SwapUint32(ctx context.Context, addr Addr, new uint32, opts IOOpts) (uint32, error) + + // CompareAndSwapUint32 atomically compares the uint32 value at addr to + // old; if they are equal, the value in memory is replaced by new. In + // either case, the previous value stored in memory is returned. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + CompareAndSwapUint32(ctx context.Context, addr Addr, old, new uint32, opts IOOpts) (uint32, error) + + // LoadUint32 atomically loads the uint32 value at addr and returns it. + // + // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or + // any following locks in the lock order. addr must be aligned to a 4-byte + // boundary. + LoadUint32(ctx context.Context, addr Addr, opts IOOpts) (uint32, error) +} + +// IOOpts contains options applicable to all IO methods. +type IOOpts struct { + // If IgnorePermissions is true, application-defined memory protections set + // by mmap(2) or mprotect(2) will be ignored. (Memory protections required + // by the target of the mapping are never ignored.) + IgnorePermissions bool + + // If AddressSpaceActive is true, the IO implementation may assume that it + // has an active AddressSpace and can therefore use AddressSpace copying + // without performing activation. See mm/io.go for details. + AddressSpaceActive bool +} + +// IOReadWriter is an io.ReadWriter that reads from / writes to addresses +// starting at addr in IO. The preconditions that apply to IO.CopyIn and +// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write +// respectively. +type IOReadWriter struct { + Ctx context.Context + IO IO + Addr Addr + Opts IOOpts +} + +// Read implements io.Reader.Read. +// +// Note that an address space does not have an "end of file", so Read can only +// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or +// unreadable memory, or beyond the end of the address space, should return +// EFAULT. +func (rw *IOReadWriter) Read(dst []byte) (int, error) { + n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. + rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// Writer implements io.Writer.Write. +func (rw *IOReadWriter) Write(src []byte) (int, error) { + n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts) + end, ok := rw.Addr.AddLength(uint64(n)) + if ok { + rw.Addr = end + } else { + // Disallow wraparound. + rw.Addr = ^Addr(0) + if err != nil { + err = syserror.EFAULT + } + } + return n, err +} + +// CopyObjectOut copies a fixed-size value or slice of fixed-size values from +// src to the memory mapped at addr in uio. It returns the number of bytes +// copied. +// +// CopyObjectOut must use reflection to encode src; performance-sensitive +// clients should do encoding manually and use uio.CopyOut directly. +// +// Preconditions: As for IO.CopyOut. +func CopyObjectOut(ctx context.Context, uio IO, addr Addr, src interface{}, opts IOOpts) (int, error) { + w := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + // Allocate a byte slice the size of the object being marshaled. This + // adds an extra reflection call, but avoids needing to grow the slice + // during encoding, which can result in many heap-allocated slices. + b := make([]byte, 0, binary.Size(src)) + return w.Write(binary.Marshal(b, ByteOrder, src)) +} + +// CopyObjectIn copies a fixed-size value or slice of fixed-size values from +// the memory mapped at addr in uio to dst. It returns the number of bytes +// copied. +// +// CopyObjectIn must use reflection to decode dst; performance-sensitive +// clients should use uio.CopyIn directly and do decoding manually. +// +// Preconditions: As for IO.CopyIn. +func CopyObjectIn(ctx context.Context, uio IO, addr Addr, dst interface{}, opts IOOpts) (int, error) { + r := &IOReadWriter{ + Ctx: ctx, + IO: uio, + Addr: addr, + Opts: opts, + } + buf := make([]byte, binary.Size(dst)) + if _, err := io.ReadFull(r, buf); err != nil { + return 0, err + } + binary.Unmarshal(buf, ByteOrder, dst) + return int(r.Addr - addr), nil +} + +// CopyStringIn tuning parameters, defined outside that function for tests. +const ( + copyStringIncrement = 64 + copyStringMaxInitBufLen = 256 +) + +// CopyStringIn copies a NUL-terminated string of unknown length from the +// memory mapped at addr in uio and returns it as a string (not including the +// trailing NUL). If the length of the string, including the terminating NUL, +// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and +// ENAMETOOLONG. +// +// Preconditions: As for IO.CopyFromUser. maxlen >= 0. +func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpts) (string, error) { + initLen := maxlen + if initLen > copyStringMaxInitBufLen { + initLen = copyStringMaxInitBufLen + } + buf := make([]byte, initLen) + var done int + for done < maxlen { + // Read up to copyStringIncrement bytes at a time. + readlen := copyStringIncrement + if readlen > maxlen-done { + readlen = maxlen - done + } + end, ok := addr.AddLength(uint64(readlen)) + if !ok { + return stringFromImmutableBytes(buf[:done]), syserror.EFAULT + } + // Shorten the read to avoid crossing page boundaries, since faulting + // in a page unnecessarily is expensive. This also ensures that partial + // copies up to the end of application-mappable memory succeed. + if addr.RoundDown() != end.RoundDown() { + end = end.RoundDown() + readlen = int(end - addr) + } + // Ensure that our buffer is large enough to accommodate the read. + if done+readlen > len(buf) { + newBufLen := len(buf) * 2 + if newBufLen > maxlen { + newBufLen = maxlen + } + buf = append(buf, make([]byte, newBufLen-len(buf))...) + } + n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts) + // Look for the terminating zero byte, which may have occurred before + // hitting err. + if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { + return stringFromImmutableBytes(buf[:done+i]), nil + } + + done += n + if err != nil { + return stringFromImmutableBytes(buf[:done]), err + } + addr = end + } + return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG +} + +// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The +// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is +// less. CopyOutVec returns the number of bytes copied; if this is less than +// the maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.CopyOut. +func CopyOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, src []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(src) { + ar := ars.Head() + cplen := len(src) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The +// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is +// less. CopyInVec returns the number of bytes copied; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.CopyIn. +func CopyInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst []byte, opts IOOpts) (int, error) { + var done int + for !ars.IsEmpty() && done < len(dst) { + ar := ars.Head() + cplen := len(dst) - done + if Addr(cplen) >= ar.Length() { + cplen = int(ar.Length()) + } + n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst(n) + } + return done, nil +} + +// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum +// number of bytes written is ars.NumBytes() or toZero, whichever is less. +// ZeroOutVec returns the number of bytes written; if this is less than the +// maximum, it returns a non-nil error explaining why. +// +// Preconditions: As for IO.ZeroOut. +func ZeroOutVec(ctx context.Context, uio IO, ars AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) { + var done int64 + for !ars.IsEmpty() && done < toZero { + ar := ars.Head() + cplen := toZero - done + if Addr(cplen) >= ar.Length() { + cplen = int64(ar.Length()) + } + n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts) + done += n + if err != nil { + return done, err + } + ars = ars.DropFirst64(n) + } + return done, nil +} + +func isASCIIWhitespace(b byte) bool { + // Compare Linux include/linux/ctype.h, lib/ctype.c. + // 9 => horizontal tab '\t' + // 10 => line feed '\n' + // 11 => vertical tab '\v' + // 12 => form feed '\c' + // 13 => carriage return '\r' + return b == ' ' || (b >= 9 && b <= 13) +} + +// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal +// strings from the memory mapped at ars in uio and converts them to int32 +// values in dsts. It returns the number of bytes read. +// +// CopyInt32StringsInVec shares the following properties with Linux's +// kernel/sysctl.c:proc_dointvec(write=1): +// +// - If any read value overflows the range of int32, or any invalid characters +// are encountered during the read, CopyInt32StringsInVec returns EINVAL. +// +// - If, upon reaching the end of ars, fewer than len(dsts) values have been +// read, CopyInt32StringsInVec returns no error if at least 1 value was read +// and EINVAL otherwise. +// +// - Trailing whitespace after the last successfully read value is counted in +// the number of bytes read. +// +// Unlike proc_dointvec(): +// +// - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to +// PageSize-1; callers that require this must do so explicitly. +// +// - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. +// +// Preconditions: As for CopyInVec. +func CopyInt32StringsInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { + if len(dsts) == 0 { + return 0, nil + } + + buf := make([]byte, ars.NumBytes()) + n, cperr := CopyInVec(ctx, uio, ars, buf, opts) + buf = buf[:n] + + var i, j int + for ; j < len(dsts); j++ { + // Skip leading whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + if i == len(buf) { + break + } + + // Find the end of the value to be parsed (next whitespace or end of string). + nextI := i + 1 + for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { + nextI++ + } + + // Parse a single value. + val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) + if err != nil { + return int64(i), syserror.EINVAL + } + dsts[j] = int32(val) + + i = nextI + } + + // Skip trailing whitespace. + for i < len(buf) && isASCIIWhitespace(buf[i]) { + i++ + } + + if cperr != nil { + return int64(i), cperr + } + if j == 0 { + return int64(i), syserror.EINVAL + } + return int64(i), nil +} + +// CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at +// most one int32. +func CopyInt32StringInVec(ctx context.Context, uio IO, ars AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { + dsts := [1]int32{*dst} + n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) + *dst = dsts[0] + return n, err +} + +// IOSequence holds arguments to IO methods. +type IOSequence struct { + IO IO + Addrs AddrRangeSeq + Opts IOOpts +} + +// NumBytes returns s.Addrs.NumBytes(). +// +// Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since +// s.Addrs may contain a non-zero number of zero-length AddrRanges. +// Many clients of +// IOSequence currently do something like: +// +// if ioseq.NumBytes() == 0 { +// return 0, nil +// } +// if f.availableBytes == 0 { +// return 0, syserror.ErrWouldBlock +// } +// return ioseq.CopyOutFrom(..., reader) +// +// In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong +// behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means +// that we will return success for zero-length I/O in cases where Linux would +// return EFAULT due to a failed access_ok() check, so in the long term we +// should move checks for ErrWouldBlock etc. into the body of +// reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. +func (s IOSequence) NumBytes() int64 { + return s.Addrs.NumBytes() +} + +// DropFirst returns a copy of s with s.Addrs.DropFirst(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst. +func (s IOSequence) DropFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} +} + +// DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). +// +// Preconditions: As for AddrRangeSeq.DropFirst64. +func (s IOSequence) DropFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} +} + +// TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst. +func (s IOSequence) TakeFirst(n int) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} +} + +// TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). +// +// Preconditions: As for AddrRangeSeq.TakeFirst64. +func (s IOSequence) TakeFirst64(n int64) IOSequence { + return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} +} + +// CopyOut invokes CopyOutVec over s.Addrs. +// +// As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyOutVec. +func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { + return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) +} + +// CopyIn invokes CopyInVec over s.Addrs. +// +// As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to +// s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for CopyInVec. +func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { + return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) +} + +// ZeroOut invokes ZeroOutVec over s.Addrs. +// +// As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated +// to s.NumBytes(), and a nil error will be returned. +// +// Preconditions: As for ZeroOutVec. +func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { + return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) +} + +// CopyOutFrom invokes s.CopyOutFrom over s.Addrs. +// +// Preconditions: As for IO.CopyOutFrom. +func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { + return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) +} + +// CopyInTo invokes s.CopyInTo over s.Addrs. +// +// Preconditions: As for IO.CopyInTo. +func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { + return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) +} + +// Reader returns an io.Reader that reads from s. Reads beyond the end of s +// return io.EOF. The preconditions that apply to s.CopyIn also apply to the +// returned io.Reader.Read. +func (s IOSequence) Reader(ctx context.Context) io.Reader { + return &ioSequenceReadWriter{ctx, s} +} + +// Writer returns an io.Writer that writes to s. Writes beyond the end of s +// return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also +// apply to the returned io.Writer.Write. +func (s IOSequence) Writer(ctx context.Context) io.Writer { + return &ioSequenceReadWriter{ctx, s} +} + +// ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when +// attempting to write beyond the end of the IOSequence. +var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") + +type ioSequenceReadWriter struct { + ctx context.Context + s IOSequence +} + +// Read implements io.Reader.Read. +func (rw *ioSequenceReadWriter) Read(dst []byte) (int, error) { + n, err := rw.s.CopyIn(rw.ctx, dst) + rw.s = rw.s.DropFirst(n) + if err == nil && rw.s.NumBytes() == 0 { + err = io.EOF + } + return n, err +} + +// Write implements io.Writer.Write. +func (rw *ioSequenceReadWriter) Write(src []byte) (int, error) { + n, err := rw.s.CopyOut(rw.ctx, src) + rw.s = rw.s.DropFirst(n) + if err == nil && n < len(src) { + err = ErrEndOfIOSequence + } + return n, err +} diff --git a/pkg/usermem/usermem_arm64.go b/pkg/usermem/usermem_arm64.go new file mode 100644 index 000000000..fdfc30a66 --- /dev/null +++ b/pkg/usermem/usermem_arm64.go @@ -0,0 +1,53 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build arm64 + +package usermem + +import ( + "encoding/binary" + "syscall" +) + +const ( + // PageSize is the system page size. + // arm64 support 4K/16K/64K page size, + // which can be get by syscall.Getpagesize(). + // Currently, only 4K page size is supported. + PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. + PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + // Should be calculated by "PageShift + (PageShift - 3)" + // when multiple page size support is ready. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). + ByteOrder = binary.LittleEndian +) + +func init() { + // Make sure the page size is 4K on arm64 platform. + if size := syscall.Getpagesize(); size != PageSize { + panic("Only 4K page size is supported on arm64!") + } +} diff --git a/pkg/usermem/usermem_test.go b/pkg/usermem/usermem_test.go new file mode 100644 index 000000000..bf3c5df2b --- /dev/null +++ b/pkg/usermem/usermem_test.go @@ -0,0 +1,424 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "bytes" + "encoding/binary" + "fmt" + "reflect" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/syserror" +) + +// newContext returns a context.Context that we can use in these tests (we +// can't use contexttest because it depends on usermem). +func newContext() context.Context { + return context.Background() +} + +func newBytesIOString(s string) *BytesIO { + return &BytesIO{[]byte(s)} +} + +func TestBytesIOCopyOutSuccess(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfooE"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.CopyOut(newContext(), 1, []byte("foo"), IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("Afo"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInSuccess(t *testing.T) { + b := newBytesIOString("AfooE") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInFailure(t *testing.T) { + b := newBytesIOString("Afo") + var dst [3]byte + n, err := b.CopyIn(newContext(), 1, dst[:], IOOpts{}) + if wantN, wantErr := 2, syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst[:], []byte("fo\x00"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutSuccess(t *testing.T) { + b := newBytesIOString("ABCD") + n, err := b.ZeroOut(newContext(), 1, 2, IOOpts{}) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("A\x00\x00D"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOZeroOutFailure(t *testing.T) { + b := newBytesIOString("ABC") + n, err := b.ZeroOut(newContext(), 1, 3, IOOpts{}) + if wantN, wantErr := int64(2), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("A\x00\x00"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromSuccess(t *testing.T) { + b := newBytesIOString("ABCDEFGH") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOReader{bytes.NewBufferString("barfoo")}, IOOpts{}) + if wantN := int64(6); n != wantN || err != nil { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := b.Bytes, []byte("AfoobarH"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyOutFromFailure(t *testing.T) { + b := newBytesIOString("ABCDE") + n, err := b.CopyOutFrom(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOReader{bytes.NewBufferString("foobar")}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := b.Bytes, []byte("Afoob"); !bytes.Equal(got, want) { + t.Errorf("Bytes: got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToSuccess(t *testing.T) { + b := newBytesIOString("AfoobarH") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 4, End: 7}, + {Start: 1, End: 4}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN := int64(6); n != wantN || err != nil { + t.Errorf("CopyInTo: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst.Bytes(), []byte("barfoo"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +func TestBytesIOCopyInToFailure(t *testing.T) { + b := newBytesIOString("Afoob") + var dst bytes.Buffer + n, err := b.CopyInTo(newContext(), AddrRangeSeqFromSlice([]AddrRange{ + {Start: 1, End: 4}, + {Start: 4, End: 7}, + }), safemem.FromIOWriter{&dst}, IOOpts{}) + if wantN, wantErr := int64(4), syserror.EFAULT; n != wantN || err != wantErr { + t.Errorf("CopyOutFrom: got (%v, %v), wanted (%v, %v)", n, err, wantN, wantErr) + } + if got, want := dst.Bytes(), []byte("foob"); !bytes.Equal(got, want) { + t.Errorf("dst.Bytes(): got %q, wanted %q", got, want) + } +} + +type testStruct struct { + Int8 int8 + Uint8 uint8 + Int16 int16 + Uint16 uint16 + Int32 int32 + Uint32 uint32 + Int64 int64 + Uint64 uint64 +} + +func TestCopyObject(t *testing.T) { + wantObj := testStruct{1, 2, 3, 4, 5, 6, 7, 8} + wantN := binary.Size(wantObj) + b := &BytesIO{make([]byte, wantN)} + ctx := newContext() + if n, err := CopyObjectOut(ctx, b, 0, &wantObj, IOOpts{}); n != wantN || err != nil { + t.Fatalf("CopyObjectOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + var gotObj testStruct + if n, err := CopyObjectIn(ctx, b, 0, &gotObj, IOOpts{}); n != wantN || err != nil { + t.Errorf("CopyObjectIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if gotObj != wantObj { + t.Errorf("CopyObject round trip: got %+v, wanted %+v", gotObj, wantObj) + } +} + +func TestCopyStringInShort(t *testing.T) { + // Tests for string length <= copyStringIncrement. + want := strings.Repeat("A", copyStringIncrement-2) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInLong(t *testing.T) { + // Tests for copyStringIncrement < string length <= copyStringMaxInitBufLen + // (requiring multiple calls to IO.CopyIn()). + want := strings.Repeat("A", copyStringIncrement*3/4) + strings.Repeat("B", copyStringIncrement*3/4) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringIncrement, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInVeryLong(t *testing.T) { + // Tests for string length > copyStringMaxInitBufLen (requiring buffer + // reallocation). + want := strings.Repeat("A", copyStringMaxInitBufLen*3/4) + strings.Repeat("B", copyStringMaxInitBufLen*3/4) + mem := want + "\x00" + if got, err := CopyStringIn(newContext(), newBytesIOString(mem), 0, 2*copyStringMaxInitBufLen, IOOpts{}); got != want || err != nil { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, nil)", got, err, want) + } +} + +func TestCopyStringInNoTerminatingZeroByte(t *testing.T) { + want := strings.Repeat("A", copyStringIncrement-1) + got, err := CopyStringIn(newContext(), newBytesIOString(want), 0, 2*copyStringIncrement, IOOpts{}) + if wantErr := syserror.EFAULT; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) + } +} + +func TestCopyStringInTruncatedByMaxlen(t *testing.T) { + got, err := CopyStringIn(newContext(), newBytesIOString(strings.Repeat("A", 10)), 0, 5, IOOpts{}) + if want, wantErr := strings.Repeat("A", 5), syserror.ENAMETOOLONG; got != want || err != wantErr { + t.Errorf("CopyStringIn: got (%q, %v), wanted (%q, %v)", got, err, want, wantErr) + } +} + +func TestCopyInt32StringsInVec(t *testing.T) { + for _, test := range []struct { + str string + n int + initial []int32 + final []int32 + }{ + { + str: "100 200", + n: len("100 200"), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Fewer values ok + str: "100", + n: len("100"), + initial: []int32{1, 2}, + final: []int32{100, 2}, + }, + { + // Extra values ok + str: "100 200 300", + n: len("100 200 "), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + { + // Leading and trailing whitespace ok + str: " 100\t200\n", + n: len(" 100\t200\n"), + initial: []int32{1, 2}, + final: []int32{100, 200}, + }, + } { + t.Run(fmt.Sprintf("%q", test.str), func(t *testing.T) { + src := BytesIOSequence([]byte(test.str)) + dsts := append([]int32(nil), test.initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); n != int64(test.n) || err != nil { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (%d, nil)", n, err, test.n) + } + if !reflect.DeepEqual(dsts, test.final) { + t.Errorf("dsts: got %v, wanted %v", dsts, test.final) + } + }) + } +} + +func TestCopyInt32StringsInVecRequiresOneValidValue(t *testing.T) { + for _, s := range []string{"", "\n", "a123"} { + t.Run(fmt.Sprintf("%q", s), func(t *testing.T) { + src := BytesIOSequence([]byte(s)) + initial := []int32{1, 2} + dsts := append([]int32(nil), initial...) + if n, err := CopyInt32StringsInVec(newContext(), src.IO, src.Addrs, dsts, src.Opts); err != syserror.EINVAL { + t.Errorf("CopyInt32StringsInVec: got (%d, %v), wanted (_, %v)", n, err, syserror.EINVAL) + } + if !reflect.DeepEqual(dsts, initial) { + t.Errorf("dsts: got %v, wanted %v", dsts, initial) + } + }) + } +} + +func TestIOSequenceCopyOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // CopyOut limited by len(src). + n, err := s.CopyOut(newContext(), []byte("fo")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyOut limited by s.NumBytes(). + n, err = s.CopyOut(newContext(), []byte("obar")) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foob"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceCopyIn(t *testing.T) { + s := BytesIOSequence([]byte("foob")) + dst := []byte("ABCDEF") + + // CopyIn limited by len(dst). + n, err := s.CopyIn(newContext(), dst[:2]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foCDEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // CopyIn limited by s.Remaining(). + n, err = s.CopyIn(newContext(), dst[2:]) + if wantN := 2; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("foobEF"); !bytes.Equal(dst, want) { + t.Errorf("dst: got %q, wanted %q", dst, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceZeroOut(t *testing.T) { + buf := []byte("ABCD") + s := BytesIOSequence(buf) + + // ZeroOut limited by toZero. + n, err := s.ZeroOut(newContext(), 2) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("ZeroOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00CD"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(2); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // ZeroOut limited by s.NumBytes(). + n, err = s.ZeroOut(newContext(), 4) + if wantN := int64(2); n != wantN || err != nil { + t.Errorf("CopyOut: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if want := []byte("\x00\x00\x00\x00"); !bytes.Equal(buf, want) { + t.Errorf("buf: got %q, wanted %q", buf, want) + } + s = s.DropFirst(2) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} + +func TestIOSequenceTakeFirst(t *testing.T) { + s := BytesIOSequence([]byte("foobar")) + if got, want := s.NumBytes(), int64(6); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + s = s.TakeFirst(3) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + // TakeFirst(n) where n > s.NumBytes() is a no-op. + s = s.TakeFirst(9) + if got, want := s.NumBytes(), int64(3); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } + + var dst [3]byte + n, err := s.CopyIn(newContext(), dst[:]) + if wantN := 3; n != wantN || err != nil { + t.Errorf("CopyIn: got (%v, %v), wanted (%v, nil)", n, err, wantN) + } + if got, want := dst[:], []byte("foo"); !bytes.Equal(got, want) { + t.Errorf("dst: got %q, wanted %q", got, want) + } + s = s.DropFirst(3) + if got, want := s.NumBytes(), int64(0); got != want { + t.Errorf("NumBytes: got %v, wanted %v", got, want) + } +} diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go new file mode 100644 index 000000000..876783e78 --- /dev/null +++ b/pkg/usermem/usermem_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package usermem + +import ( + "unsafe" +) + +// stringFromImmutableBytes is equivalent to string(bs), except that it never +// copies even if escape analysis can't prove that bs does not escape. This is +// only valid if bs is never mutated after stringFromImmutableBytes returns. +func stringFromImmutableBytes(bs []byte) string { + // Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) +} diff --git a/pkg/usermem/usermem_x86.go b/pkg/usermem/usermem_x86.go new file mode 100644 index 000000000..8059b72d2 --- /dev/null +++ b/pkg/usermem/usermem_x86.go @@ -0,0 +1,38 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build amd64 i386 + +package usermem + +import "encoding/binary" + +const ( + // PageSize is the system page size. + PageSize = 1 << PageShift + + // HugePageSize is the system huge page size. + HugePageSize = 1 << HugePageShift + + // PageShift is the binary log of the system page size. + PageShift = 12 + + // HugePageShift is the binary log of the system huge page size. + HugePageShift = 21 +) + +var ( + // ByteOrder is the native byte order (little endian). + ByteOrder = binary.LittleEndian +) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index f3ebc0231..a96c80261 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -30,6 +30,7 @@ go_library( deps = [ "//pkg/abi", "//pkg/abi/linux", + "//pkg/context", "//pkg/control/server", "//pkg/cpuid", "//pkg/eventchannel", @@ -39,7 +40,6 @@ go_library( "//pkg/refs", "//pkg/sentry/arch", "//pkg/sentry/arch:registers_go_proto", - "//pkg/sentry/context", "//pkg/sentry/control", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", @@ -71,7 +71,6 @@ go_library( "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", "//pkg/sentry/usage", - "//pkg/sentry/usermem", "//pkg/sentry/watchdog", "//pkg/sync", "//pkg/syserror", @@ -88,6 +87,7 @@ go_library( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/urpc", + "//pkg/usermem", "//runsc/boot/filter", "//runsc/boot/platforms", "//runsc/specutils", @@ -111,7 +111,7 @@ go_test( "//pkg/control/server", "//pkg/log", "//pkg/p9", - "//pkg/sentry/context/contexttest", + "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", "//pkg/sync", diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index e5de1f3d7..417d2d5fb 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -17,7 +17,7 @@ package boot import ( "fmt" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/kernel" diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 421ccd255..0f62842ea 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -32,8 +32,8 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/gofer" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index bec0dc292..44aa63196 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" diff --git a/runsc/boot/user.go b/runsc/boot/user.go index 56cc12ee0..f0aa52135 100644 --- a/runsc/boot/user.go +++ b/runsc/boot/user.go @@ -22,10 +22,10 @@ import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" ) type fileReader struct { diff --git a/runsc/boot/user_test.go b/runsc/boot/user_test.go index 9aee2ad07..fb4e13dfb 100644 --- a/runsc/boot/user_test.go +++ b/runsc/boot/user_test.go @@ -23,7 +23,7 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) diff --git a/tools/go_marshal/defs.bzl b/tools/go_marshal/defs.bzl index 2918ceffe..d79786a68 100644 --- a/tools/go_marshal/defs.bzl +++ b/tools/go_marshal/defs.bzl @@ -54,8 +54,8 @@ go_marshal = rule( # marshal_deps are the dependencies requied by generated code. marshal_deps = [ "//tools/go_marshal/marshal", - "//pkg/sentry/platform/safecopy", - "//pkg/sentry/usermem", + "//pkg/safecopy", + "//pkg/usermem", ] # marshal_test_deps are required by test targets. diff --git a/tools/go_marshal/gomarshal/generator.go b/tools/go_marshal/gomarshal/generator.go index 8392f3f6d..af90bdecb 100644 --- a/tools/go_marshal/gomarshal/generator.go +++ b/tools/go_marshal/gomarshal/generator.go @@ -27,8 +27,8 @@ import ( const ( marshalImport = "gvisor.dev/gvisor/tools/go_marshal/marshal" - usermemImport = "gvisor.dev/gvisor/pkg/sentry/usermem" - safecopyImport = "gvisor.dev/gvisor/pkg/sentry/platform/safecopy" + safecopyImport = "gvisor.dev/gvisor/pkg/safecopy" + usermemImport = "gvisor.dev/gvisor/pkg/usermem" ) // List of identifiers we use in generated code, that may conflict a diff --git a/tools/go_marshal/test/BUILD b/tools/go_marshal/test/BUILD index 38ba49fed..e345e3a8e 100644 --- a/tools/go_marshal/test/BUILD +++ b/tools/go_marshal/test/BUILD @@ -15,7 +15,7 @@ go_test( deps = [ ":test", "//pkg/binary", - "//pkg/sentry/usermem", + "//pkg/usermem", "//tools/go_marshal/analysis", ], ) diff --git a/tools/go_marshal/test/benchmark_test.go b/tools/go_marshal/test/benchmark_test.go index e70db06d8..e12403741 100644 --- a/tools/go_marshal/test/benchmark_test.go +++ b/tools/go_marshal/test/benchmark_test.go @@ -22,7 +22,7 @@ import ( "testing" "gvisor.dev/gvisor/pkg/binary" - "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/tools/go_marshal/analysis" test "gvisor.dev/gvisor/tools/go_marshal/test" ) -- cgit v1.2.3 From 2862b0b1be9ce821e86877802b9608aad3102916 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 28 Jan 2020 15:04:34 -0800 Subject: Add //pkg/sentry/fsimpl/devtmpfs. PiperOrigin-RevId: 292021389 --- pkg/sentry/fsimpl/devtmpfs/BUILD | 33 +++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 187 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 119 ++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/BUILD | 1 + 4 files changed, 340 insertions(+) create mode 100644 pkg/sentry/fsimpl/devtmpfs/BUILD create mode 100644 pkg/sentry/fsimpl/devtmpfs/devtmpfs.go create mode 100644 pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD new file mode 100644 index 000000000..aa0c2ad8c --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "devtmpfs", + srcs = ["devtmpfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/sync", + ], +) + +go_test( + name = "devtmpfs_test", + size = "small", + srcs = ["devtmpfs_test.go"], + library = ":devtmpfs", + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/contexttest", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go new file mode 100644 index 000000000..d36fa74fb --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -0,0 +1,187 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package devtmpfs provides an implementation of /dev based on tmpfs, +// analogous to Linux's devtmpfs. +package devtmpfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct { + initOnce sync.Once + initErr error + + // fs is the tmpfs filesystem that backs all mounts of this FilesystemType. + // root is fs' root. fs and root are immutable. + fs *vfs.Filesystem + root *vfs.Dentry +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fst.initOnce.Do(func() { + fs, root, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, vfs.GetFilesystemOptions{ + Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init() + }) + if err != nil { + fst.initErr = err + return + } + fst.fs = fs + fst.root = root + }) + if fst.initErr != nil { + return nil, nil, fst.initErr + } + fst.fs.IncRef() + fst.root.IncRef() + return fst.fs, fst.root, nil +} + +// Accessor allows devices to create device special files in devtmpfs. +type Accessor struct { + vfsObj *vfs.VirtualFilesystem + mntns *vfs.MountNamespace + root vfs.VirtualDentry + creds *auth.Credentials +} + +// NewAccessor returns an Accessor that supports creation of device special +// files in the devtmpfs instance registered with name fsTypeName in vfsObj. +func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, err + } + return &Accessor{ + vfsObj: vfsObj, + mntns: mntns, + root: mntns.Root(), + creds: creds, + }, nil +} + +// Release must be called when a is no longer in use. +func (a *Accessor) Release() { + a.root.DecRef() + a.mntns.DecRef(a.vfsObj) +} + +// accessorContext implements context.Context by extending an existing +// context.Context with an Accessor's values for VFS-relevant state. +type accessorContext struct { + context.Context + a *Accessor +} + +func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { + return &accessorContext{ + Context: ctx, + a: a, + } +} + +// Value implements context.Context.Value. +func (ac *accessorContext) Value(key interface{}) interface{} { + switch key { + case vfs.CtxMountNamespace: + return ac.a.mntns + case vfs.CtxRoot: + ac.a.root.IncRef() + return ac.a.root + default: + return ac.Context.Value(key) + } +} + +func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation { + return &vfs.PathOperation{ + Root: a.root, + Start: a.root, + Path: fspath.Parse(pathname), + } +} + +// CreateDeviceFile creates a device special file at the given pathname in the +// devtmpfs instance accessed by the Accessor. +func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { + mode := (linux.FileMode)(perms) + switch kind { + case vfs.BlockDevice: + mode |= linux.S_IFBLK + case vfs.CharDevice: + mode |= linux.S_IFCHR + default: + panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind)) + } + // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't + // create, which it recognizes by storing a pointer to the kdevtmpfs struct + // thread in struct inode::i_private. Accessor doesn't yet support deletion + // of files at all, and probably won't as long as we don't need to support + // kernel modules, so this is moot for now. + return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ + Mode: mode, + DevMajor: major, + DevMinor: minor, + }) +} + +// UserspaceInit creates symbolic links and mount points in the devtmpfs +// instance accessed by the Accessor that are created by userspace in Linux. It +// does not create mounts. +func (a *Accessor) UserspaceInit(ctx context.Context) error { + actx := a.wrapContext(ctx) + + // systemd: src/shared/dev-setup.c:dev_setup() + for _, symlink := range []struct { + source string + target string + }{ + // /proc/kcore is not implemented. + {source: "fd", target: "/proc/self/fd"}, + {source: "stdin", target: "/proc/self/fd/0"}, + {source: "stdout", target: "/proc/self/fd/1"}, + {source: "stderr", target: "/proc/self/fd/2"}, + } { + if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil { + return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) + } + } + + // systemd: src/core/mount-setup.c:mount_table + for _, dir := range []string{ + "shm", + "pts", + } { + if err := a.vfsObj.MkdirAt(actx, a.creds, a.pathOperationAt(dir), &vfs.MkdirOptions{ + // systemd: src/core/mount-setup.c:mount_one() + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create directory %q: %v", dir, err) + } + } + + return nil +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go new file mode 100644 index 000000000..82c58c900 --- /dev/null +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -0,0 +1,119 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devtmpfs + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func TestDevtmpfs(t *testing.T) { + ctx := contexttest.Context(t) + creds := auth.CredentialsFromContext(ctx) + + vfsObj := vfs.New() + // Register tmpfs just so that we can have a root filesystem that isn't + // devtmpfs. + vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + vfsObj.MustRegisterFilesystemType("devtmpfs", &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + + // Create a test mount namespace with devtmpfs mounted at "/dev". + const devPath = "/dev" + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("failed to create tmpfs root mount: %v", err) + } + defer mntns.DecRef(vfsObj) + root := mntns.Root() + defer root.DecRef() + devpop := vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(devPath), + } + if err := vfsObj.MkdirAt(ctx, creds, &devpop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + t.Fatalf("failed to create mount point: %v", err) + } + if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil { + t.Fatalf("failed to mount devtmpfs: %v", err) + } + + a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs") + if err != nil { + t.Fatalf("failed to create devtmpfs.Accessor: %v", err) + } + defer a.Release() + + // Create "userspace-initialized" files using a devtmpfs.Accessor. + if err := a.UserspaceInit(ctx); err != nil { + t.Fatalf("failed to userspace-initialize devtmpfs: %v", err) + } + // Created files should be visible in the test mount namespace. + abspath := devPath + "/fd" + target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }) + if want := "/proc/self/fd"; err != nil || target != want { + t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want) + } + + // Create a dummy device special file using a devtmpfs.Accessor. + const ( + pathInDev = "dummy" + kind = vfs.CharDevice + major = 12 + minor = 34 + perms = 0600 + wantMode = linux.S_IFCHR | perms + ) + if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil { + t.Fatalf("failed to create device file: %v", err) + } + // The device special file should be visible in the test mount namespace. + abspath = devPath + "/" + pathInDev + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(abspath), + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE | linux.STATX_MODE, + }) + if err != nil { + t.Fatalf("failed to stat device file at %q: %v", abspath, err) + } + if stat.Mode != wantMode { + t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode) + } + if stat.RdevMajor != major { + t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major) + } + if stat.RdevMinor != minor { + t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor) + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index fb436860c..c61366224 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -27,6 +27,7 @@ go_library( "symlink.go", "tmpfs.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/amutex", -- cgit v1.2.3 From 396c574db276ae1424af7098b5cd917e2bed9921 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 28 Jan 2020 18:30:36 -0800 Subject: Add support for WritableSource in DynamicBytesFileDescriptionImpl WritableSource is a convenience interface used for files that can be written to, e.g. /proc/net/ipv4/tpc_sack. It reads max of 4KB and only from offset 0 which should cover most cases. It can be extended as neeed. Updates #1195 PiperOrigin-RevId: 292056924 --- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 84 +++++++++++-- pkg/sentry/vfs/file_description_impl_util.go | 76 +++++++++--- pkg/sentry/vfs/file_description_impl_util_test.go | 138 +++++++++++++++++----- 5 files changed, 250 insertions(+), 54 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 373f801ff..733792c78 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -108,12 +108,12 @@ func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return fd.FileDescriptionDefaultImpl.Write(ctx, src, opts) + return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return fd.FileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) + return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index e0cb9c47b..14bd334e8 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -69,7 +69,7 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen), + "sys": newSysDir(root, inoGen, k), "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "net": newNetDir(root, inoGen, k), diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index ad963870b..c7ce74883 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -21,12 +21,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // newSysDir returns the dentry corresponding to /proc/sys directory. -func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { +func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), @@ -38,18 +42,18 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}), "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), }), - "net": newSysNetDir(root, inoGen), + "net": newSysNetDir(root, inoGen, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. -func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ +func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { + var contents map[string]*kernfs.Dentry + + if stack := k.NetworkStack(); stack != nil { + contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - // Add tcp_sack. - // TODO(gvisor.dev/issue/1195): tcp_sack allows write(2) - // "tcp_sack": newTCPSackInode(ctx, msrc, s), + "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -103,7 +107,11 @@ func newSysNetDir(root *auth.Credentials, inoGen InoGenerator) *kernfs.Dentry { "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), }), - }), + } + } + + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ + "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents), }) } @@ -141,3 +149,61 @@ func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("\n") return nil } + +// tcpSackData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/tcp_sack. +// +// +stateify savable +type tcpSackData struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` + enabled *bool +} + +var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) + +// Generate implements vfs.DynamicBytesSource. +func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { + if d.enabled == nil { + sack, err := d.stack.TCPSACKEnabled() + if err != nil { + return err + } + d.enabled = &sack + } + + val := "0\n" + if *d.enabled { + // Technically, this is not quite compatible with Linux. Linux stores these + // as an integer, so if you write "2" into tcp_sack, you should get 2 back. + // Tough luck. + val = "1\n" + } + buf.WriteString(val) + return nil +} + +func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + if d.enabled == nil { + d.enabled = new(bool) + } + *d.enabled = v != 0 + return n, d.stack.SetTCPSACKEnabled(*d.enabled) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index fb9b87fdc..a4900c170 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -192,21 +192,6 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") } -// DynamicBytesFileDescriptionImpl may be embedded by implementations of -// FileDescriptionImpl that represent read-only regular files whose contents -// are backed by a bytes.Buffer that is regenerated when necessary, consistent -// with Linux's fs/seq_file.c:single_open(). -// -// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first -// use. -type DynamicBytesFileDescriptionImpl struct { - data DynamicBytesSource // immutable - mu sync.Mutex // protects the following fields - buf bytes.Buffer - off int64 - lastRead int64 // offset at which the last Read, PRead, or Seek ended -} - // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. type DynamicBytesSource interface { @@ -225,6 +210,30 @@ func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the +// underlying source. +type WritableDynamicBytesSource interface { + DynamicBytesSource + + // Write sends writes to the source. + Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) +} + +// DynamicBytesFileDescriptionImpl may be embedded by implementations of +// FileDescriptionImpl that represent read-only regular files whose contents +// are backed by a bytes.Buffer that is regenerated when necessary, consistent +// with Linux's fs/seq_file.c:single_open(). +// +// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first +// use. +type DynamicBytesFileDescriptionImpl struct { + data DynamicBytesSource // immutable + mu sync.Mutex // protects the following fields + buf bytes.Buffer + off int64 + lastRead int64 // offset at which the last Read, PRead, or Seek ended +} + // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data @@ -304,6 +313,43 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6 return offset, nil } +// Preconditions: fd.mu must be locked. +func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { + return 0, syserror.EOPNOTSUPP + } + + writable, ok := fd.data.(WritableDynamicBytesSource) + if !ok { + return 0, syserror.EINVAL + } + n, err := writable.Write(ctx, src, offset) + if err != nil { + return 0, err + } + + // Invalidate cached data that might exist prior to this call. + fd.buf.Reset() + return n, nil +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.pwriteLocked(ctx, src, offset, opts) + fd.mu.Unlock() + return n, err +} + +// Write implements FileDescriptionImpl.Write. +func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.pwriteLocked(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + // GenericConfigureMMap may be used by most implementations of // FileDescriptionImpl.ConfigureMMap. func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 0f44e7c8c..8fa26418e 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -35,61 +35,80 @@ type fileDescription struct { FileDescriptionDefaultImpl } -// genCountFD is a read-only FileDescriptionImpl representing a regular file -// that contains the number of times its DynamicBytesSource.Generate() +// genCount contains the number of times its DynamicBytesSource.Generate() // implementation has been called. -type genCountFD struct { +type genCount struct { + count uint64 // accessed using atomic memory ops +} + +// Generate implements DynamicBytesSource.Generate. +func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1)) + return nil +} + +type storeData struct { + data string +} + +var _ WritableDynamicBytesSource = (*storeData)(nil) + +// Generate implements DynamicBytesSource. +func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(d.data) + return nil +} + +// Generate implements WritableDynamicBytesSource. +func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + buf := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, buf) + if err != nil { + return 0, err + } + + d.data = string(buf[:n]) + return 0, nil +} + +// testFD is a read-only FileDescriptionImpl representing a regular file. +type testFD struct { fileDescription DynamicBytesFileDescriptionImpl - count uint64 // accessed using atomic memory ops + data DynamicBytesSource } -func newGenCountFD(vfsObj *VirtualFilesystem) *FileDescription { +func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { vd := vfsObj.NewAnonVirtualDentry("genCountFD") defer vd.DecRef() - var fd genCountFD - fd.vfsfd.Init(&fd, 0 /* statusFlags */, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) - fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd) + var fd testFD + fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) + fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) return &fd.vfsfd } // Release implements FileDescriptionImpl.Release. -func (fd *genCountFD) Release() { -} - -// StatusFlags implements FileDescriptionImpl.StatusFlags. -func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) { - return 0, nil +func (fd *testFD) Release() { } // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. -func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error { - return syserror.EPERM -} - // Stat implements FileDescriptionImpl.Stat. -func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { +func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { // Note that Statx.Mask == 0 in the return value. return linux.Statx{}, nil } // SetStat implements FileDescriptionImpl.SetStat. -func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error { +func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { return syserror.EPERM } -// Generate implements DynamicBytesSource.Generate. -func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1)) - return nil -} - func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) vfsObj := New() // vfs.New() - fd := newGenCountFD(vfsObj) + fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) defer fd.DecRef() // The first read causes Generate to be called to fill the FD's buffer. @@ -130,4 +149,69 @@ func TestGenCountFD(t *testing.T) { if want := byte('3'); buf[0] != want { t.Errorf("PRead: got byte %c, wanted %c", buf[0], want) } + + // Write and PWrite fails. + if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + } + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + } +} + +func TestWritable(t *testing.T) { + ctx := contexttest.Context(t) + + vfsObj := New() // vfs.New() + fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef() + + buf := make([]byte, 10) + ioseq := usermem.BytesIOSequence(buf) + if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF { + t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err) + } + if want := "init"; want == string(buf) { + t.Fatalf("Read: got %v, wanted %v", string(buf), want) + } + + // Test PWrite. + want := "write" + writeIOSeq := usermem.BytesIOSequence([]byte(want)) + if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil { + t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) + } + if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { + t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) + } + if want == string(buf) { + t.Fatalf("PRead: got %v, wanted %v", string(buf), want) + } + + // Test Seek to 0 followed by Write. + want = "write2" + writeIOSeq = usermem.BytesIOSequence([]byte(want)) + if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil { + t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) + } + if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil { + t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) + } + if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { + t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) + } + if want == string(buf) { + t.Fatalf("PRead: got %v, wanted %v", string(buf), want) + } + + // Test failure if offset != 0. + if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil { + t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) + } + if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL { + t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err) + } + if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL { + t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err) + } } -- cgit v1.2.3 From 6c3072243dfbf70062de5f610e14fd6ed2ce5f32 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 31 Jan 2020 14:14:52 -0800 Subject: Implement file locks for regular tmpfs files in VFSv2. Add a file lock implementation that can be embedded into various filesystem implementations. Updates #1480 PiperOrigin-RevId: 292614758 --- pkg/sentry/fsimpl/tmpfs/BUILD | 3 ++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 23 +++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 56 ++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 43 +++++++++++++++++ pkg/sentry/vfs/lock/BUILD | 13 +++++ pkg/sentry/vfs/lock/lock.go | 72 ++++++++++++++++++++++++++++ 6 files changed, 210 insertions(+) create mode 100644 pkg/sentry/vfs/lock/BUILD create mode 100644 pkg/sentry/vfs/lock/lock.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index c61366224..57abd5583 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -38,6 +38,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", @@ -47,6 +48,7 @@ go_library( "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -86,6 +88,7 @@ go_test( "//pkg/context", "//pkg/fspath", "//pkg/sentry/contexttest", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index e9e6faf67..dab346a41 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -192,6 +193,28 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { return nil } +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { + return fd.inode().lockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { + fd.inode().unlockBSD(uid) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { + return fd.inode().lockPOSIX(uid, t, rng, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { + fd.inode().unlockPOSIX(uid, rng) + return nil +} + // regularFileReadWriter implements safemem.Reader and Safemem.Writer. type regularFileReadWriter struct { file *regularFile diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 32552e261..2b52992ea 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -24,9 +24,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -260,6 +262,60 @@ func TestPWrite(t *testing.T) { } } +func TestLocks(t *testing.T) { + ctx := contexttest.Context(t) + fd, cleanup, err := newFileFD(ctx, 0644) + if err != nil { + t.Fatal(err) + } + defer cleanup() + + var ( + uid1 lock.UniqueID + uid2 lock.UniqueID + // Non-blocking. + block lock.Blocker + ) + + uid1 = 123 + uid2 = 456 + + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { + t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) + } + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) + } + + rng1 := lock.LockRange{0, 1} + rng2 := lock.LockRange{1, 2} + + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) + } + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want { + t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) + } + if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) + } +} + func TestPRead(t *testing.T) { ctx := contexttest.Context(t) fd, cleanup, err := newFileFD(ctx, 0644) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 88dbd6e35..2108d0f4d 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -30,10 +30,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -153,6 +155,9 @@ type inode struct { rdevMajor uint32 rdevMinor uint32 + // Advisory file locks, which lock at the inode level. + locks lock.FileLocks + impl interface{} // immutable } @@ -352,6 +357,44 @@ func (i *inode) setStat(stat linux.Statx) error { return nil } +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockBSD(uid, t, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockBSD(uid fslock.UniqueID) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockBSD(uid) + return nil + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + switch i.impl.(type) { + case *regularFile: + return i.locks.LockPOSIX(uid, t, rng, block) + } + return syserror.EBADF +} + +// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. +func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { + switch i.impl.(type) { + case *regularFile: + i.locks.UnlockPOSIX(uid, rng) + return nil + } + return syserror.EBADF +} + // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD new file mode 100644 index 000000000..d9ab063b7 --- /dev/null +++ b/pkg/sentry/vfs/lock/BUILD @@ -0,0 +1,13 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "lock", + srcs = ["lock.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/fs/lock", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go new file mode 100644 index 000000000..724dfe743 --- /dev/null +++ b/pkg/sentry/vfs/lock/lock.go @@ -0,0 +1,72 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. +// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package lock + +import ( + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. +func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. +func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { + fl.posix.UnlockRegion(uid, rng) +} -- cgit v1.2.3 From 492229d0176c1af2ab4ea4cf91bf211e940b5b12 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 4 Feb 2020 11:28:36 -0800 Subject: VFS2 gofer client Updates #1198 Opening host pipes (by spinning in fdpipe) and host sockets is not yet complete, and will be done in a future CL. Major differences from VFS1 gofer client (sentry/fs/gofer), with varying levels of backportability: - "Cache policies" are replaced by InteropMode, which control the behavior of timestamps in addition to caching. Under InteropModeExclusive (analogous to cacheAll) and InteropModeWritethrough (analogous to cacheAllWritethrough), client timestamps are *not* written back to the server (it is not possible in 9P or Linux for clients to set ctime, so writing back client-authoritative timestamps results in incoherence between atime/mtime and ctime). Under InteropModeShared (analogous to cacheRemoteRevalidating), client timestamps are not used at all (remote filesystem clocks are authoritative). cacheNone is translated to InteropModeShared + new option filesystemOptions.specialRegularFiles. - Under InteropModeShared, "unstable attribute" reloading for permission checks, lookup, and revalidation are fused, which is feasible in VFS2 since gofer.filesystem controls path resolution. This results in a ~33% reduction in RPCs for filesystem operations compared to cacheRemoteRevalidating. For example, consider stat("/foo/bar/baz") where "/foo/bar/baz" fails revalidation, resulting in the instantiation of a new dentry: VFS1 RPCs: getattr("/") // fs.MountNamespace.FindLink() => fs.Inode.CheckPermission() => gofer.inodeOperations.check() => gofer.inodeOperations.UnstableAttr() walkgetattr("/", "foo") = fid1 // fs.Dirent.walk() => gofer.session.Revalidate() => gofer.cachePolicy.Revalidate() clunk(fid1) getattr("/foo") // CheckPermission walkgetattr("/foo", "bar") = fid2 // Revalidate clunk(fid2) getattr("/foo/bar") // CheckPermission walkgetattr("/foo/bar", "baz") = fid3 // Revalidate clunk(fid3) walkgetattr("/foo/bar", "baz") = fid4 // fs.Dirent.walk() => gofer.inodeOperations.Lookup getattr("/foo/bar/baz") // linux.stat() => gofer.inodeOperations.UnstableAttr() VFS2 RPCs: getattr("/") // gofer.filesystem.walkExistingLocked() walkgetattr("/", "foo") = fid1 // gofer.filesystem.stepExistingLocked() clunk(fid1) // No getattr: walkgetattr already updated metadata for permission check walkgetattr("/foo", "bar") = fid2 clunk(fid2) walkgetattr("/foo/bar", "baz") = fid3 // No clunk: fid3 used for new gofer.dentry // No getattr: walkgetattr already updated metadata for stat() - gofer.filesystem.unlinkAt() does not require instantiation of a dentry that represents the file to be deleted. Updates #898. - gofer.regularFileFD.OnClose() skips Tflushf for regular files under InteropModeExclusive, as it's nonsensical to request a remote file flush without flushing locally-buffered writes to that remote file first. - Symlink targets are cached when InteropModeShared is not in effect. - p9.QID.Path (which is already required to be unique for each file within a server, and is accordingly already synthesized from device/inode numbers in all known gofers) is used as-is for inode numbers, rather than being mapped along with attr.RDev in the client to yet another synthetic inode number. - Relevant parts of fsutil.CachingInodeOperations are inlined directly into gofer package code. This avoids having to duplicate part of its functionality in fsutil.HostMappable. PiperOrigin-RevId: 293190213 --- pkg/safemem/seq_unsafe.go | 17 + pkg/sentry/fs/fsutil/BUILD | 4 +- pkg/sentry/fs/fsutil/frame_ref_set.go | 13 +- pkg/sentry/fs/fsutil/inode_cached.go | 2 +- pkg/sentry/fsimpl/gofer/BUILD | 55 ++ pkg/sentry/fsimpl/gofer/directory.go | 190 +++++ pkg/sentry/fsimpl/gofer/filesystem.go | 1087 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/gofer.go | 1147 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/handle.go | 135 ++++ pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 ++ pkg/sentry/fsimpl/gofer/p9file.go | 219 ++++++ pkg/sentry/fsimpl/gofer/pagemath.go | 31 + pkg/sentry/fsimpl/gofer/regular_file.go | 860 ++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/special_file.go | 159 +++++ pkg/sentry/fsimpl/gofer/symlink.go | 47 ++ pkg/sentry/fsimpl/gofer/time.go | 75 ++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/socket/hostinet/socket.go | 23 +- 18 files changed, 4103 insertions(+), 29 deletions(-) create mode 100644 pkg/sentry/fsimpl/gofer/BUILD create mode 100644 pkg/sentry/fsimpl/gofer/directory.go create mode 100644 pkg/sentry/fsimpl/gofer/filesystem.go create mode 100644 pkg/sentry/fsimpl/gofer/gofer.go create mode 100644 pkg/sentry/fsimpl/gofer/handle.go create mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go create mode 100644 pkg/sentry/fsimpl/gofer/p9file.go create mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go create mode 100644 pkg/sentry/fsimpl/gofer/regular_file.go create mode 100644 pkg/sentry/fsimpl/gofer/special_file.go create mode 100644 pkg/sentry/fsimpl/gofer/symlink.go create mode 100644 pkg/sentry/fsimpl/gofer/time.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/safemem/seq_unsafe.go b/pkg/safemem/seq_unsafe.go index 354a95dde..dcdfc9600 100644 --- a/pkg/safemem/seq_unsafe.go +++ b/pkg/safemem/seq_unsafe.go @@ -18,6 +18,7 @@ import ( "bytes" "fmt" "reflect" + "syscall" "unsafe" ) @@ -297,3 +298,19 @@ func ZeroSeq(dsts BlockSeq) (uint64, error) { } return done, nil } + +// IovecsFromBlockSeq returns a []syscall.Iovec representing seq. +func IovecsFromBlockSeq(bs BlockSeq) []syscall.Iovec { + iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) + for ; !bs.IsEmpty(); bs = bs.Tail() { + b := bs.Head() + iovs = append(iovs, syscall.Iovec{ + Base: &b.ToSlice()[0], + Len: uint64(b.Len()), + }) + // We don't need to care about b.NeedSafecopy(), because the host + // kernel will handle such address ranges just fine (by returning + // EFAULT). + } + return iovs +} diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 4ab2a384f..789369220 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -28,13 +28,13 @@ go_template_instance( "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", - prefix = "frameRef", + prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", "Range": "platform.FileRange", "Value": "uint64", - "Functions": "frameRefSetFunctions", + "Functions": "FrameRefSetFunctions", }, ) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd63db32b..6564fd0c6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -20,24 +20,25 @@ import ( "gvisor.dev/gvisor/pkg/sentry/platform" ) -type frameRefSetFunctions struct{} +// FrameRefSetFunctions implements segment.Functions for FrameRefSet. +type FrameRefSetFunctions struct{} // MinKey implements segment.Functions.MinKey. -func (frameRefSetFunctions) MinKey() uint64 { +func (FrameRefSetFunctions) MinKey() uint64 { return 0 } // MaxKey implements segment.Functions.MaxKey. -func (frameRefSetFunctions) MaxKey() uint64 { +func (FrameRefSetFunctions) MaxKey() uint64 { return math.MaxUint64 } // ClearValue implements segment.Functions.ClearValue. -func (frameRefSetFunctions) ClearValue(val *uint64) { +func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -45,6 +46,6 @@ func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 573b8586e..800c8b4e1 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -111,7 +111,7 @@ type CachingInodeOperations struct { // refs tracks active references to data in the cache. // // refs is protected by dataMu. - refs frameRefSet + refs FrameRefSet } // CachingInodeOperationsOptions configures a CachingInodeOperations. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD new file mode 100644 index 000000000..4ba76a1e8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -0,0 +1,55 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "gofer", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*dentry", + "Linker": "*dentry", + }, +) + +go_library( + name = "gofer", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "gofer.go", + "handle.go", + "handle_unsafe.go", + "p9file.go", + "pagemath.go", + "regular_file.go", + "special_file.go", + "symlink.go", + "time.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fd", + "//pkg/fspath", + "//pkg/log", + "//pkg/p9", + "//pkg/safemem", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/pgalloc", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/unet", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go new file mode 100644 index 000000000..baa2cdd8e --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -0,0 +1,190 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return d.fileType() == linux.S_IFDIR +} + +// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != +// InteropModeShared. +func (d *dentry) cacheNegativeChildLocked(name string) { + if d.negativeChildren == nil { + d.negativeChildren = make(map[string]struct{}) + } + d.negativeChildren[name] = struct{}{} +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + if fd.dirents == nil { + ds, err := fd.dentry().getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if !cb.Handle(fd.dirents[fd.off]) { + return nil + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). There exists at least one directoryFD representing d. +func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + // 9P2000.L's readdir does not specify behavior in the presence of + // concurrent mutation of an iterated directory, so implementations may + // duplicate or omit entries in this case, which violates POSIX semantics. + // Thus we read all directory entries while holding d.dirMu to exclude + // directory mutations. (Note that it is impossible for the client to + // exclude concurrent mutation from other remote filesystem users. Since + // there is no way to detect if the server has incorrectly omitted + // directory entries, we simply assume that the server is well-behaved + // under InteropModeShared.) This is inconsistent with Linux (which appears + // to assume that directory fids have the correct semantics, and translates + // struct file_operations::readdir calls directly to readdir RPCs), but is + // consistent with VFS1. + + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + if d.dirents != nil { + return d.dirents, nil + } + + // It's not clear if 9P2000.L's readdir is expected to return "." and "..", + // so we generate them here. + parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + defer d.handleMu.RUnlock() + if !d.handleReadable { + // This should not be possible because a readable handle should have + // been opened when the calling directoryFD was opened. + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + return nil, err + } + if len(p9ds) == 0 { + // Cache dirents for future directoryFDs if permitted. + if d.fs.opts.interop != InteropModeShared { + d.dirents = dirents + } + return dirents, nil + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + } + off = p9ds[len(p9ds)-1].Offset + } +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go new file mode 100644 index 000000000..8eb61debf --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -0,0 +1,1087 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // Snapshot current dentries and special files. + fs.syncMu.Lock() + ds := make([]*dentry, 0, len(fs.dentries)) + for d := range fs.dentries { + ds = append(ds, d) + } + sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) + for sffd := range fs.specialFileFDs { + sffds = append(sffds, sffd) + } + fs.syncMu.Unlock() + + // Return the first error we encounter, but sync everything we can + // regardless. + var retErr error + + // Sync regular files. + for _, d := range ds { + if !d.TryIncRef() { + continue + } + err := d.syncSharedHandle(ctx) + d.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + // Sync special files, which may be writable but do not use dentry shared + // handles (so they won't be synced by the above). + for _, sffd := range sffds { + if !sffd.vfsfd.TryIncRef() { + continue + } + err := sffd.Sync(ctx) + sffd.vfsfd.DecRef() + if err != nil && retErr == nil { + retErr = err + } + } + + return retErr +} + +// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's +// encoding of strings, which uses 2 bytes for the length prefix. +const maxFilenameLen = (1 << 16) - 1 + +// dentrySlicePool is a pool of *[]*dentry used to store dentries for which +// dentry.checkCachingLocked() must be called. The pool holds pointers to +// slices because Go lacks generics, so sync.Pool operates on interface{}, so +// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy +// of the slice header on the heap. +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may become cached as a result of the traversal are appended +// to *ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached +// metadata must be up to date. +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + parentVFSD, err := rp.ResolveParent(&d.vfsd) + if err != nil { + return nil, err + } + parent := parentVFSD.Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // We must assume that parentVFSD is correct, because if d has been + // moved elsewhere in the remote filesystem so that its parent has + // changed, we have no way of determining its new parent's location + // in the filesystem. Get updated metadata for parentVFSD. + _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + if err != nil { + return nil, err + } + parent.updateFromP9Attrs(attrMask, &attr) + } + rp.Advance() + return parent, nil + } + childVFSD, err := rp.ResolveChild(&d.vfsd, name) + if err != nil { + return nil, err + } + // FIXME(jamieliu): Linux performs revalidation before mount lookup + // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), + // __follow_mount_rcu()). + child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + if err != nil { + return nil, err + } + if child == nil { + return nil, syserror.ENOENT + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) +// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be +// nil) to verify that the returned child (or lack thereof) is correct. If no file +// exists at name, revalidateChildLocked returns (nil, nil). +// +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +// parent.isDir(). name is not "." or "..". +// +// Postconditions: If revalidateChildLocked returns a non-nil dentry, its +// cached metadata is up to date. +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { + if childVFSD != nil && fs.opts.interop != InteropModeShared { + // We have a cached dentry that is assumed to be correct. + return childVFSD.Impl().(*dentry), nil + } + // We either don't have a cached dentry or need to verify that it's still + // correct, either of which requires a remote lookup. Check if this name is + // valid before performing the lookup. + if len(name) > maxFilenameLen { + return nil, syserror.ENAMETOOLONG + } + // Check if we've already cached this lookup with a negative result. + if _, ok := parent.negativeChildren[name]; ok { + return nil, nil + } + // Perform the remote lookup. + qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) + if err != nil && err != syserror.ENOENT { + return nil, err + } + if childVFSD != nil { + child := childVFSD.Impl().(*dentry) + if !file.isNil() && qid.Path == child.ino { + // The file at this path hasn't changed. Just update cached + // metadata. + file.close(ctx) + child.updateFromP9Attrs(attrMask, &attr) + return child, nil + } + // The file at this path has changed or no longer exists. Remove + // the stale dentry from the tree, and re-evaluate its caching + // status (i.e. if it has 0 references, drop it). + vfsObj.ForceDeleteDentry(childVFSD) + *ds = appendDentry(*ds, child) + childVFSD = nil + } + if file.isNil() { + // No file exists at this path now. Cache the negative lookup if + // allowed. + if fs.opts.interop != InteropModeShared { + parent.cacheNegativeChildLocked(name) + } + return nil, nil + } + // Create a new dentry representing the file. + child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + if err != nil { + file.close(ctx) + return nil, err + } + parent.IncRef() // reference held by child on its parent + parent.vfsd.InsertChild(&child.vfsd, name) + // For now, child has 0 references, so our caller should call + // child.checkCachingLocked(). + *ds = appendDentry(*ds, child) + return child, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == +// InteropModeShared, then d's cached metadata must be up to date. +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for rp.Start() as required by fs.stepLocked(). + if err := d.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if parent.isDeleted() { + return syserror.ENOENT + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if len(name) > maxFilenameLen { + return syserror.ENAMETOOLONG + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + if fs.opts.interop == InteropModeShared { + // The existence of a dentry at name would be inconclusive because the + // file it represents may have been deleted from the remote filesystem, + // so we would need to make an RPC to revalidate the dentry. Just + // attempt the file creation RPC instead. If a file does exist, the RPC + // will fail with EEXIST like we would have. If the RPC succeeds, and a + // stale dentry exists, the dentry will fail revalidation next time + // it's used. + return create(parent, name) + } + if parent.vfsd.Child(name) != nil { + return syserror.EEXIST + } + // No cached dentry exists; however, there might still be an existing file + // at name. As above, we attempt the file creation RPC anyway. + if err := create(parent, name); err != nil { + return err + } + parent.touchCMtime(ctx) + delete(parent.negativeChildren, name) + parent.dirents = nil + return nil +} + +// Preconditions: !rp.Done(). +func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return err + } + } + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + + name := rp.Component() + if dir { + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + } else { + if name == "." || name == ".." { + return syserror.EISDIR + } + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + childVFSD := parent.vfsd.Child(name) + var child *dentry + // We only need a dentry representing the file at name if it can be a mount + // point. If childVFSD is nil, then it can't be a mount point. If childVFSD + // is non-nil but stale, the actual file can't be a mount point either; we + // detect this case by just speculatively calling PrepareDeleteDentry and + // only revalidating the dentry if that fails (indicating that the existing + // dentry is a mount point). + if childVFSD != nil { + child = childVFSD.Impl().(*dentry) + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if err != nil { + return err + } + if child != nil { + childVFSD = &child.vfsd + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + return err + } + } else { + childVFSD = nil + } + } + } else if _, ok := parent.negativeChildren[name]; ok { + return syserror.ENOENT + } + flags := uint32(0) + if dir { + if child != nil && !child.isDir() { + return syserror.ENOTDIR + } + flags = linux.AT_REMOVEDIR + } else { + if child != nil && child.isDir() { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + } + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if childVFSD != nil { + vfsObj.AbortDeleteDentry(childVFSD) + } + return err + } + if fs.opts.interop != InteropModeShared { + parent.touchCMtime(ctx) + parent.cacheNegativeChildLocked(name) + parent.dirents = nil + } + if child != nil { + child.setDeleted() + vfsObj.CommitDeleteDentry(childVFSD) + ds = appendDentry(ds, child) + } + return nil +} + +// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls +// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkCachingLocked() + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by + // fs.walkParentDirLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + // 9P2000.L supports hard links, but we don't. + return syserror.EPERM + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Reject O_TMPFILE, which is not supported; supporting it correctly in the + // presence of other remote filesystem users requires remote filesystem + // support, and it isn't clear that there's any way to implement this in + // 9P. + if opts.Flags&linux.O_TMPFILE != 0 { + return nil, syserror.EOPNOTSUPP + } + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + + start := rp.Start().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + // Get updated metadata for start as required by fs.stepLocked(). + if err := start.updateFromGetattr(ctx); err != nil { + return nil, err + } + } + if rp.Done() { + return start.openLocked(ctx, rp, opts.Flags) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, &ds) + if err == syserror.ENOENT && mayCreate { + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + parent.dirMu.Unlock() + return fd, err + } + if err != nil { + parent.dirMu.Unlock() + return nil, err + } + // Open existing child or follow symlink. + parent.dirMu.Unlock() + if mustCreate { + return nil, syserror.EEXIST + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx, rp.Mount()) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, opts.Flags) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + return nil, err + } + mnt := rp.Mount() + filetype := d.fileType() + switch { + case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFDIR: + // Can't open directories with O_CREAT. + if flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } + fd := &directoryFD{} + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil + case filetype == linux.S_IFLNK: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + default: + if flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0) + if err != nil { + return nil, err + } + fd := &specialFileFD{ + handle: h, + } + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil + } +} + +// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if d.isDeleted() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + // 9P2000.L's lcreate takes a fid representing the parent directory, and + // converts it into an open fid representing the created file, so we need + // to duplicate the directory fid first. + _, dirfile, err := d.file.walk(ctx, nil) + if err != nil { + return nil, err + } + creds := rp.Credentials() + name := rp.Component() + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + if err != nil { + dirfile.close(ctx) + return nil, err + } + // Then we need to walk to the file we just created to get a non-open fid + // representing it, and to get its metadata. This must use d.file since, as + // explained above, dirfile was invalidated by dirfile.Create(). + walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + if err != nil { + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Sanity-check that we walked to the file we created. + if createQID.Path != walkQID.Path { + // Probably due to concurrent remote filesystem mutation? + ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, syserror.EAGAIN + } + + // Construct the new dentry. + child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) + if err != nil { + nonOpenFile.close(ctx) + openFile.close(ctx) + if fdobj != nil { + fdobj.Close() + } + return nil, err + } + // Incorporate the fid that was opened by lcreate. + useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD + if useRegularFileFD { + child.handleMu.Lock() + child.handle.file = openFile + if fdobj != nil { + child.handle.fd = int32(fdobj.Release()) + } + child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) + child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) + child.handleMu.Unlock() + } + // Take a reference on the new dentry to be held by the new file + // description. (This reference also means that the new dentry is not + // eligible for caching yet, so we don't need to append to a dentry slice.) + child.refs = 1 + // Insert the dentry into the tree. + d.IncRef() // reference held by child on its parent d + d.vfsd.InsertChild(&child.vfsd, name) + if d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + delete(d.negativeChildren, name) + d.dirents = nil + } + + // Finally, construct a file description representing the created file. + var childVFSFD *vfs.FileDescription + mnt.IncRef() + if useRegularFileFD { + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + childVFSFD = &fd.vfsfd + } else { + fd := &specialFileFD{ + handle: handle{ + file: openFile, + fd: -1, + }, + } + if fdobj != nil { + fd.handle.fd = int32(fdobj.Release()) + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + fd.handle.close(ctx) + return nil, err + } + childVFSFD = &fd.vfsfd + } + return childVFSFD, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + if !d.isSymlink() { + return "", syserror.EINVAL + } + return d.readlink(ctx, rp.Mount()) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + // Requires 9P support. + return syserror.EINVAL + } + + var ds *[]*dentry + fs.renameMu.Lock() + defer fs.renameMuUnlockAndCheckCaching(&ds) + newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + oldParent := oldParentVD.Dentry().Impl().(*dentry) + if fs.opts.interop == InteropModeShared { + if err := oldParent.updateFromGetattr(ctx); err != nil { + return err + } + } + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + vfsObj := rp.VirtualFilesystem() + // We need a dentry representing the renamed file since, if it's a + // directory, we need to check for write permission on it. + oldParent.dirMu.Lock() + defer oldParent.dirMu.Unlock() + renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + if err != nil { + return err + } + if renamed == nil { + return syserror.ENOENT + } + if renamed.isDir() { + if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + return syserror.EINVAL + } + if oldParent != newParent { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return err + } + } + } else { + if opts.MustBeDir || rp.MustBeDir() { + return syserror.ENOTDIR + } + } + + if oldParent != newParent { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return err + } + newParent.dirMu.Lock() + defer newParent.dirMu.Unlock() + } + if newParent.isDeleted() { + return syserror.ENOENT + } + replacedVFSD := newParent.vfsd.Child(newName) + var replaced *dentry + // This is similar to unlinkAt, except: + // + // - We revalidate the replaced dentry unconditionally for simplicity. + // + // - If rp.MustBeDir(), then we need a dentry representing the replaced + // file regardless to confirm that it's a directory. + if replacedVFSD != nil || rp.MustBeDir() { + replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) + if err != nil { + return err + } + if replaced != nil { + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } + replacedVFSD = &replaced.vfsd + } else { + replacedVFSD = nil + } + } + + if oldParent == newParent && oldName == newName { + return nil + } + if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil { + return err + } + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + if fs.opts.interop != InteropModeShared { + oldParent.cacheNegativeChildLocked(oldName) + oldParent.dirents = nil + delete(newParent.negativeChildren, newName) + newParent.dirents = nil + } + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) + return nil +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, true /* dir */) +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + // Since walking updates metadata for all traversed dentries under + // InteropModeShared, including the returned one, we can return cached + // metadata here regardless of fs.opts.interop. + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + fsstat, err := d.file.statFS(ctx) + if err != nil { + return linux.Statfs{}, err + } + nameLen := uint64(fsstat.NameLength) + if nameLen > maxFilenameLen { + nameLen = maxFilenameLen + } + return linux.Statfs{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + BlockSize: int64(fsstat.BlockSize), + Blocks: fsstat.Blocks, + BlocksFree: fsstat.BlocksFree, + BlocksAvailable: fsstat.BlocksAvailable, + Files: fsstat.Files, + FilesFree: fsstat.FilesFree, + NameLength: nameLen, + }, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { + creds := rp.Credentials() + _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + return err + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + return fs.unlinkAt(ctx, rp, false /* dir */) +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + return d.listxattr(ctx) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return d.getxattr(ctx, name) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.setxattr(ctx, &opts) +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.removexattr(ctx, name) +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return vfs.GenericPrependPath(vfsroot, vd, b) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go new file mode 100644 index 000000000..d0552bd99 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -0,0 +1,1147 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gofer provides a filesystem implementation that is backed by a 9p +// server, interchangably referred to as "gofers" throughout this package. +// +// Lock order: +// regularFileFD/directoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// filesystem.syncMu +// dentry.metadataMu +// *** "memmap.Mappable locks" below this point +// dentry.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point +// dentry.handleMu +// dentry.dataMu +// +// Locking dentry.dirMu in multiple dentries requires holding +// filesystem.renameMu for writing. +package gofer + +import ( + "fmt" + "strconv" + "sync" + "sync/atomic" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/usermem" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // mfp is used to allocate memory that caches regular file contents. mfp is + // immutable. + mfp pgalloc.MemoryFileProvider + + // Immutable options. + opts filesystemOptions + + // client is the client used by this filesystem. client is immutable. + client *p9.Client + + // uid and gid are the effective KUID and KGID of the filesystem's creator, + // and are used as the owner and group for files that don't specify one. + // uid and gid are immutable. + uid auth.KUID + gid auth.KGID + + // renameMu serves two purposes: + // + // - It synchronizes path resolution with renaming initiated by this + // client. + // + // - It is held by path resolution to ensure that reachable dentries remain + // valid. A dentry is reachable by path resolution if it has a non-zero + // reference count (such that it is usable as vfs.ResolvingPath.Start() or + // is reachable from its children), or if it is a child dentry (such that + // it is reachable from its parent). + renameMu sync.RWMutex + + // cachedDentries contains all dentries with 0 references. (Due to race + // conditions, it may also contain dentries with non-zero references.) + // cachedDentriesLen is the number of dentries in cachedDentries. These + // fields are protected by renameMu. + cachedDentries dentryList + cachedDentriesLen uint64 + + // dentries contains all dentries in this filesystem. specialFileFDs + // contains all open specialFileFDs. These fields are protected by syncMu. + syncMu sync.Mutex + dentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} +} + +type filesystemOptions struct { + // "Standard" 9P options. + fd int + aname string + interop InteropMode // derived from the "cache" mount option + msize uint32 + version string + + // maxCachedDentries is the maximum number of dentries with 0 references + // retained by the client. + maxCachedDentries uint64 + + // If forcePageCache is true, host FDs may not be used for application + // memory mappings even if available; instead, the client must perform its + // own caching of regular file pages. This is primarily useful for testing. + forcePageCache bool + + // If limitHostFDTranslation is true, apply maxFillRange() constraints to + // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This + // makes memory accounting behavior more consistent between cases where + // host FDs are / are not available, but may increase the frequency of + // sentry-handled page faults on files for which a host FD is available. + limitHostFDTranslation bool + + // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote + // filesystem may not be coherent with writable host FDs opened later, so + // mappings of the former must be replaced by mappings of the latter. This + // is usually only the case when the remote filesystem is an overlayfs + // mount on Linux < 4.19. + overlayfsStaleRead bool + + // If regularFilesUseSpecialFileFD is true, application FDs representing + // regular files will use distinct file handles for each FD, in the same + // way that application FDs representing "special files" such as sockets + // do. Note that this disables client caching and mmap for regular files. + regularFilesUseSpecialFileFD bool +} + +// InteropMode controls the client's interaction with other remote filesystem +// users. +type InteropMode uint32 + +const ( + // InteropModeExclusive is appropriate when the filesystem client is the + // only user of the remote filesystem. + // + // - The client may cache arbitrary filesystem state (file data, metadata, + // filesystem structure, etc.). + // + // - Client changes to filesystem state may be sent to the remote + // filesystem asynchronously, except when server permission checks are + // necessary. + // + // - File timestamps are based on client clocks. This ensures that users of + // the client observe timestamps that are coherent with their own clocks + // and consistent with Linux's semantics. However, since it is not always + // possible for clients to set arbitrary atimes and mtimes, and never + // possible for clients to set arbitrary ctimes, file timestamp changes are + // stored in the client only and never sent to the remote filesystem. + InteropModeExclusive InteropMode = iota + + // InteropModeWritethrough is appropriate when there are read-only users of + // the remote filesystem that expect to observe changes made by the + // filesystem client. + // + // - The client may cache arbitrary filesystem state. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on client clocks. As a corollary, access + // timestamp changes from other remote filesystem users will not be visible + // to the client. + InteropModeWritethrough + + // InteropModeShared is appropriate when there are users of the remote + // filesystem that may mutate its state other than the client. + // + // - The client must verify cached filesystem state before using it. + // + // - Client changes to filesystem state must be sent to the remote + // filesystem synchronously. + // + // - File timestamps are based on server clocks. This is necessary to + // ensure that timestamp changes are synchronized between remote filesystem + // users. + // + // Note that the correctness of InteropModeShared depends on the server + // correctly implementing 9P fids (i.e. each fid immutably represents a + // single filesystem object), even in the presence of remote filesystem + // mutations from other users. If this is violated, the behavior of the + // client is undefined. + InteropModeShared +) + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mfp := pgalloc.MemoryFileProviderFromContext(ctx) + if mfp == nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider") + return nil, nil, syserror.EINVAL + } + + mopts := vfs.GenericParseMountOptions(opts.Data) + var fsopts filesystemOptions + + // Check that the transport is "fd". + trans, ok := mopts["trans"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'") + return nil, nil, syserror.EINVAL + } + delete(mopts, "trans") + if trans != "fd" { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans) + return nil, nil, syserror.EINVAL + } + + // Check that read and write FDs are provided and identical. + rfdstr, ok := mopts["rfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=") + return nil, nil, syserror.EINVAL + } + delete(mopts, "rfdno") + rfd, err := strconv.Atoi(rfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr) + return nil, nil, syserror.EINVAL + } + wfdstr, ok := mopts["wfdno"] + if !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=") + return nil, nil, syserror.EINVAL + } + delete(mopts, "wfdno") + wfd, err := strconv.Atoi(wfdstr) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr) + return nil, nil, syserror.EINVAL + } + if rfd != wfd { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd) + return nil, nil, syserror.EINVAL + } + fsopts.fd = rfd + + // Get the attach name. + fsopts.aname = "/" + if aname, ok := mopts["aname"]; ok { + delete(mopts, "aname") + fsopts.aname = aname + } + + // Parse the cache policy. For historical reasons, this defaults to the + // least generally-applicable option, InteropModeExclusive. + fsopts.interop = InteropModeExclusive + if cache, ok := mopts["cache"]; ok { + delete(mopts, "cache") + switch cache { + case "fscache": + fsopts.interop = InteropModeExclusive + case "fscache_writethrough": + fsopts.interop = InteropModeWritethrough + case "none": + fsopts.regularFilesUseSpecialFileFD = true + fallthrough + case "remote_revalidating": + fsopts.interop = InteropModeShared + default: + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache) + return nil, nil, syserror.EINVAL + } + } + + // Parse the 9P message size. + fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M + if msizestr, ok := mopts["msize"]; ok { + delete(mopts, "msize") + msize, err := strconv.ParseUint(msizestr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr) + return nil, nil, syserror.EINVAL + } + fsopts.msize = uint32(msize) + } + + // Parse the 9P protocol version. + fsopts.version = p9.HighestVersionString() + if version, ok := mopts["version"]; ok { + delete(mopts, "version") + fsopts.version = version + } + + // Parse the dentry cache limit. + fsopts.maxCachedDentries = 1000 + if str, ok := mopts["dentry_cache_limit"]; ok { + delete(mopts, "dentry_cache_limit") + maxCachedDentries, err := strconv.ParseUint(str, 10, 64) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) + return nil, nil, syserror.EINVAL + } + fsopts.maxCachedDentries = maxCachedDentries + } + + // Handle simple flags. + if _, ok := mopts["force_page_cache"]; ok { + delete(mopts, "force_page_cache") + fsopts.forcePageCache = true + } + if _, ok := mopts["limit_host_fd_translation"]; ok { + delete(mopts, "limit_host_fd_translation") + fsopts.limitHostFDTranslation = true + } + if _, ok := mopts["overlayfs_stale_read"]; ok { + delete(mopts, "overlayfs_stale_read") + fsopts.overlayfsStaleRead = true + } + // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying + // "cache=none". + + // Check for unparsed options. + if len(mopts) != 0 { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Establish a connection with the server. + conn, err := unet.NewSocket(fsopts.fd) + if err != nil { + return nil, nil, err + } + + // Perform version negotiation with the server. + ctx.UninterruptibleSleepStart(false) + client, err := p9.NewClient(conn, fsopts.msize, fsopts.version) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + conn.Close() + return nil, nil, err + } + // Ownership of conn has been transferred to client. + + // Perform attach to obtain the filesystem root. + ctx.UninterruptibleSleepStart(false) + attached, err := client.Attach(fsopts.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + client.Close() + return nil, nil, err + } + attachFile := p9file{attached} + qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } + + // Construct the filesystem object. + fs := &filesystem{ + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + dentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), + } + fs.vfsfs.Init(vfsObj, fs) + + // Construct the root dentry. + root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) + if err != nil { + attachFile.close(ctx) + fs.vfsfs.DecRef() + return nil, nil, err + } + // Set the root's reference count to 2. One reference is returned to the + // caller, and the other is deliberately leaked to prevent the root from + // being "cached" and subsequently evicted. Its resources will still be + // cleaned up by fs.Release(). + root.refs = 2 + + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + ctx := context.Background() + mf := fs.mfp.MemoryFile() + + fs.syncMu.Lock() + for d := range fs.dentries { + d.handleMu.Lock() + d.dataMu.Lock() + if d.handleWritable { + // Write dirty cached data to the remote file. + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) + } + // TODO(jamieliu): Do we need to flushf/fsync d? + } + // Discard cached pages. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Close the host fd if one exists. + if d.handle.fd >= 0 { + syscall.Close(int(d.handle.fd)) + d.handle.fd = -1 + } + d.handleMu.Unlock() + } + // There can't be any specialFileFDs still using fs, since each such + // FileDescription would hold a reference on a Mount holding a reference on + // fs. + fs.syncMu.Unlock() + + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + // refs is the reference count. Each dentry holds a reference on its + // parent, even if disowned. refs is accessed using atomic memory + // operations. + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // We don't support hard links, so each dentry maps 1:1 to an inode. + + // file is the unopened p9.File that backs this dentry. file is immutable. + file p9file + + // If deleted is non-zero, the file represented by this dentry has been + // deleted. deleted is accessed using atomic memory operations. + deleted uint32 + + // If cached is true, dentryEntry links dentry into + // filesystem.cachedDentries. cached and dentryEntry are protected by + // filesystem.renameMu. + cached bool + dentryEntry + + dirMu sync.Mutex + + // If this dentry represents a directory, and InteropModeShared is not in + // effect, negativeChildren is a set of child names in this directory that + // are known not to exist. negativeChildren is protected by dirMu. + negativeChildren map[string]struct{} + + // If this dentry represents a directory, InteropModeShared is not in + // effect, and dirents is not nil, it is a cache of all entries in the + // directory, in the order they were returned by the server. dirents is + // protected by dirMu. + dirents []vfs.Dirent + + // Cached metadata; protected by metadataMu and accessed using atomic + // memory operations unless otherwise specified. + metadataMu sync.Mutex + ino uint64 // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown + // Timestamps, all nsecs from the Unix epoch. + atime int64 + mtime int64 + ctime int64 + btime int64 + // File size, protected by both metadataMu and dataMu (i.e. both must be + // locked to mutate it). + size uint64 + + mapsMu sync.Mutex + + // If this dentry represents a regular file, mappings tracks mappings of + // the file into memmap.MappingSpaces. mappings is protected by mapsMu. + mappings memmap.MappingSet + + // If this dentry represents a regular file or directory: + // + // - handle is the I/O handle used by all regularFileFDs/directoryFDs + // representing this dentry. + // + // - handleReadable is true if handle is readable. + // + // - handleWritable is true if handle is writable. + // + // Invariants: + // + // - If handleReadable == handleWritable == false, then handle.file == nil + // (i.e. there is no open handle). Conversely, if handleReadable || + // handleWritable == true, then handle.file != nil (i.e. there is an open + // handle). + // + // - handleReadable and handleWritable cannot transition from true to false + // (i.e. handles may not be downgraded). + // + // These fields are protected by handleMu. + handleMu sync.RWMutex + handle handle + handleReadable bool + handleWritable bool + + dataMu sync.RWMutex + + // If this dentry represents a regular file that is client-cached, cache + // maps offsets into the cached file to offsets into + // filesystem.mfp.MemoryFile() that store the file's data. cache is + // protected by dataMu. + cache fsutil.FileRangeSet + + // If this dentry represents a regular file that is client-cached, dirty + // tracks dirty segments in cache. dirty is protected by dataMu. + dirty fsutil.DirtySet + + // pf implements platform.File for mappings of handle.fd. + pf dentryPlatformFile + + // If this dentry represents a symbolic link, InteropModeShared is not in + // effect, and haveTarget is true, target is the symlink target. haveTarget + // and target are protected by dataMu. + haveTarget bool + target string +} + +// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the +// gofer client. +func dentryAttrMask() p9.AttrMask { + return p9.AttrMask{ + Mode: true, + UID: true, + GID: true, + ATime: true, + MTime: true, + CTime: true, + Size: true, + BTime: true, + } +} + +// newDentry creates a new dentry representing the given file. The dentry +// initially has no references, but is not cached; it is the caller's +// responsibility to set the dentry's reference count and/or call +// dentry.checkCachingLocked() as appropriate. +func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { + if !mask.Mode { + ctx.Warningf("can't create gofer.dentry without file type") + return nil, syserror.EIO + } + if attr.Mode.FileType() == p9.ModeRegular && !mask.Size { + ctx.Warningf("can't create regular file gofer.dentry without file size") + return nil, syserror.EIO + } + + d := &dentry{ + fs: fs, + file: file, + ino: qid.Path, + mode: uint32(attr.Mode), + uid: uint32(fs.uid), + gid: uint32(fs.gid), + blockSize: usermem.PageSize, + handle: handle{ + fd: -1, + }, + } + d.pf.dentry = d + if mask.UID { + d.uid = uint32(attr.UID) + } + if mask.GID { + d.gid = uint32(attr.GID) + } + if mask.Size { + d.size = attr.Size + } + if attr.BlockSize != 0 { + d.blockSize = uint32(attr.BlockSize) + } + if mask.ATime { + d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds) + } + if mask.MTime { + d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds) + } + if mask.CTime { + d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds) + } + if mask.BTime { + d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) + } + d.vfsd.Init(d) + + fs.syncMu.Lock() + fs.dentries[d] = struct{}{} + fs.syncMu.Unlock() + return d, nil +} + +// updateFromP9Attrs is called to update d's metadata after an update from the +// remote filesystem. +func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { + d.metadataMu.Lock() + if mask.Mode { + if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { + d.metadataMu.Unlock() + panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got)) + } + atomic.StoreUint32(&d.mode, uint32(attr.Mode)) + } + if mask.UID { + atomic.StoreUint32(&d.uid, uint32(attr.UID)) + } + if mask.GID { + atomic.StoreUint32(&d.gid, uint32(attr.GID)) + } + // There is no P9_GETATTR_* bit for I/O block size. + if attr.BlockSize != 0 { + atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) + } + if mask.ATime { + atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) + } + if mask.MTime { + atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) + } + if mask.CTime { + atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)) + } + if mask.BTime { + atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) + } + if mask.Size { + d.dataMu.Lock() + atomic.StoreUint64(&d.size, attr.Size) + d.dataMu.Unlock() + } + d.metadataMu.Unlock() +} + +func (d *dentry) updateFromGetattr(ctx context.Context) error { + // Use d.handle.file, which represents a 9P fid that has been opened, in + // preference to d.file, which represents a 9P fid that has not. This may + // be significantly more efficient in some implementations. + var ( + file p9file + handleMuRLocked bool + ) + d.handleMu.RLock() + if !d.handle.file.isNil() { + file = d.handle.file + handleMuRLocked = true + } else { + file = d.file + d.handleMu.RUnlock() + } + _, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask()) + if handleMuRLocked { + d.handleMu.RUnlock() + } + if err != nil { + return err + } + d.updateFromP9Attrs(attrMask, &attr) + return nil +} + +func (d *dentry) fileType() uint32 { + return atomic.LoadUint32(&d.mode) & linux.S_IFMT +} + +func (d *dentry) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME + stat.Blksize = atomic.LoadUint32(&d.blockSize) + stat.Nlink = 1 + if d.isDir() { + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = d.ino + stat.Size = atomic.LoadUint64(&d.size) + // This is consistent with regularFileFD.Seek(), which treats regular files + // as having no holes. + stat.Blocks = (stat.Size + 511) / 512 + stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime)) + stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) + stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) + stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) + // TODO(jamieliu): device number +} + +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { + if stat.Mask == 0 { + return nil + } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + setLocalAtime := false + setLocalMtime := false + if d.fs.opts.interop != InteropModeShared { + // Timestamp updates will be handled locally. + setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 + setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 + stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME + if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since it'll + // be overwritten by revalidation before the next time it's used + // anyway. (InteropModeShared inhibits client caching of regular file + // data, so there's no cache to truncate either.) + return nil + } + now, haveNow := nowFromContext(ctx) + if !haveNow { + ctx.Warningf("gofer.dentry.setStat: current time not available") + } + if stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) + } + if stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, stat.UID) + } + if stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, stat.GID) + } + if setLocalAtime { + if stat.Atime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.atime, now) + } + } else { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + } + } + if setLocalMtime { + if stat.Mtime.Nsec == linux.UTIME_NOW { + if haveNow { + atomic.StoreInt64(&d.mtime, now) + } + } else { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + } + } + if haveNow { + atomic.StoreInt64(&d.ctime, now) + } + if stat.Mask&linux.STATX_SIZE != 0 { + d.dataMu.Lock() + oldSize := d.size + d.size = stat.Size + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend := pageRoundUp(oldSize) + newpgend := pageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. + d.dataMu.Lock() + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) + d.dataMu.Unlock() + } + } + return nil +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkCachingLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } +} + +// checkCachingLocked should be called after d's reference count becomes 0 or it +// becomes disowned. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkCachingLocked() { + // Dentries with a non-zero reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) + if atomic.LoadInt64(&d.refs) != 0 { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + return + } + // Non-child dentries with zero references are no longer reachable by path + // resolution and should be dropped immediately. + if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentriesLen-- + d.cached = false + } + d.destroyLocked() + return + } + // If d is already cached, just move it to the front of the LRU. + if d.cached { + d.fs.cachedDentries.Remove(d) + d.fs.cachedDentries.PushFront(d) + return + } + // Cache the dentry, then evict the least recently used cached dentry if + // the cache becomes over-full. + d.fs.cachedDentries.PushFront(d) + d.fs.cachedDentriesLen++ + d.cached = true + if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries { + victim := d.fs.cachedDentries.Back() + d.fs.cachedDentries.Remove(victim) + d.fs.cachedDentriesLen-- + victim.cached = false + // victim.refs may have become non-zero from an earlier path + // resolution since it was inserted into fs.cachedDentries; see + // dentry.incRefLocked(). Either way, we brought + // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so + // we don't loop. + if atomic.LoadInt64(&victim.refs) == 0 { + if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { + victimParent := victimParentVFSD.Impl().(*dentry) + victimParent.dirMu.Lock() + if !victim.vfsd.IsDisowned() { + // victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount + // points. + d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + // We're only deleting the dentry, not the file it + // represents, so we don't need to update + // victimParent.dirents etc. + } + victimParent.dirMu.Unlock() + } + victim.destroyLocked() + } + } +} + +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is +// not a child dentry. +func (d *dentry) destroyLocked() { + ctx := context.Background() + d.handleMu.Lock() + if !d.handle.file.isNil() { + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + // Write dirty pages back to the remote filesystem. + if d.handleWritable { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) + } + } + // Discard cached data. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + d.handle.close(ctx) + } + d.handleMu.Unlock() + d.file.close(ctx) + // Remove d from the set of all dentries. + d.fs.syncMu.Lock() + delete(d.fs.dentries, d) + d.fs.syncMu.Unlock() + // Drop the reference held by d on its parent. + if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { + parent := parentVFSD.Impl().(*dentry) + // This is parent.DecRef() without recursive locking of d.fs.renameMu. + if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { + parent.checkCachingLocked() + } else if refs < 0 { + panic("gofer.dentry.DecRef() called without holding a reference") + } + } +} + +func (d *dentry) isDeleted() bool { + return atomic.LoadUint32(&d.deleted) != 0 +} + +func (d *dentry) setDeleted() { + atomic.StoreUint32(&d.deleted, 1) +} + +func (d *dentry) listxattr(ctx context.Context) ([]string, error) { + return nil, syserror.ENOTSUP +} + +func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { + // TODO(jamieliu): add vfs.GetxattrOptions.Size + return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +} + +func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { + return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) +} + +func (d *dentry) removexattr(ctx context.Context, name string) error { + return syserror.ENOTSUP +} + +// Preconditions: d.isRegularFile() || d.isDirectory(). +func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { + // O_TRUNC unconditionally requires us to obtain a new handle (opened with + // O_TRUNC). + if !trunc { + d.handleMu.RLock() + if (!read || d.handleReadable) && (!write || d.handleWritable) { + // The current handle is sufficient. + d.handleMu.RUnlock() + return nil + } + d.handleMu.RUnlock() + } + + haveOldFD := false + d.handleMu.Lock() + if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { + // Get a new handle. + wantReadable := d.handleReadable || read + wantWritable := d.handleWritable || write + h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if err != nil { + d.handleMu.Unlock() + return err + } + if !d.handle.file.isNil() { + // Check that old and new handles are compatible: If the old handle + // includes a host file descriptor but the new one does not, or + // vice versa, old and new memory mappings may be incoherent. + haveOldFD = d.handle.fd >= 0 + haveNewFD := h.fd >= 0 + if haveOldFD != haveNewFD { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) + h.close(ctx) + return syserror.EIO + } + if haveOldFD { + // We may have raced with callers of d.pf.FD() that are now + // using the old file descriptor, preventing us from safely + // closing it. We could handle this by invalidating existing + // memmap.Translations, but this is expensive. Instead, use + // dup2() to make the old file descriptor refer to the new file + // description, then close the new file descriptor (which is no + // longer needed). Racing callers may use the old or new file + // description, but this doesn't matter since they refer to the + // same file (unless d.fs.opts.overlayfsStaleRead is true, + // which we handle separately). + if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + h.close(ctx) + return err + } + syscall.Close(int(h.fd)) + h.fd = d.handle.fd + if d.fs.opts.overlayfsStaleRead { + // Replace sentry mappings of the old FD with mappings of + // the new FD, since the two are not necessarily coherent. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) + h.close(ctx) + return err + } + } + // Clunk the old fid before making the new handle visible (by + // unlocking d.handleMu). + d.handle.file.close(ctx) + } + } + // Switch to the new handle. + d.handle = h + d.handleReadable = wantReadable + d.handleWritable = wantWritable + } + d.handleMu.Unlock() + + if d.fs.opts.overlayfsStaleRead && haveOldFD { + // Invalidate application mappings that may be using the old FD; they + // will be replaced with mappings using the new FD after future calls + // to d.Translate(). This requires holding d.mapsMu, which precedes + // d.handleMu in the lock order. + d.mapsMu.Lock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + } + + return nil +} + +// fileDescription is embedded by gofer implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + d := fd.dentry() + if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if + // available? + if err := d.updateFromGetattr(ctx); err != nil { + return linux.Statx{}, err + } + } + var stat linux.Statx + d.statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) +} + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { + return fd.dentry().listxattr(ctx) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { + return fd.dentry().getxattr(ctx, name) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.dentry().setxattr(ctx, &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.dentry().removexattr(ctx, name) +} diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go new file mode 100644 index 000000000..cfe66f797 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/safemem" +) + +// handle represents a remote "open file descriptor", consisting of an opened +// fid (p9.File) and optionally a host file descriptor. +type handle struct { + file p9file + fd int32 // -1 if unavailable +} + +// Preconditions: read || write. +func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { + _, newfile, err := file.walk(ctx, nil) + if err != nil { + return handle{fd: -1}, err + } + var flags p9.OpenFlags + switch { + case read && !write: + flags = p9.ReadOnly + case !read && write: + flags = p9.WriteOnly + case read && write: + flags = p9.ReadWrite + } + if trunc { + flags |= p9.OpenTruncate + } + fdobj, _, _, err := newfile.open(ctx, flags) + if err != nil { + newfile.close(ctx) + return handle{fd: -1}, err + } + fd := int32(-1) + if fdobj != nil { + fd = int32(fdobj.Release()) + } + return handle{ + file: newfile, + fd: fd, + }, nil +} + +func (h *handle) close(ctx context.Context) { + h.file.close(ctx) + h.file = p9file{} + if h.fd >= 0 { + syscall.Close(int(h.fd)) + h.fd = -1 + } +} + +func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPreadv(h.fd, dsts, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { + n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the read since p9.File.ReadAt() takes []byte. + buf := make([]byte, dsts.NumBytes()) + n, err := h.file.readAt(ctx, buf, offset) + if n == 0 { + return 0, err + } + if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { + return cp, cperr + } + return uint64(n), err +} + +func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + n, err := hostPwritev(h.fd, srcs, int64(offset)) + ctx.UninterruptibleSleepFinish(false) + return n, err + } + if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { + n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) + return uint64(n), err + } + // Buffer the write since p9.File.WriteAt() takes []byte. + buf := make([]byte, srcs.NumBytes()) + cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) + if cp == 0 { + return 0, cperr + } + n, err := h.file.writeAt(ctx, buf[:cp], offset) + if err != nil { + return uint64(n), err + } + return cp, cperr +} + +func (h *handle) sync(ctx context.Context) error { + if h.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(h.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return h.file.fsync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go new file mode 100644 index 000000000..19560ab26 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/handle_unsafe.go @@ -0,0 +1,66 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/safemem" +) + +// Preconditions: !dsts.IsEmpty(). +func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if dsts.NumBlocks() == 1 { + // Use pread() instead of preadv() to avoid iovec allocation and + // copying. + dst := dsts.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(dsts) + n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} + +// Preconditions: !srcs.IsEmpty(). +func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + if srcs.NumBlocks() == 1 { + // Use pwrite() instead of pwritev() to avoid iovec allocation and + // copying. + src := srcs.Head() + n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil + } + iovs := safemem.IovecsFromBlockSeq(srcs) + n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) + if e != 0 { + return 0, e + } + return uint64(n), nil +} diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go new file mode 100644 index 000000000..755ac2985 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -0,0 +1,219 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/syserror" +) + +// p9file is a wrapper around p9.File that provides methods that are +// Context-aware. +type p9file struct { + file p9.File +} + +func (f p9file) isNil() bool { + return f.file == nil +} + +func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, err := f.file.Walk(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, err +} + +func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names) + ctx.UninterruptibleSleepFinish(false) + return qids, p9file{newfile}, attrMask, attr, err +} + +// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single +// path component and returns a single qid. +func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err + } + if len(qids) != 1 { + ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) + if newfile != nil { + p9file{newfile}.close(ctx) + } + return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO + } + return qids[0], p9file{newfile}, attrMask, attr, nil +} + +func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + fsstat, err := f.file.StatFS() + ctx.UninterruptibleSleepFinish(false) + return fsstat, err +} + +func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + qid, attrMask, attr, err := f.file.GetAttr(req) + ctx.UninterruptibleSleepFinish(false) + return qid, attrMask, attr, err +} + +func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetAttr(valid, attr) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { + ctx.UninterruptibleSleepStart(false) + val, err := f.file.GetXattr(name, size) + ctx.UninterruptibleSleepFinish(false) + return val, err +} + +func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.SetXattr(name, value, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Allocate(mode, offset, length) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Close() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, qid, iounit, err := f.file.Open(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, qid, iounit, err +} + +func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.ReadAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + n, err := f.file.WriteAt(p, offset) + ctx.UninterruptibleSleepFinish(false) + return n, err +} + +func (f p9file) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.FSync() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return fdobj, p9file{newfile}, qid, iounit, err +} + +func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mkdir(name, permissions, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Symlink(oldName, newName, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) link(ctx context.Context, target p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Link(target.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) + ctx.UninterruptibleSleepFinish(false) + return qid, err +} + +func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Rename(newDir.file, newName) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.UnlinkAt(name, flags) + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + dirents, err := f.file.Readdir(offset, count) + ctx.UninterruptibleSleepFinish(false) + return dirents, err +} + +func (f p9file) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + target, err := f.file.Readlink() + ctx.UninterruptibleSleepFinish(false) + return target, err +} + +func (f p9file) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.Flush() + ctx.UninterruptibleSleepFinish(false) + return err +} + +func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + fdobj, err := f.file.Connect(flags) + ctx.UninterruptibleSleepFinish(false) + return fdobj, err +} diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go new file mode 100644 index 000000000..847cb0784 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/pagemath.go @@ -0,0 +1,31 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/usermem" +) + +// This are equivalent to usermem.Addr.RoundDown/Up, but without the +// potentially truncating conversion to usermem.Addr. This is necessary because +// there is no way to define generic "PageRoundDown/Up" functions in Go. + +func pageRoundDown(x uint64) uint64 { + return x &^ (usermem.PageSize - 1) +} + +func pageRoundUp(x uint64) uint64 { + return pageRoundDown(x + usermem.PageSize - 1) +} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go new file mode 100644 index 000000000..8e11e06b3 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -0,0 +1,860 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "io" + "math" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isRegularFile() bool { + return d.fileType() == linux.S_IFREG +} + +type regularFileFD struct { + fileDescription + + // off is the file offset. off is protected by mu. + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *regularFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + // Skip flushing if writes may be buffered by the client, since (as with + // the VFS1 client) we don't flush buffered writes on close anyway. + d := fd.dentry() + if d.fs.opts.interop == InteropModeExclusive { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Check for reading at EOF before calling into MM (but not under + // InteropModeShared, which makes d.size unreliable). + d := fd.dentry() + if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + return 0, io.EOF + } + + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Lock d.metadataMu for the rest of the read to prevent d.size from + // changing. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Write dirty cached pages that will be touched by the read back to + // the remote file. + if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { + return 0, err + } + } + + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the read to go to the remote file. + rw.direct = true + } + n, err := dst.CopyOutFrom(ctx, rw) + putDentryReadWriter(rw) + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + return n, err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + if d.fs.opts.interop != InteropModeShared { + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => + // file_update_time(). This is d.touchCMtime(), but without locking + // d.metadataMu (recursively). + if now, ok := nowFromContext(ctx); ok { + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + } + } + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Remove touched pages from the cache. + pgstart := pageRoundDown(uint64(offset)) + pgend := pageRoundUp(uint64(offset + src.NumBytes())) + if pgend < pgstart { + return 0, syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []platform.FileRange + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + } + rw := getDentryReadWriter(ctx, d, offset) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + // Require the write to go to the remote file. + rw.direct = true + } + n, err := src.CopyInTo(ctx, rw) + putDentryReadWriter(rw) + if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Write dirty cached pages touched by the write back to the remote + // file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return 0, err + } + // Request the remote filesystem to sync the remote file. + if err := d.handle.file.fsync(ctx); err != nil { + return 0, err + } + } + return n, err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +type dentryReadWriter struct { + ctx context.Context + d *dentry + off uint64 + direct bool +} + +var dentryReadWriterPool = sync.Pool{ + New: func() interface{} { + return &dentryReadWriter{} + }, +} + +func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { + rw := dentryReadWriterPool.Get().(*dentryReadWriter) + rw.ctx = ctx + rw.d = d + rw.off = uint64(offset) + rw.direct = false + return rw +} + +func putDentryReadWriter(rw *dentryReadWriter) { + rw.ctx = nil + rw.d = nil + dentryReadWriterPool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents and makes dentry.size + // unreliable), or if the file was opened O_DIRECT, read directly from + // dentry.handle without locking dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + rw.d.handleMu.RUnlock() + rw.off += n + return n, err + } + + // Otherwise read from/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + fillCache := mf.ShouldCacheEvictable() + var dataMuUnlock func() + if fillCache { + rw.d.dataMu.Lock() + dataMuUnlock = rw.d.dataMu.Unlock + } else { + rw.d.dataMu.RLock() + dataMuUnlock = rw.d.dataMu.RUnlock + } + + // Compute the range to read (limited by file size and overflow-checked). + if rw.off >= rw.d.size { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return 0, io.EOF + } + end := rw.d.size + if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { + end = rend + } + + var done uint64 + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + if err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + gapMR := gap.Range().Intersect(mr) + if fillCache { + // Read into the cache, then re-enter the loop to read from the + // cache. + reqMR := memmap.MappableRange{ + Start: pageRoundDown(gapMR.Start), + End: pageRoundUp(gapMR.End), + } + optMR := gap.Range() + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) + seg, gap = rw.d.cache.Find(rw.off) + if !seg.Ok() { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + // err might have occurred in part of gap.Range() outside + // gapMR. Forget about it for now; if the error matters and + // persists, we'll run into it again in a later iteration of + // this loop. + } else { + // Read directly from the file. + gapDsts := dsts.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + done += n + rw.off += n + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != gapDsts.NumBytes() || err != nil { + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } + } + dataMuUnlock() + rw.d.handleMu.RUnlock() + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.d.metadataMu must be locked. +func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + + // If we have a mmappable host FD (which must be used here to ensure + // coherence with memory-mapped I/O), or if InteropModeShared is in effect + // (which prevents us from caching file contents), or if the file was + // opened with O_DIRECT, write directly to dentry.handle without locking + // dentry.dataMu. + rw.d.handleMu.RLock() + if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + rw.d.handleMu.RUnlock() + rw.off += n + return n, err + } + + // Otherwise write to/through the cache. + mf := rw.d.fs.mfp.MemoryFile() + rw.d.dataMu.Lock() + + // Compute the range to write (overflow-checked). + start := rw.off + end := rw.off + srcs.NumBytes() + if end <= rw.off { + end = math.MaxInt64 + } + + var ( + done uint64 + retErr error + ) + seg, gap := rw.d.cache.Find(rw.off) + for rw.off < end { + mr := memmap.MappableRange{rw.off, end} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + segMR := seg.Range().Intersect(mr) + ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + retErr = err + goto exitLoop + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + rw.d.dirty.MarkDirty(segMR) + if err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Write directly to the file. At present, we never fill the cache + // when writing, since doing so can convert small writes into + // inefficient read-modify-write cycles, and we have no mechanism + // for detecting or avoiding this. + gapMR := gap.Range().Intersect(mr) + gapSrcs := srcs.TakeFirst64(gapMR.Length()) + n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + done += n + rw.off += n + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != gapSrcs.NumBytes() || err != nil { + retErr = err + goto exitLoop + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + } + } +exitLoop: + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + // If InteropModeWritethrough is in effect, flush written data back to the + // remote filesystem. + if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { + if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ + Start: start, + End: rw.off, + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + // We have no idea how many bytes were actually flushed. + rw.off = start + done = 0 + retErr = err + } + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() + return done, retErr +} + +func (d *dentry) writeback(ctx context.Context, offset, size int64) error { + if size == 0 { + return nil + } + d.handleMu.RLock() + defer d.handleMu.RUnlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + // Compute the range of valid bytes (overflow-checked). + if uint64(offset) >= d.size { + return nil + } + end := int64(d.size) + if rend := offset + size; rend > offset && rend < end { + end = rend + } + return fsutil.SyncDirty(ctx, memmap.MappableRange{ + Start: uint64(offset), + End: uint64(end), + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as specified. + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: + // Ensure file size is up to date. + d := fd.dentry() + if fd.filesystem().opts.interop == InteropModeShared { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, err + } + } + size := int64(atomic.LoadUint64(&d.size)) + // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous + // block of data. + switch whence { + case linux.SEEK_END: + offset += size + case linux.SEEK_DATA: + if offset > size { + return 0, syserror.ENXIO + } + // Use offset as specified. + case linux.SEEK_HOLE: + if offset > size { + return 0, syserror.ENXIO + } + offset = size + } + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return fd.dentry().syncSharedHandle(ctx) +} + +func (d *dentry) syncSharedHandle(ctx context.Context) error { + d.handleMu.RLock() + if !d.handleWritable { + d.handleMu.RUnlock() + return nil + } + d.dataMu.Lock() + // Write dirty cached data to the remote file. + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + d.dataMu.Unlock() + if err == nil { + // Sync the remote file. + err = d.handle.sync(ctx) + } + d.handleMu.RUnlock() + return err +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + d := fd.dentry() + switch d.fs.opts.interop { + case InteropModeExclusive: + // Any mapping is fine. + case InteropModeWritethrough: + // Shared writable mappings require a host FD, since otherwise we can't + // synchronously flush memory-mapped writes to the remote file. + if opts.Private || !opts.MaxPerms.Write { + break + } + fallthrough + case InteropModeShared: + // All mappings require a host FD to be coherent with other filesystem + // users. + if d.fs.opts.forcePageCache { + // Whether or not we have a host FD, we're not allowed to use it. + return syserror.ENODEV + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + if !haveFD { + return syserror.ENODEV + } + default: + panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) + } + return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) +} + +func (d *dentry) mayCachePages() bool { + if d.fs.opts.interop == InteropModeShared { + return false + } + if d.fs.opts.forcePageCache { + return true + } + d.handleMu.RLock() + haveFD := d.handle.fd >= 0 + d.handleMu.RUnlock() + return haveFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + d.mapsMu.Lock() + mapped := d.mappings.AddMapping(ms, ar, offset, writable) + // Do this unconditionally since whether we have a host FD can change + // across save/restore. + for _, r := range mapped { + d.pf.hostFileMapper.IncRefOn(r) + } + if d.mayCachePages() { + // d.Evict() will refuse to evict memory-mapped pages, so tell the + // MemoryFile to not bother trying. + mf := d.fs.mfp.MemoryFile() + for _, r := range mapped { + mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + } + d.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + d.mapsMu.Lock() + unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + d.pf.hostFileMapper.DecRefOn(r) + } + if d.mayCachePages() { + // Pages that are no longer referenced by any application memory + // mappings are now considered unused; allow MemoryFile to evict them + // when necessary. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + for _, r := range unmapped { + // Since these pages are no longer mapped, they are no longer + // concurrently dirtyable by a writable memory mapping. + d.dirty.AllowClean(r) + mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) + } + d.dataMu.Unlock() + } + d.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return d.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + d.handleMu.RLock() + if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + d.handleMu.RUnlock() + mr := optional + if d.fs.opts.limitHostFDTranslation { + mr = maxFillRange(required, optional) + } + return []memmap.Translation{ + { + Source: mr, + File: &d.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil + } + + d.dataMu.Lock() + + // Constrain translations to d.size (rounded up) to prevent translation to + // pages that may be concurrently truncated. + pgend := pageRoundUp(d.size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + d.dataMu.Unlock() + d.handleMu.RUnlock() + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mf := d.fs.mfp.MemoryFile() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + // TODO(jamieliu): Make Translations writable even if writability is + // not required if already kept-dirty by another writable translation. + perms := usermem.AccessType{ + Read: true, + Execute: true, + } + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + d.dirty.KeepDirty(segMR) + perms.Write = true + } + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mf, + Offset: seg.FileRangeOf(segMR).Start, + Perms: perms, + }) + translatedEnd = segMR.End + } + + d.dataMu.Unlock() + d.handleMu.RUnlock() + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (d *dentry) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Write the cache's contents back to the remote file so that if we have a + // host fd after restore, the remote file's contents are coherent. + mf := d.fs.mfp.MemoryFile() + d.dataMu.Lock() + defer d.dataMu.Unlock() + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + + return nil +} + +// Evict implements pgalloc.EvictableMemoryUser.Evict. +func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + d.mapsMu.Lock() + defer d.mapsMu.Unlock() + d.dataMu.Lock() + defer d.dataMu.Unlock() + + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() + // Only allow pages that are no longer memory-mapped to be evicted. + for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { + mgapMR := mgap.Range().Intersect(mr) + if mgapMR.Length() == 0 { + continue + } + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) + } + d.cache.Drop(mgapMR, mf) + d.dirty.KeepClean(mgapMR) + } +} + +// dentryPlatformFile implements platform.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// +// dentryPlatformFile is only used when a host FD representing the remote file +// is available (i.e. dentry.handle.fd >= 0), and that FD is used for +// application memory mappings (i.e. !filesystem.opts.forcePageCache). +type dentryPlatformFile struct { + *dentry + + // fdRefs counts references on platform.File offsets. fdRefs is protected + // by dentry.dataMu. + fdRefs fsutil.FrameRefSet + + // If this dentry represents a regular file, and handle.fd >= 0, + // hostFileMapper caches mappings of handle.fd. + hostFileMapper fsutil.HostFileMapper +} + +// IncRef implements platform.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { + d.dataMu.Lock() + seg, gap := d.fdRefs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = d.fdRefs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + return + } + } +} + +// DecRef implements platform.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { + d.dataMu.Lock() + seg := d.fdRefs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = d.fdRefs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = d.fdRefs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + d.fdRefs.MergeAdjacent(fr) + d.dataMu.Unlock() + +} + +// MapInternal implements platform.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + d.handleMu.RLock() + bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) + d.handleMu.RUnlock() + return bs, err +} + +// FD implements platform.File.FD. +func (d *dentryPlatformFile) FD() int { + d.handleMu.RLock() + fd := d.handle.fd + d.handleMu.RUnlock() + return int(fd) +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go new file mode 100644 index 000000000..08c691c47 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -0,0 +1,159 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// specialFileFD implements vfs.FileDescriptionImpl for files other than +// regular files, directories, and symlinks: pipes, sockets, etc. It is also +// used for regular files when filesystemOptions.specialRegularFiles is in +// effect. specialFileFD differs from regularFileFD by using per-FD handles +// instead of shared per-dentry handles, and never buffering I/O. +type specialFileFD struct { + fileDescription + + // handle is immutable. + handle handle + + // off is the file offset. off is protected by mu. (POSIX 2.9.7 only + // requires operations using the file offset to be atomic for regular files + // and symlinks; however, since specialFileFD may be used for regular + // files, we apply this atomicity unconditionally.) + mu sync.Mutex + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *specialFileFD) Release() { + fd.handle.close(context.Background()) + fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) + fs.syncMu.Lock() + delete(fs.specialFileFDs, fd) + fs.syncMu.Unlock() +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *specialFileFD) OnClose(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.file.flush(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Going through dst.CopyOutFrom() holds MM locks around file operations of + // unknown duration. For regularFileFD, doing so is necessary to support + // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't + // hold here since specialFileFD doesn't client-cache data. Just buffer the + // read instead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, fd.vfsfd.Mount()) + } + buf := make([]byte, dst.NumBytes()) + n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if n == 0 { + return 0, err + } + if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil { + return int64(cp), cperr + } + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + // Do a buffered write. See rationale in PRead. + if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + d.touchCMtime(ctx) + } + buf := make([]byte, src.NumBytes()) + // Don't do partial writes if we get a partial read from src. + if _, err := src.CopyIn(ctx, buf); err != nil { + return 0, err + } + n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.mu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + switch whence { + case linux.SEEK_SET: + // Use offset as given. + case linux.SEEK_CUR: + offset += fd.off + default: + // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not + // clear that file size is even meaningful for these files. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *specialFileFD) Sync(ctx context.Context) error { + if !fd.vfsfd.IsWritable() { + return nil + } + return fd.handle.sync(ctx) +} diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go new file mode 100644 index 000000000..adf43be60 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -0,0 +1,47 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func (d *dentry) isSymlink() bool { + return d.fileType() == linux.S_IFLNK +} + +// Precondition: d.isSymlink(). +func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(ctx, mnt) + d.dataMu.Lock() + if d.haveTarget { + target := d.target + d.dataMu.Unlock() + return target, nil + } + } + target, err := d.file.readlink(ctx) + if d.fs.opts.interop != InteropModeShared { + if err == nil { + d.haveTarget = true + d.target = target + } + d.dataMu.Unlock() + } + return target, err +} diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go new file mode 100644 index 000000000..7598ec6a8 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -0,0 +1,75 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func dentryTimestampFromP9(s, ns uint64) int64 { + return int64(s*1e9 + ns) +} + +func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 { + return ts.Sec*1e9 + int64(ts.Nsec) +} + +func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { + return linux.StatxTimestamp{ + Sec: ns / 1e9, + Nsec: uint32(ns % 1e9), + } +} + +func nowFromContext(ctx context.Context) (int64, bool) { + if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { + return clock.Now().Nanoseconds(), true + } + return 0, false +} + +// Preconditions: fs.interop != InteropModeShared. +func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now, ok := nowFromContext(ctx) + if !ok { + mnt.EndWrite() + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.atime, now) + d.metadataMu.Unlock() + mnt.EndWrite() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime(ctx context.Context) { + now, ok := nowFromContext(ctx) + if !ok { + return + } + d.metadataMu.Lock() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5ee9cf1e9..72bc15264 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -622,7 +622,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if child.inode.isDir() { return syserror.EISDIR } - if !rp.MustBeDir() { + if rp.MustBeDir() { return syserror.ENOTDIR } mnt := rp.Mount() diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index bde4c7a1e..34f63986f 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -126,7 +126,7 @@ func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS } return uint64(n), nil } - return readv(s.fd, iovecsFromBlockSeq(dsts)) + return readv(s.fd, safemem.IovecsFromBlockSeq(dsts)) })) return int64(n), err } @@ -149,7 +149,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } return uint64(n), nil } - return writev(s.fd, iovecsFromBlockSeq(srcs)) + return writev(s.fd, safemem.IovecsFromBlockSeq(srcs)) })) return int64(n), err } @@ -402,7 +402,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // We always do a non-blocking recv*(). sysflags := flags | syscall.MSG_DONTWAIT - iovs := iovecsFromBlockSeq(dsts) + iovs := safemem.IovecsFromBlockSeq(dsts) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -522,7 +522,7 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return uint64(n), nil } - iovs := iovecsFromBlockSeq(srcs) + iovs := safemem.IovecsFromBlockSeq(srcs) msg := syscall.Msghdr{ Iov: &iovs[0], Iovlen: uint64(len(iovs)), @@ -567,21 +567,6 @@ func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] return int(n), syserr.FromError(err) } -func iovecsFromBlockSeq(bs safemem.BlockSeq) []syscall.Iovec { - iovs := make([]syscall.Iovec, 0, bs.NumBlocks()) - for ; !bs.IsEmpty(); bs = bs.Tail() { - b := bs.Head() - iovs = append(iovs, syscall.Iovec{ - Base: &b.ToSlice()[0], - Len: uint64(b.Len()), - }) - // We don't need to care about b.NeedSafecopy(), because the host - // kernel will handle such address ranges just fine (by returning - // EFAULT). - } - return iovs -} - func translateIOSyscallError(err error) error { if err == syscall.EAGAIN || err == syscall.EWOULDBLOCK { return syserror.ErrWouldBlock -- cgit v1.2.3 From dcffddf0cae026411e7e678744a1e39dc2b513cf Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 4 Feb 2020 11:47:41 -0800 Subject: Remove argument from vfs.MountNamespace.DecRef() Updates #1035 PiperOrigin-RevId: 293194631 --- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 2 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 2 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +- pkg/sentry/vfs/mount.go | 3 ++- 6 files changed, 8 insertions(+), 7 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d36fa74fb..e03a0c665 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -86,7 +86,7 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth // Release must be called when a is no longer in use. func (a *Accessor) Release() { a.root.DecRef() - a.mntns.DecRef(a.vfsObj) + a.mntns.DecRef() } // accessorContext implements context.Context by extending an existing diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 82c58c900..73308a2b5 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -45,7 +45,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() root := mntns.Root() defer root.DecRef() devpop := vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 1c98335c1..69fd84ddd 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { s.Root.DecRef() - s.mns.DecRef(s.VFS) // Reference on mns passed to NewSystem. + s.mns.DecRef() // Reference on mns passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 54241c8e8..9fce5e4b4 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -183,7 +183,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') @@ -374,7 +374,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef(vfsObj) + defer mntns.DecRef() var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 2b52992ea..e9f71e334 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -51,7 +51,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr root := mntns.Root() return vfsObj, root, func() { root.DecRef() - mntns.DecRef(vfsObj) + mntns.DecRef() }, nil } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index d39528051..1fbb420f9 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -423,7 +423,8 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) { +func (mntns *MountNamespace) DecRef() { + vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() -- cgit v1.2.3 From bfa0bba72abb69cbc7f4da27d3b4b116c3784495 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 10 Feb 2020 12:02:47 -0800 Subject: Redirect FIXME to gvisor.dev PiperOrigin-RevId: 294272755 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 9d65d0179..e49303c26 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -111,10 +111,10 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Dentry isn't cached; it either doesn't exist or failed // revalidation. Attempt to resolve it via Lookup. // - // FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry, - // not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries - // in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl - // casts accordingly. + // FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return + // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes + // that all dentries in the filesystem are (kernfs.)Dentry and performs + // vfs.DentryImpl casts accordingly. var err error childVFSD, err = parent.inode.Lookup(ctx, name) if err != nil { -- cgit v1.2.3 From 4d4d47f0c0a21d3404d2edae527b187d62daa3c8 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 10 Feb 2020 13:04:29 -0800 Subject: Add contextual note. PiperOrigin-RevId: 294285723 --- pkg/sentry/fsimpl/gofer/directory.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index baa2cdd8e..6d4ebc2bf 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -87,6 +87,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // to assume that directory fids have the correct semantics, and translates // struct file_operations::readdir calls directly to readdir RPCs), but is // consistent with VFS1. + // + // NOTE(b/135560623): In particular, some gofer implementations may not + // retain state between calls to Readdir, so may not provide a coherent + // directory stream across in the presence of mutation. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() -- cgit v1.2.3 From bb22ebd7fbfc66556b38df669be5c6372daba018 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Mon, 10 Feb 2020 13:20:44 -0800 Subject: Add contextual comment. PiperOrigin-RevId: 294289066 --- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 3 +++ 1 file changed, 3 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index d1436b943..2015a8871 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -15,6 +15,9 @@ // These benchmarks emulate memfs benchmarks. Ext4 images must be created // before this benchmark is run using the `make_deep_ext4.sh` script at // /tmp/image-{depth}.ext4 for all the depths tested below. +// +// The benchmark itself cannot run the script because the script requires +// sudo privileges to create the file system images. package benchmark_test import ( -- cgit v1.2.3 From 6dced977eab69401a114257e386addb9cb03a39d Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Tue, 11 Feb 2020 17:38:04 -0800 Subject: Ensure fsimpl/gofer.dentryPlatformFile.hostFileMapper is initialized. Fixes #1812. (The more direct cause of the deadlock is panic unsafety because the historically high cost of defer means that we avoid it in hot paths, including much of MM; defer is much cheaper as of Go 1.14, but still a measurable overhead.) PiperOrigin-RevId: 294560316 --- pkg/sentry/fs/fsutil/host_file_mapper.go | 17 +++++++++++------ pkg/sentry/fsimpl/gofer/regular_file.go | 5 +++++ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index 67278aa86..e82afd112 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -65,13 +65,18 @@ type mapping struct { writable bool } -// NewHostFileMapper returns a HostFileMapper with no references or cached -// mappings. +// Init must be called on zero-value HostFileMappers before first use. +func (f *HostFileMapper) Init() { + f.refs = make(map[uint64]int32) + f.mappings = make(map[uint64]mapping) +} + +// NewHostFileMapper returns an initialized HostFileMapper allocated on the +// heap with no references or cached mappings. func NewHostFileMapper() *HostFileMapper { - return &HostFileMapper{ - refs: make(map[uint64]int32), - mappings: make(map[uint64]mapping), - } + f := &HostFileMapper{} + f.Init() + return f } // IncRefOn increments the reference count on all offsets in mr. diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 8e11e06b3..54c1031a7 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -571,6 +571,8 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt default: panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } + // After this point, d may be used as a memmap.Mappable. + d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } @@ -799,6 +801,9 @@ type dentryPlatformFile struct { // If this dentry represents a regular file, and handle.fd >= 0, // hostFileMapper caches mappings of handle.fd. hostFileMapper fsutil.HostFileMapper + + // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. + hostFileMapperInitOnce sync.Once } // IncRef implements platform.File.IncRef. -- cgit v1.2.3 From a6024f7f5f6f438c11e30be0f93657b1956fd5ba Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 13 Feb 2020 17:56:34 -0800 Subject: Add FileExec flag to OpenOptions This allow callers to say whether the file is being opened to be executed, so that the proper checks can be done from FilesystemImpl.OpenAt() Updates #1623 PiperOrigin-RevId: 295042595 --- pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/ext/inode.go | 12 ++++++------ pkg/sentry/fsimpl/gofer/filesystem.go | 24 ++++++++++++------------ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 ++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 6 +++--- pkg/sentry/fsimpl/kernfs/filesystem.go | 10 +++++----- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 6 +++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 8 ++++---- pkg/sentry/fsimpl/proc/subtasks.go | 4 ++-- pkg/sentry/fsimpl/proc/task.go | 4 ++-- pkg/sentry/fsimpl/proc/tasks.go | 4 ++-- pkg/sentry/fsimpl/sys/sys.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/vfs/options.go | 5 +++++ pkg/sentry/vfs/permissions.go | 19 ++++++++++++------- pkg/sentry/vfs/vfs.go | 19 +++++++++++++++++++ 17 files changed, 82 insertions(+), 53 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 07bf58953..e05429d41 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -296,7 +296,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 { return nil, syserror.EROFS } - return inode.open(rp, vfsd, opts.Flags) + return inode.open(rp, vfsd, &opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 191b39970..6962083f5 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -148,8 +148,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } // open creates and returns a file description for the dentry passed in. -func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) if err := in.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } @@ -157,7 +157,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v switch in.impl.(type) { case *regularFile: var fd regularFileFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -168,17 +168,17 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v return nil, syserror.EISDIR } var fd directoryFD - if err := fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - if flags&linux.O_PATH == 0 { + if opts.Flags&linux.O_PATH == 0 { // Can't open symlinks without O_PATH. return nil, syserror.ELOOP } var fd symlinkFD - fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) + fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: panic(fmt.Sprintf("unknown inode type: %T", in.impl)) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 8eb61debf..138adb9f7 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -593,7 +593,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } } if rp.Done() { - return start.openLocked(ctx, rp, opts.Flags) + return start.openLocked(ctx, rp, &opts) } afterTrailingSymlink: @@ -633,12 +633,12 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } - return child.openLocked(ctx, rp, opts.Flags) + return child.openLocked(ctx, rp, &opts) } // Preconditions: fs.renameMu must be locked. -func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags uint32) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(flags) +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { return nil, err } @@ -646,11 +646,11 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui filetype := d.fileType() switch { case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0); err != nil { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { return nil, err } fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err @@ -658,21 +658,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui return &fd.vfsfd, nil case filetype == linux.S_IFDIR: // Can't open directories with O_CREAT. - if flags&linux.O_CREAT != 0 { + if opts.Flags&linux.O_CREAT != 0 { return nil, syserror.EISDIR } // Can't open directories writably. if ats&vfs.MayWrite != 0 { return nil, syserror.EISDIR } - if flags&linux.O_DIRECT != 0 { + if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { return nil, err } fd := &directoryFD{} - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil @@ -680,17 +680,17 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, flags ui // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP default: - if flags&linux.O_DIRECT != 0 { + if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, flags&linux.O_TRUNC != 0) + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) if err != nil { return nil, err } fd := &specialFileFD{ handle: h, } - if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { h.close(ctx) return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 733792c78..d092ccb2a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -53,9 +53,9 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 6104751c8..eda781155 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -43,12 +43,12 @@ type GenericDirectoryFD struct { } // Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) error { - if vfs.AccessTypesForOpenFlags(flags)&vfs.MayWrite != 0 { +func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error { + if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } - if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err } fd.children = children diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index e49303c26..ee98eb66a 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -365,7 +365,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { @@ -379,7 +379,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } // May create new file. @@ -398,7 +398,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts.Flags) + return inode.Open(rp, vfsd, opts) } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) @@ -438,7 +438,7 @@ afterTrailingSymlink: return nil, err } parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags) + return child.Impl().(*Dentry).inode.Open(rp, child, opts) } // Open existing file or follow symlink. if mustCreate { @@ -463,7 +463,7 @@ afterTrailingSymlink: if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts.Flags) + return childInode.Open(rp, childVFSD, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index adca2313f..099d70a16 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -507,7 +507,7 @@ type InodeSymlink struct { } // Open implements Inode.Open. -func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -549,8 +549,8 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F } // Open implements kernfs.Inode. -func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 79ebea8a5..c74fa999b 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -303,7 +303,7 @@ type Inode interface { // inode for its lifetime. // // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. - Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) + Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index ee65cf491..96a16e654 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -113,9 +113,9 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { return nil, err } return fd.VFSFileDescription(), nil @@ -143,9 +143,9 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 353e37195..102af0e93 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -114,9 +114,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb } // Open implements kernfs.Inode. -func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index eb5bc62c0..2d814668a 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -98,9 +98,9 @@ func (i *taskInode) Valid(ctx context.Context) bool { } // Open implements kernfs.Inode. -func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 14bd334e8..ebe21630c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -205,9 +205,9 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback } // Open implements kernfs.Inode. -func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index e35d52d17..d693fceae 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -97,9 +97,9 @@ func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { } // Open implements kernfs.Inode.Open. -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { +func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, flags) + fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 72bc15264..8785452b6 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -334,7 +334,7 @@ afterTrailingSymlink: } func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { - ats := vfs.AccessTypesForOpenFlags(opts.Flags) + ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { return nil, err diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index b7774bf28..fdf8be157 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -72,6 +72,11 @@ type OpenOptions struct { // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the // created file. Mode linux.FileMode + + // FileExec is set when the file is being opened to be executed. + // VirtualFilesystem.OpenAt() checks that the caller has execute permissions + // on the file, and that the file is a regular file. + FileExec bool } // ReadOptions contains options to FileDescription.PRead(), diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f664581f4..8e250998a 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -103,17 +103,22 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. -func AccessTypesForOpenFlags(flags uint32) AccessTypes { - switch flags & linux.O_ACCMODE { +func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes { + ats := AccessTypes(0) + if opts.FileExec { + ats |= MayExec + } + + switch opts.Flags & linux.O_ACCMODE { case linux.O_RDONLY: - if flags&linux.O_TRUNC != 0 { - return MayRead | MayWrite + if opts.Flags&linux.O_TRUNC != 0 { + return ats | MayRead | MayWrite } - return MayRead + return ats | MayRead case linux.O_WRONLY: - return MayWrite + return ats | MayWrite default: - return MayRead | MayWrite + return ats | MayRead | MayWrite } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 908c69f91..9629afee9 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -379,6 +379,25 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(rp) + + // TODO(gvisor.dev/issue/1193): Move inside fsimpl to avoid another call + // to FileDescription.Stat(). + if opts.FileExec { + // Only a regular file can be executed. + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) + if err != nil { + return nil, err + } + if stat.Mask&linux.STATX_TYPE != 0 { + // This shouldn't happen, but if type can't be retrieved, file can't + // be executed. + return nil, syserror.EACCES + } + if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular { + return nil, syserror.EACCES + } + } + return fd, nil } if !rp.handleError(err) { -- cgit v1.2.3 From 4075de11be44372c454aae7f9650cdc814c52229 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 11:11:55 -0800 Subject: Plumb VFS2 inside the Sentry - Added fsbridge package with interface that can be used to open and read from VFS1 and VFS2 files. - Converted ELF loader to use fsbridge - Added VFS2 types to FSContext - Added vfs.MountNamespace to ThreadGroup Updates #1623 PiperOrigin-RevId: 295183950 --- pkg/sentry/control/BUILD | 5 + pkg/sentry/control/proc.go | 127 +++++++++++++-- pkg/sentry/fs/proc/BUILD | 1 + pkg/sentry/fs/proc/task.go | 17 +- pkg/sentry/fsbridge/BUILD | 24 +++ pkg/sentry/fsbridge/bridge.go | 54 ++++++ pkg/sentry/fsbridge/fs.go | 181 +++++++++++++++++++++ pkg/sentry/fsbridge/vfs.go | 134 +++++++++++++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 4 + pkg/sentry/fsimpl/gofer/filesystem.go | 5 +- pkg/sentry/fsimpl/gofer/gofer.go | 3 + pkg/sentry/fsimpl/kernfs/filesystem.go | 10 +- pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/filesystem.go | 18 +- pkg/sentry/fsimpl/proc/tasks_test.go | 17 +- pkg/sentry/fsimpl/sys/BUILD | 1 + pkg/sentry/fsimpl/sys/sys.go | 3 + pkg/sentry/fsimpl/sys/sys_test.go | 7 +- pkg/sentry/fsimpl/testutil/BUILD | 2 +- pkg/sentry/fsimpl/testutil/kernel.go | 24 +-- pkg/sentry/fsimpl/testutil/testutil.go | 12 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 12 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 3 + pkg/sentry/kernel/BUILD | 2 + pkg/sentry/kernel/fs_context.go | 98 +++++++++-- pkg/sentry/kernel/kernel.go | 145 +++++++++++++---- pkg/sentry/kernel/task.go | 27 +++ pkg/sentry/kernel/task_clone.go | 11 +- pkg/sentry/kernel/task_context.go | 2 +- pkg/sentry/kernel/task_exit.go | 7 + pkg/sentry/kernel/task_log.go | 15 +- pkg/sentry/kernel/task_start.go | 49 +++--- pkg/sentry/kernel/thread_group.go | 6 +- pkg/sentry/loader/BUILD | 2 + pkg/sentry/loader/elf.go | 28 ++-- pkg/sentry/loader/interpreter.go | 6 +- pkg/sentry/loader/loader.go | 179 ++++++-------------- pkg/sentry/loader/vdso.go | 7 +- pkg/sentry/mm/BUILD | 2 +- pkg/sentry/mm/metadata.go | 10 +- pkg/sentry/mm/mm.go | 4 +- pkg/sentry/strace/strace.go | 28 ++++ pkg/sentry/syscalls/linux/BUILD | 1 + pkg/sentry/syscalls/linux/sys_prctl.go | 3 +- pkg/sentry/syscalls/linux/sys_thread.go | 17 +- .../syscalls/linux/vfs2/linux64_override_amd64.go | 106 ++++++++++++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/context.go | 7 +- pkg/sentry/vfs/mount.go | 10 +- pkg/sentry/vfs/options.go | 2 +- pkg/sentry/vfs/vfs.go | 5 +- runsc/boot/loader.go | 11 +- 52 files changed, 1134 insertions(+), 322 deletions(-) create mode 100644 pkg/sentry/fsbridge/BUILD create mode 100644 pkg/sentry/fsbridge/bridge.go create mode 100644 pkg/sentry/fsbridge/fs.go create mode 100644 pkg/sentry/fsbridge/vfs.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index e69496477..d16d78aa5 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -16,10 +16,13 @@ go_library( ], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/fd", + "//pkg/fspath", "//pkg/log", "//pkg/sentry/fs", "//pkg/sentry/fs/host", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -27,8 +30,10 @@ go_library( "//pkg/sentry/state", "//pkg/sentry/strace", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/sentry/watchdog", "//pkg/sync", + "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", ], diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index ced51c66c..8973754c8 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -18,19 +18,26 @@ import ( "bytes" "encoding/json" "fmt" + "path" "sort" "strings" "text/tabwriter" "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/urpc" ) @@ -60,6 +67,12 @@ type ExecArgs struct { // process's MountNamespace. MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 is the mount namespace to execute the new process in. + // A reference on MountNamespace must be held for the lifetime of the + // ExecArgs. If MountNamespace is nil, it will default to the init + // process's MountNamespace. + MountNamespaceVFS2 *vfs.MountNamespace + // WorkingDirectory defines the working directory for the new process. WorkingDirectory string `json:"wd"` @@ -150,6 +163,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI Envv: args.Envv, WorkingDirectory: args.WorkingDirectory, MountNamespace: args.MountNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, Credentials: creds, FDTable: fdTable, Umask: 0022, @@ -166,24 +180,53 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // be donated to the new process in CreateProcess. initArgs.MountNamespace.IncRef() } + if initArgs.MountNamespaceVFS2 != nil { + // initArgs must hold a reference on MountNamespaceVFS2, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } ctx := initArgs.NewContext(proc.Kernel) if initArgs.Filename == "" { - // Get the full path to the filename from the PATH env variable. - paths := fs.GetPath(initArgs.Envv) - mns := initArgs.MountNamespace - if mns == nil { - mns = proc.Kernel.GlobalInit().Leader().MountNamespace() - } - f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) - if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + if kernel.VFS2Enabled { + // Get the full path to the filename from the PATH env variable. + if initArgs.MountNamespaceVFS2 == nil { + // Set initArgs so that 'ctx' returns the namespace. + // + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2() + } + + paths := fs.GetPath(initArgs.Envv) + vfsObj := proc.Kernel.VFS + file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.File = fsbridge.NewVFSFile(file) + } else { + // Get the full path to the filename from the PATH env variable. + paths := fs.GetPath(initArgs.Envv) + if initArgs.MountNamespace == nil { + // Set initArgs so that 'ctx' returns the namespace. + initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace() + + // initArgs must hold a reference on MountNamespace, which will + // be donated to the new process in CreateProcess. + initArgs.MountNamespaceVFS2.IncRef() + } + f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) + if err != nil { + return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + } + initArgs.Filename = f } - initArgs.Filename = f } mounter := fs.FileOwnerFromContext(ctx) + // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. var ttyFile *fs.File for appFD, hostFile := range args.FilePayload.Files { var appFile *fs.File @@ -411,3 +454,67 @@ func ttyName(tty *kernel.TTY) string { } return fmt.Sprintf("pts/%d", tty.Index) } + +// ResolveExecutablePath resolves the given executable name given a set of +// paths that might contain it. +func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + creds := auth.CredentialsFromContext(ctx) + + // Absolute paths can be used directly. + if path.IsAbs(name) { + return openExecutable(ctx, vfsObj, creds, root, name) + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(name, '/') > 0 { + if len(wd) == 0 { + wd = "/" + } + if !path.IsAbs(wd) { + return nil, fmt.Errorf("working directory %q must be absolute", wd) + } + return openExecutable(ctx, vfsObj, creds, root, path.Join(wd, name)) + } + + // Otherwise, we must lookup the name in the paths, starting from the + // calling context's root directory. + for _, p := range paths { + if !path.IsAbs(p) { + // Relative paths aren't safe, no one should be using them. + log.Warningf("Skipping relative path %q in $PATH", p) + continue + } + + binPath := path.Join(p, name) + f, err := openExecutable(ctx, vfsObj, creds, root, binPath) + if err != nil { + return nil, err + } + if f == nil { + continue // Not found/no access. + } + return f, nil + } + return nil, syserror.ENOENT +} + +func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) { + pop := vfs.PathOperation{ + Root: root, + Start: root, // binPath is absolute, Start can be anything. + Path: fspath.Parse(path), + FollowFinalSymlink: true, + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + } + f, err := vfsObj.OpenAt(ctx, creds, &pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + return nil, nil + } + return f, err +} diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD index 280093c5e..77c2c5c0e 100644 --- a/pkg/sentry/fs/proc/BUILD +++ b/pkg/sentry/fs/proc/BUILD @@ -36,6 +36,7 @@ go_library( "//pkg/sentry/fs/proc/device", "//pkg/sentry/fs/proc/seqfile", "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index ca020e11e..8ab8d8a02 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/proc/device" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/fs/ramfs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -249,7 +250,7 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { return newProcInode(t, exeSymlink, msrc, fs.Symlink, t) } -func (e *exe) executable() (d *fs.Dirent, err error) { +func (e *exe) executable() (file fsbridge.File, err error) { e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { @@ -262,8 +263,8 @@ func (e *exe) executable() (d *fs.Dirent, err error) { // The MemoryManager may be destroyed, in which case // MemoryManager.destroy will simply set the executable to nil // (with locks held). - d = mm.Executable() - if d == nil { + file = mm.Executable() + if file == nil { err = syserror.ENOENT } }) @@ -283,15 +284,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { } defer exec.DecRef() - root := fs.RootFromContext(ctx) - if root == nil { - // This doesn't correspond to anything in Linux because the vfs is - // global there. - return "", syserror.EINVAL - } - defer root.DecRef() - n, _ := exec.FullName(root) - return n, nil + return exec.PathnameWithDeleted(ctx), nil } // namespaceSymlink represents a symlink in the namespacefs, such as the files diff --git a/pkg/sentry/fsbridge/BUILD b/pkg/sentry/fsbridge/BUILD new file mode 100644 index 000000000..6c798f0bd --- /dev/null +++ b/pkg/sentry/fsbridge/BUILD @@ -0,0 +1,24 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fsbridge", + srcs = [ + "bridge.go", + "fs.go", + "vfs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go new file mode 100644 index 000000000..8e7590721 --- /dev/null +++ b/pkg/sentry/fsbridge/bridge.go @@ -0,0 +1,54 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsbridge provides common interfaces to bridge between VFS1 and VFS2 +// files. +package fsbridge + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// File provides a common interface to bridge between VFS1 and VFS2 files. +type File interface { + // PathnameWithDeleted returns an absolute pathname to vd, consistent with + // Linux's d_path(). In particular, if vd.Dentry() has been disowned, + // PathnameWithDeleted appends " (deleted)" to the returned pathname. + PathnameWithDeleted(ctx context.Context) string + + // ReadFull read all contents from the file. + ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) + + // ConfigureMMap mutates opts to implement mmap(2) for the file. + ConfigureMMap(context.Context, *memmap.MMapOpts) error + + // Type returns the file type, e.g. linux.S_IFREG. + Type(context.Context) (linux.FileMode, error) + + // IncRef increments reference. + IncRef() + + // DecRef decrements reference. + DecRef() +} + +// Lookup provides a common interface to open files. +type Lookup interface { + // OpenPath opens a file. + OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) +} diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go new file mode 100644 index 000000000..093ce1fb3 --- /dev/null +++ b/pkg/sentry/fsbridge/fs.go @@ -0,0 +1,181 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + "strings" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fsFile implements File interface over fs.File. +// +// +stateify savable +type fsFile struct { + file *fs.File +} + +var _ File = (*fsFile)(nil) + +// NewFSFile creates a new File over fs.File. +func NewFSFile(file *fs.File) File { + return &fsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { + root := fs.RootFromContext(ctx) + if root == nil { + // This doesn't correspond to anything in Linux because the vfs is + // global there. + return "" + } + defer root.DecRef() + + name, _ := f.file.Dirent.FullName(root) + return name +} + +// ReadFull implements File. +func (f *fsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.Preadv(ctx, dst, offset+total) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *fsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. +func (f *fsFile) Type(context.Context) (linux.FileMode, error) { + return linux.FileMode(f.file.Dirent.Inode.StableAttr.Type.LinuxType()), nil +} + +// IncRef implements File. +func (f *fsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *fsFile) DecRef() { + f.file.DecRef() +} + +// fsLookup implements Lookup interface using fs.File. +// +// +stateify savable +type fsLookup struct { + mntns *fs.MountNamespace + + root *fs.Dirent + workingDir *fs.Dirent +} + +var _ Lookup = (*fsLookup)(nil) + +// NewFSLookup creates a new Lookup using VFS1. +func NewFSLookup(mntns *fs.MountNamespace, root, workingDir *fs.Dirent) Lookup { + return &fsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, remainingTraversals *uint, resolveFinal bool) (File, error) { + var d *fs.Dirent + var err error + if resolveFinal { + d, err = l.mntns.FindInode(ctx, l.root, l.workingDir, path, remainingTraversals) + } else { + d, err = l.mntns.FindLink(ctx, l.root, l.workingDir, path, remainingTraversals) + } + if err != nil { + return nil, err + } + defer d.DecRef() + + if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { + return nil, syserror.ELOOP + } + + fsPerm := openOptionsToPermMask(&opts) + if err := d.Inode.CheckPermission(ctx, fsPerm); err != nil { + return nil, err + } + + // If they claim it's a directory, then make sure. + if strings.HasSuffix(path, "/") { + if d.Inode.StableAttr.Type != fs.Directory { + return nil, syserror.ENOTDIR + } + } + + if opts.FileExec && d.Inode.StableAttr.Type != fs.RegularFile { + ctx.Infof("%q is not a regular file: %v", path, d.Inode.StableAttr.Type) + return nil, syserror.EACCES + } + + f, err := d.Inode.GetFile(ctx, d, flagsToFileFlags(opts.Flags)) + if err != nil { + return nil, err + } + + return &fsFile{file: f}, nil +} + +func openOptionsToPermMask(opts *vfs.OpenOptions) fs.PermMask { + mode := opts.Flags & linux.O_ACCMODE + return fs.PermMask{ + Read: mode == linux.O_RDONLY || mode == linux.O_RDWR, + Write: mode == linux.O_WRONLY || mode == linux.O_RDWR, + Execute: opts.FileExec, + } +} + +func flagsToFileFlags(flags uint32) fs.FileFlags { + return fs.FileFlags{ + Direct: flags&linux.O_DIRECT != 0, + DSync: flags&(linux.O_DSYNC|linux.O_SYNC) != 0, + Sync: flags&linux.O_SYNC != 0, + NonBlocking: flags&linux.O_NONBLOCK != 0, + Read: (flags & linux.O_ACCMODE) != linux.O_WRONLY, + Write: (flags & linux.O_ACCMODE) != linux.O_RDONLY, + Append: flags&linux.O_APPEND != 0, + Directory: flags&linux.O_DIRECTORY != 0, + Async: flags&linux.O_ASYNC != 0, + LargeFile: flags&linux.O_LARGEFILE != 0, + Truncate: flags&linux.O_TRUNC != 0, + } +} diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go new file mode 100644 index 000000000..e657c39bc --- /dev/null +++ b/pkg/sentry/fsbridge/vfs.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsbridge + +import ( + "io" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" +) + +// fsFile implements File interface over vfs.FileDescription. +// +// +stateify savable +type vfsFile struct { + file *vfs.FileDescription +} + +var _ File = (*vfsFile)(nil) + +// NewVFSFile creates a new File over fs.File. +func NewVFSFile(file *vfs.FileDescription) File { + return &vfsFile{file: file} +} + +// PathnameWithDeleted implements File. +func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + + vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) + return name +} + +// ReadFull implements File. +func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + var total int64 + for dst.NumBytes() > 0 { + n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{}) + total += n + if err == io.EOF && total != 0 { + return total, io.ErrUnexpectedEOF + } else if err != nil { + return total, err + } + dst = dst.DropFirst64(n) + } + return total, nil +} + +// ConfigureMMap implements File. +func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + return f.file.ConfigureMMap(ctx, opts) +} + +// Type implements File. +func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) { + stat, err := f.file.Stat(ctx, vfs.StatOptions{}) + if err != nil { + return 0, err + } + return linux.FileMode(stat.Mode).FileType(), nil +} + +// IncRef implements File. +func (f *vfsFile) IncRef() { + f.file.IncRef() +} + +// DecRef implements File. +func (f *vfsFile) DecRef() { + f.file.DecRef() +} + +// fsLookup implements Lookup interface using fs.File. +// +// +stateify savable +type vfsLookup struct { + mntns *vfs.MountNamespace + + root vfs.VirtualDentry + workingDir vfs.VirtualDentry +} + +var _ Lookup = (*vfsLookup)(nil) + +// NewVFSLookup creates a new Lookup using VFS2. +func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup { + return &vfsLookup{ + mntns: mntns, + root: root, + workingDir: workingDir, + } +} + +// OpenPath implements Lookup. +// +// remainingTraversals is not configurable in VFS2, all callers are using the +// default anyways. +// +// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission. +func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { + vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() + creds := auth.CredentialsFromContext(ctx) + pop := &vfs.PathOperation{ + Root: l.root, + Start: l.root, + Path: fspath.Parse(path), + FollowFinalSymlink: resolveFinal, + } + fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) + if err != nil { + return nil, err + } + return &vfsFile{file: fd}, nil +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index e03a0c665..abd4f24e7 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) +// Name is the default filesystem name. +const Name = "devtmpfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct { initOnce sync.Once @@ -107,6 +110,7 @@ func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { func (ac *accessorContext) Value(key interface{}) interface{} { switch key { case vfs.CtxMountNamespace: + ac.a.mntns.IncRef() return ac.a.mntns case vfs.CtxRoot: ac.a.root.IncRef() diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 138adb9f7..5cfb0dc4c 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -400,6 +400,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() childVFSD := parent.vfsd.Child(name) @@ -934,7 +935,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if oldParent == newParent && oldName == newName { return nil } - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), &renamed.vfsd, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index d0552bd99..d00850e25 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -52,6 +52,9 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// Name is the default filesystem name. +const Name = "9p" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index ee98eb66a..292f58afd 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -544,6 +544,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() virtfs := rp.VirtualFilesystem() srcDirDentry := srcDirVFSD.Impl().(*Dentry) @@ -595,7 +596,10 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil { @@ -697,7 +701,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry := vfsd.Parent().Impl().(*Dentry) parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() - if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil { diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 12aac2e6a..a83245866 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -14,6 +14,7 @@ go_library( "tasks_net.go", "tasks_sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 11477b6a9..5c19d5522 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -26,15 +26,18 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs" ) -// procFSType is the factory class for procfs. +// Name is the default filesystem name. +const Name = "proc" + +// FilesystemType is the factory class for procfs. // // +stateify savable -type procFSType struct{} +type FilesystemType struct{} -var _ vfs.FilesystemType = (*procFSType)(nil) +var _ vfs.FilesystemType = (*FilesystemType)(nil) // GetFilesystem implements vfs.FilesystemType. -func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") @@ -47,12 +50,13 @@ func (ft *procFSType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile procfs := &kernfs.Filesystem{} procfs.VFSFilesystem().Init(vfsObj, procfs) - var data *InternalData + var cgroups map[string]string if opts.InternalData != nil { - data = opts.InternalData.(*InternalData) + data := opts.InternalData.(*InternalData) + cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, data.Cgroups) + _, dentry := newTasksInode(procfs, k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 6fc3524db..96c72cbc9 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -90,8 +90,7 @@ func setup(t *testing.T) *testutil.System { ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) fsOpts := vfs.GetFilesystemOptions{ @@ -102,11 +101,11 @@ func setup(t *testing.T) *testutil.System { }, }, } - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &fsOpts) + mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } - return testutil.NewSystem(ctx, t, vfsObj, mntns) + return testutil.NewSystem(ctx, t, k.VFS, mntns) } func TestTasksEmpty(t *testing.T) { @@ -131,7 +130,7 @@ func TestTasks(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -213,7 +212,7 @@ func TestTasksOffset(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) for i := 0; i < 3; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc); err != nil { + if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil { t.Fatalf("CreateTask(): %v", err) } } @@ -337,7 +336,7 @@ func TestTask(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - _, err := testutil.CreateTask(s.Ctx, "name", tc) + _, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -352,7 +351,7 @@ func TestProcSelf(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, "name", tc) + task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } @@ -433,7 +432,7 @@ func TestTree(t *testing.T) { var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) - task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc) + task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root) if err != nil { t.Fatalf("CreateTask(): %v", err) } diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 66c0d8bc8..a741e2bb6 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -7,6 +7,7 @@ go_library( srcs = [ "sys.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index d693fceae..c36c4fa11 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -28,6 +28,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. +const Name = "sysfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 8b1cf0bd0..5d1ba5867 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -34,16 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System { } ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - v := vfs.New() - v.MustRegisterFilesystemType("sysfs", sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := v.NewMountNamespace(ctx, creds, "", "sysfs", &vfs.GetFilesystemOptions{}) + mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } - return testutil.NewSystem(ctx, t, v, mns) + return testutil.NewSystem(ctx, t, k.VFS, mns) } func TestReadCPUFile(t *testing.T) { diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index efd5974c4..e4f36f4ae 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,7 +16,7 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", - "//pkg/sentry/fs", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 89f8c4915..a91b3ec4d 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/memutil" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" @@ -33,6 +33,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" // Platforms are plugable. _ "gvisor.dev/gvisor/pkg/sentry/platform/kvm" @@ -99,26 +100,27 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } - ctx := k.SupervisorContext() + kernel.VFS2Enabled = true + + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + k.VFS = vfsObj - // Create mount namespace without root as it's the minimum required to create - // the global thread group. - mntns, err := fs.NewMountNamespace(ctx, nil) - if err != nil { - return nil, err - } ls, err := limits.NewLinuxLimitSet() if err != nil { return nil, err } - tg := k.NewThreadGroup(mntns, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) + tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls) k.TestOnly_SetGlobalInit(tg) return k, nil } // CreateTask creates a new bare bones task for tests. -func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kernel.Task, error) { +func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) config := &kernel.TaskConfig{ Kernel: k, @@ -129,6 +131,8 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup) (*kern UTSNamespace: kernel.UTSNamespaceFromContext(ctx), IPCNamespace: kernel.IPCNamespaceFromContext(ctx), AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), + MountNamespaceVFS2: mntns, + FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), } return k.TaskSet().NewTask(config) } diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 69fd84ddd..b97e3534a 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -41,12 +41,12 @@ type System struct { Creds *auth.Credentials VFS *vfs.VirtualFilesystem Root vfs.VirtualDentry - mns *vfs.MountNamespace + MntNs *vfs.MountNamespace } // NewSystem constructs a System. // -// Precondition: Caller must hold a reference on mns, whose ownership +// Precondition: Caller must hold a reference on MntNs, whose ownership // is transferred to the new System. func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System { s := &System{ @@ -54,7 +54,7 @@ func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns Ctx: ctx, Creds: auth.CredentialsFromContext(ctx), VFS: v, - mns: mns, + MntNs: mns, Root: mns.Root(), } return s @@ -75,7 +75,7 @@ func (s *System) WithSubtest(t *testing.T) *System { Ctx: s.Ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -90,7 +90,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { Ctx: ctx, Creds: s.Creds, VFS: s.VFS, - mns: s.mns, + MntNs: s.MntNs, Root: s.Root, } } @@ -98,7 +98,7 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { s.Root.DecRef() - s.mns.DecRef() // Reference on mns passed to NewSystem. + s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 8785452b6..7f7b791c4 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -486,7 +486,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa vfsObj := rp.VirtualFilesystem() oldParentDir := oldParent.inode.impl.(*directory) newParentDir := newParent.inode.impl.(*directory) - if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { return err } if replaced != nil { @@ -543,7 +545,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) @@ -631,7 +635,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() - if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil { + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { return err } parent.inode.impl.(*directory).childList.Remove(child) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2108d0f4d..c5bb17562 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -40,6 +40,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the default filesystem name. +const Name = "tmpfs" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 2231d6973..46306945f 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -157,6 +157,7 @@ go_library( "//pkg/context", "//pkg/cpuid", "//pkg/eventchannel", + "//pkg/fspath", "//pkg/log", "//pkg/metric", "//pkg/refs", @@ -167,6 +168,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", + "//pkg/sentry/fsbridge", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 2448c1d99..7218aa24e 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) @@ -37,10 +38,16 @@ type FSContext struct { // destroyed. root *fs.Dirent + // rootVFS2 is the filesystem root. + rootVFS2 vfs.VirtualDentry + // cwd is the current working directory. Will be nil iff the FSContext // has been destroyed. cwd *fs.Dirent + // cwdVFS2 is the current working directory. + cwdVFS2 vfs.VirtualDentry + // umask is the current file mode creation mask. When a thread using this // context invokes a syscall that creates a file, bits set in umask are // removed from the permissions that the file is created with. @@ -60,6 +67,19 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { return &f } +// NewFSContextVFS2 returns a new filesystem context. +func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { + root.IncRef() + cwd.IncRef() + f := FSContext{ + rootVFS2: root, + cwdVFS2: cwd, + umask: umask, + } + f.EnableLeakCheck("kernel.FSContext") + return &f +} + // destroy is the destructor for an FSContext. // // This will call DecRef on both root and cwd Dirents. If either call to @@ -75,11 +95,17 @@ func (f *FSContext) destroy() { f.mu.Lock() defer f.mu.Unlock() - f.root.DecRef() - f.root = nil - - f.cwd.DecRef() - f.cwd = nil + if VFS2Enabled { + f.rootVFS2.DecRef() + f.rootVFS2 = vfs.VirtualDentry{} + f.cwdVFS2.DecRef() + f.cwdVFS2 = vfs.VirtualDentry{} + } else { + f.root.DecRef() + f.root = nil + f.cwd.DecRef() + f.cwd = nil + } } // DecRef implements RefCounter.DecRef with destructor f.destroy. @@ -93,12 +119,21 @@ func (f *FSContext) DecRef() { func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() - f.cwd.IncRef() - f.root.IncRef() + + if VFS2Enabled { + f.cwdVFS2.IncRef() + f.rootVFS2.IncRef() + } else { + f.cwd.IncRef() + f.root.IncRef() + } + return &FSContext{ - cwd: f.cwd, - root: f.root, - umask: f.umask, + cwd: f.cwd, + root: f.root, + cwdVFS2: f.cwdVFS2, + rootVFS2: f.rootVFS2, + umask: f.umask, } } @@ -109,12 +144,23 @@ func (f *FSContext) Fork() *FSContext { func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() - if f.cwd != nil { - f.cwd.IncRef() - } + + f.cwd.IncRef() return f.cwd } +// WorkingDirectoryVFS2 returns the current working directory. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.cwdVFS2.IncRef() + return f.cwdVFS2 +} + // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // @@ -137,6 +183,20 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old.DecRef() } +// SetWorkingDirectoryVFS2 sets the current working directory. +// This will take an extra reference on the VirtualDentry. +// +// This is not a valid call after destroy. +func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { + f.mu.Lock() + defer f.mu.Unlock() + + old := f.cwdVFS2 + f.cwdVFS2 = d + d.IncRef() + old.DecRef() +} + // RootDirectory returns the current filesystem root. // // This will return nil if called after destroy(), otherwise it will return a @@ -150,6 +210,18 @@ func (f *FSContext) RootDirectory() *fs.Dirent { return f.root } +// RootDirectoryVFS2 returns the current filesystem root. +// +// This will return nil if called after destroy(), otherwise it will return a +// Dirent with a reference taken. +func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { + f.mu.Lock() + defer f.mu.Unlock() + + f.rootVFS2.IncRef() + return f.rootVFS2 +} + // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3ee760ba2..2665f057c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -43,11 +43,13 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -71,6 +73,10 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" ) +// VFS2Enabled is set to true when VFS2 is enabled. Added as a global for allow +// easy access everywhere. To be removed once VFS2 becomes the default. +var VFS2Enabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // @@ -238,6 +244,9 @@ type Kernel struct { // SpecialOpts contains special kernel options. SpecialOpts + + // VFS keeps the filesystem state used across the kernel. + VFS *vfs.VirtualFilesystem } // InitKernelArgs holds arguments to Init. @@ -624,7 +633,7 @@ type CreateProcessArgs struct { // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". - File *fs.File + File fsbridge.File // Argvv is a list of arguments. Argv []string @@ -673,6 +682,13 @@ type CreateProcessArgs struct { // increment it). MountNamespace *fs.MountNamespace + // MountNamespaceVFS2 optionally contains the mount namespace for this + // process. If nil, the init process's mount namespace is used. + // + // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. + // increment it). + MountNamespaceVFS2 *vfs.MountNamespace + // ContainerID is the container that the process belongs to. ContainerID string } @@ -711,11 +727,22 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.args.Credentials case fs.CtxRoot: if ctx.args.MountNamespace != nil { - // MountNamespace.Root() will take a reference on the root - // dirent for us. + // MountNamespace.Root() will take a reference on the root dirent for us. return ctx.args.MountNamespace.Root() } return nil + case vfs.CtxRoot: + if ctx.args.MountNamespaceVFS2 == nil { + return nil + } + // MountNamespaceVFS2.Root() takes a reference on the root dirent for us. + return ctx.args.MountNamespaceVFS2.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2 takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: @@ -757,34 +784,77 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) - // Grab the mount namespace. - mounts := args.MountNamespace - if mounts == nil { - mounts = k.GlobalInit().Leader().MountNamespace() - mounts.IncRef() - } - - tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) ctx := args.NewContext(k) - // Get the root directory from the MountNamespace. - root := mounts.Root() - // The call to newFSContext below will take a reference on root, so we - // don't need to hold this one. - defer root.DecRef() - - // Grab the working directory. - remainingTraversals := uint(args.MaxSymlinkTraversals) - wd := root // Default. - if args.WorkingDirectory != "" { - var err error - wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) - if err != nil { - return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + var ( + opener fsbridge.Lookup + fsContext *FSContext + mntns *fs.MountNamespace + ) + + if VFS2Enabled { + mntnsVFS2 := args.MountNamespaceVFS2 + if mntnsVFS2 == nil { + // MountNamespaceVFS2 adds a reference to the namespace, which is + // transferred to the new process. + mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + } + // Get the root directory from the MountNamespace. + root := args.MountNamespaceVFS2.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + wd := root // Default. + if args.WorkingDirectory != "" { + pop := vfs.PathOperation{ + Root: root, + Start: wd, + Path: fspath.Parse(args.WorkingDirectory), + FollowFinalSymlink: true, + } + var err error + wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) + fsContext = NewFSContextVFS2(root, wd, args.Umask) + + } else { + mntns = args.MountNamespace + if mntns == nil { + mntns = k.GlobalInit().Leader().MountNamespace() + mntns.IncRef() } - defer wd.DecRef() + // Get the root directory from the MountNamespace. + root := mntns.Root() + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. + defer root.DecRef() + + // Grab the working directory. + remainingTraversals := args.MaxSymlinkTraversals + wd := root // Default. + if args.WorkingDirectory != "" { + var err error + wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) + if err != nil { + return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) + } + defer wd.DecRef() + } + opener = fsbridge.NewFSLookup(mntns, root, wd) + fsContext = newFSContext(root, wd, args.Umask) } + tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + // Check which file to start from. switch { case args.Filename != "": @@ -805,11 +875,9 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } // Create a fresh task context. - remainingTraversals = uint(args.MaxSymlinkTraversals) + remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ - Mounts: mounts, - Root: root, - WorkingDirectory: wd, + Opener: opener, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, @@ -834,13 +902,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, Kernel: k, ThreadGroup: tg, TaskContext: tc, - FSContext: newFSContext(root, wd, args.Umask), + FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, + MountNamespaceVFS2: args.MountNamespaceVFS2, ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(config) @@ -1378,6 +1447,20 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return ctx.k.globalInit.mounts.Root() } return nil + case vfs.CtxRoot: + if ctx.k.globalInit == nil { + return vfs.VirtualDentry{} + } + mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() + defer mntns.DecRef() + // Root() takes a reference on the root dirent for us. + return mntns.Root() + case vfs.CtxMountNamespace: + if ctx.k.globalInit == nil { + return nil + } + // MountNamespaceVFS2() takes a reference for us. + return ctx.k.GlobalInit().Leader().MountNamespaceVFS2() case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 981e8c7fe..a3443ff21 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -424,6 +424,11 @@ type Task struct { // abstractSockets is protected by mu. abstractSockets *AbstractSocketNamespace + // mountNamespaceVFS2 is the task's mount namespace. + // + // It is protected by mu. It is owned by the task goroutine. + mountNamespaceVFS2 *vfs.MountNamespace + // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. @@ -638,6 +643,11 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsContext.RootDirectory() + case vfs.CtxRoot: + return t.fsContext.RootDirectoryVFS2() + case vfs.CtxMountNamespace: + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: @@ -701,6 +711,14 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { + if VFS2Enabled { + realRoot := t.mountNamespaceVFS2.Root() + defer realRoot.DecRef() + root := t.fsContext.RootDirectoryVFS2() + defer root.DecRef() + return root != realRoot + } + realRoot := t.tg.mounts.Root() defer realRoot.DecRef() root := t.fsContext.RootDirectory() @@ -796,6 +814,15 @@ func (t *Task) MountNamespace() *fs.MountNamespace { return t.tg.mounts } +// MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the +// returned mount namespace. +func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { + t.mu.Lock() + defer t.mu.Unlock() + t.mountNamespaceVFS2.IncRef() + return t.mountNamespaceVFS2 +} + // AbstractSockets returns t's AbstractSocketNamespace. func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 53d4d211b..ba74b4c1c 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -199,6 +199,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } + // TODO(b/63601033): Implement CLONE_NEWNS. + mntnsVFS2 := t.mountNamespaceVFS2 + if mntnsVFS2 != nil { + mntnsVFS2.IncRef() + } + tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err @@ -241,7 +247,9 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqAddr := usermem.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { - tg.mounts.IncRef() + if tg.mounts != nil { + tg.mounts.IncRef() + } sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() @@ -265,6 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, + MountNamespaceVFS2: mntnsVFS2, RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go index 2d6e7733c..2be982684 100644 --- a/pkg/sentry/kernel/task_context.go +++ b/pkg/sentry/kernel/task_context.go @@ -136,7 +136,7 @@ func (t *Task) Stack() *arch.Stack { func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskContext, *syserr.Error) { // If File is not nil, we should load that instead of resolving Filename. if args.File != nil { - args.Filename = args.File.MappedName(ctx) + args.Filename = args.File.PathnameWithDeleted(ctx) } // Prepare a new user address space to load into. diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 435761e5a..c4ade6e8e 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,6 +269,13 @@ func (*runExitMain) execute(t *Task) taskRunState { t.fsContext.DecRef() t.fdTable.DecRef() + t.mu.Lock() + if t.mountNamespaceVFS2 != nil { + t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2 = nil + } + t.mu.Unlock() + // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index 41259210c..6d737d3e5 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -198,18 +198,11 @@ func (t *Task) traceExecEvent(tc *TaskContext) { if !trace.IsEnabled() { return } - d := tc.MemoryManager.Executable() - if d == nil { + file := tc.MemoryManager.Executable() + if file == nil { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer d.DecRef() - root := t.fsContext.RootDirectory() - if root == nil { - trace.Logf(t.traceContext, traceCategory, "exec: << no root directory >>") - return - } - defer root.DecRef() - n, _ := d.FullName(root) - trace.Logf(t.traceContext, traceCategory, "exec: %s", n) + defer file.DecRef() + trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index de838beef..f9236a842 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -80,6 +81,9 @@ type TaskConfig struct { // AbstractSocketNamespace is the AbstractSocketNamespace of the new task. AbstractSocketNamespace *AbstractSocketNamespace + // MountNamespaceVFS2 is the MountNamespace of the new task. + MountNamespaceVFS2 *vfs.MountNamespace + // RSeqAddr is a pointer to the the userspace linux.RSeq structure. RSeqAddr usermem.Addr @@ -116,28 +120,29 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { parent: cfg.Parent, children: make(map[*Task]struct{}), }, - runState: (*runApp)(nil), - interruptChan: make(chan struct{}, 1), - signalMask: cfg.SignalMask, - signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, - tc: *tc, - fsContext: cfg.FSContext, - fdTable: cfg.FDTable, - p: cfg.Kernel.Platform.NewContext(), - k: cfg.Kernel, - ptraceTracees: make(map[*Task]struct{}), - allowedCPUMask: cfg.AllowedCPUMask.Copy(), - ioUsage: &usage.IO{}, - niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, - utsns: cfg.UTSNamespace, - ipcns: cfg.IPCNamespace, - abstractSockets: cfg.AbstractSocketNamespace, - rseqCPU: -1, - rseqAddr: cfg.RSeqAddr, - rseqSignature: cfg.RSeqSignature, - futexWaiter: futex.NewWaiter(), - containerID: cfg.ContainerID, + runState: (*runApp)(nil), + interruptChan: make(chan struct{}, 1), + signalMask: cfg.SignalMask, + signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable}, + tc: *tc, + fsContext: cfg.FSContext, + fdTable: cfg.FDTable, + p: cfg.Kernel.Platform.NewContext(), + k: cfg.Kernel, + ptraceTracees: make(map[*Task]struct{}), + allowedCPUMask: cfg.AllowedCPUMask.Copy(), + ioUsage: &usage.IO{}, + niceness: cfg.Niceness, + netns: cfg.NetworkNamespaced, + utsns: cfg.UTSNamespace, + ipcns: cfg.IPCNamespace, + abstractSockets: cfg.AbstractSocketNamespace, + mountNamespaceVFS2: cfg.MountNamespaceVFS2, + rseqCPU: -1, + rseqAddr: cfg.RSeqAddr, + rseqSignature: cfg.RSeqSignature, + futexWaiter: futex.NewWaiter(), + containerID: cfg.ContainerID, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 768e958d2..268f62e9d 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -256,7 +256,7 @@ type ThreadGroup struct { tty *TTY } -// NewThreadGroup returns a new, empty thread group in PID namespace ns. The +// NewThreadGroup returns a new, empty thread group in PID namespace pidns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. @@ -317,7 +317,9 @@ func (tg *ThreadGroup) release() { for _, it := range its { it.DestroyTimer() } - tg.mounts.DecRef() + if tg.mounts != nil { + tg.mounts.DecRef() + } } // forEachChildThreadGroupLocked indicates over all child ThreadGroups. diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index 23790378a..c6aa65f28 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -33,6 +33,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/anon", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/memmap", @@ -40,6 +41,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/uniqueid", "//pkg/sentry/usage", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 122ed05c2..616fafa2c 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -27,7 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -97,11 +97,11 @@ type elfInfo struct { // accepts from the ELF, and it doesn't parse unnecessary parts of the file. // // ctx may be nil if f does not need it. -func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { +func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. var ident [elf.EI_NIDENT]byte - _, err := readFull(ctx, f, usermem.BytesIOSequence(ident[:]), 0) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0) if err != nil { log.Infof("Error reading ELF ident: %v", err) // The entire ident array always exists. @@ -137,7 +137,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { var hdr elf.Header64 hdrBuf := make([]byte, header64Size) - _, err = readFull(ctx, f, usermem.BytesIOSequence(hdrBuf), 0) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0) if err != nil { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. @@ -187,7 +187,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { } phdrBuf := make([]byte, totalPhdrSize) - _, err = readFull(ctx, f, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) + _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) if err != nil { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. @@ -227,7 +227,7 @@ func parseHeader(ctx context.Context, f *fs.File) (elfInfo, error) { // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. -func mapSegment(ctx context.Context, m *mm.MemoryManager, f *fs.File, phdr *elf.ProgHeader, offset usermem.Addr) error { +func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset usermem.Addr) error { // We must make a page-aligned mapping. adjust := usermem.Addr(phdr.Vaddr).PageOffset() @@ -395,7 +395,7 @@ type loadedELF struct { // // Preconditions: // * f is an ELF file -func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { +func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset usermem.Addr) (loadedELF, error) { first := true var start, end usermem.Addr var interpreter string @@ -431,7 +431,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el } path := make([]byte, phdr.Filesz) - _, err := readFull(ctx, f, usermem.BytesIOSequence(path), int64(phdr.Off)) + _, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off)) if err != nil { // If an interpreter was specified, it should exist. ctx.Infof("Error reading PT_INTERP path: %v", err) @@ -564,7 +564,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el // Preconditions: // * f is an ELF file // * f is the first ELF loaded into m -func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f *fs.File) (loadedELF, arch.Context, error) { +func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) { info, err := parseHeader(ctx, f) if err != nil { ctx.Infof("Failed to parse initial ELF: %v", err) @@ -602,7 +602,7 @@ func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureS // // Preconditions: // * f is an ELF file -func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, initial loadedELF) (loadedELF, error) { +func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { if err == syserror.ENOEXEC { @@ -649,16 +649,14 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error // Refresh the traversal limit. *args.RemainingTraversals = linux.MaxSymlinkTraversals args.Filename = bin.interpreter - d, i, err := openPath(ctx, args) + intFile, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer i.DecRef() - // We don't need the Dirent. - d.DecRef() + defer intFile.DecRef() - interp, err = loadInterpreterELF(ctx, args.MemoryManager, i, bin) + interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err diff --git a/pkg/sentry/loader/interpreter.go b/pkg/sentry/loader/interpreter.go index 098a45d36..3886b4d33 100644 --- a/pkg/sentry/loader/interpreter.go +++ b/pkg/sentry/loader/interpreter.go @@ -19,7 +19,7 @@ import ( "io" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -37,9 +37,9 @@ const ( ) // parseInterpreterScript returns the interpreter path and argv. -func parseInterpreterScript(ctx context.Context, filename string, f *fs.File, argv []string) (newpath string, newargv []string, err error) { +func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) { line := make([]byte, interpMaxLineLength) - n, err := readFull(ctx, f, usermem.BytesIOSequence(line), 0) + n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0) // Short read is OK. if err != nil && err != io.ErrUnexpectedEOF { if err == io.EOF { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 9a613d6b7..d6675b8f0 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -20,7 +20,6 @@ import ( "fmt" "io" "path" - "strings" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/abi/linux" @@ -29,8 +28,10 @@ import ( "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -41,16 +42,6 @@ type LoadArgs struct { // MemoryManager is the memory manager to load the executable into. MemoryManager *mm.MemoryManager - // Mounts is the mount namespace in which to look up Filename. - Mounts *fs.MountNamespace - - // Root is the root directory under which to look up Filename. - Root *fs.Dirent - - // WorkingDirectory is the working directory under which to look up - // Filename. - WorkingDirectory *fs.Dirent - // RemainingTraversals is the maximum number of symlinks to follow to // resolve Filename. This counter is passed by reference to keep it // updated throughout the call stack. @@ -65,7 +56,12 @@ type LoadArgs struct { // File is an open fs.File object of the executable. If File is not // nil, then File will be loaded and Filename will be ignored. - File *fs.File + // + // The caller is responsible for checking that the user can execute this file. + File fsbridge.File + + // Opener is used to open the executable file when 'File' is nil. + Opener fsbridge.Lookup // CloseOnExec indicates that the executable (or one of its parent // directories) was opened with O_CLOEXEC. If the executable is an @@ -106,103 +102,32 @@ func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset in // installed in the Task FDTable. The caller takes ownership of both. // // args.Filename must be a readable, executable, regular file. -func openPath(ctx context.Context, args LoadArgs) (*fs.Dirent, *fs.File, error) { +func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) { if args.Filename == "" { ctx.Infof("cannot open empty name") - return nil, nil, syserror.ENOENT - } - - var d *fs.Dirent - var err error - if args.ResolveFinal { - d, err = args.Mounts.FindInode(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } else { - d, err = args.Mounts.FindLink(ctx, args.Root, args.WorkingDirectory, args.Filename, args.RemainingTraversals) - } - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer d.DecRef() - - if !args.ResolveFinal && fs.IsSymlink(d.Inode.StableAttr) { - return nil, nil, syserror.ELOOP - } - - if err := checkPermission(ctx, d); err != nil { - return nil, nil, err - } - - // If they claim it's a directory, then make sure. - // - // N.B. we reject directories below, but we must first reject - // non-directories passed as directories. - if strings.HasSuffix(args.Filename, "/") && !fs.IsDir(d.Inode.StableAttr) { - return nil, nil, syserror.ENOTDIR - } - - if err := checkIsRegularFile(ctx, d, args.Filename); err != nil { - return nil, nil, err - } - - f, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) - if err != nil { - return nil, nil, err - } - // Defer a DecRef for the sake of failure cases. - defer f.DecRef() - - if err := checkPread(ctx, f, args.Filename); err != nil { - return nil, nil, err - } - - d.IncRef() - f.IncRef() - return d, f, err -} - -// checkFile performs checks on a file to be executed. -func checkFile(ctx context.Context, f *fs.File, filename string) error { - if err := checkPermission(ctx, f.Dirent); err != nil { - return err - } - - if err := checkIsRegularFile(ctx, f.Dirent, filename); err != nil { - return err + return nil, syserror.ENOENT } - return checkPread(ctx, f, filename) -} - -// checkPermission checks whether the file is readable and executable. -func checkPermission(ctx context.Context, d *fs.Dirent) error { - perms := fs.PermMask{ - // TODO(gvisor.dev/issue/160): Linux requires only execute - // permission, not read. However, our backing filesystems may - // prevent us from reading the file without read permission. - // - // Additionally, a task with a non-readable executable has - // additional constraints on access via ptrace and procfs. - Read: true, - Execute: true, + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + opts := vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, } - return d.Inode.CheckPermission(ctx, perms) + return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal) } // checkIsRegularFile prevents us from trying to execute a directory, pipe, etc. -func checkIsRegularFile(ctx context.Context, d *fs.Dirent, filename string) error { - attr := d.Inode.StableAttr - if !fs.IsRegular(attr) { - ctx.Infof("%s is not regular: %v", filename, attr) - return syserror.EACCES +func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error { + t, err := file.Type(ctx) + if err != nil { + return err } - return nil -} - -// checkPread checks whether we can read the file at arbitrary offsets. -func checkPread(ctx context.Context, f *fs.File, filename string) error { - if !f.Flags().Pread { - ctx.Infof("%s cannot be read at an offset: %+v", filename, f.Flags()) + if t != linux.ModeRegular { + ctx.Infof("%q is not a regular file: %v", filename, t) return syserror.EACCES } return nil @@ -224,8 +149,10 @@ const ( maxLoaderAttempts = 6 ) -// loadExecutable loads an executable that is pointed to by args.File. If nil, -// the path args.Filename is resolved and loaded. If the executable is an +// loadExecutable loads an executable that is pointed to by args.File. The +// caller is responsible for checking that the user can execute this file. +// If nil, the path args.Filename is resolved and loaded (check that the user +// can execute this file is done here in this case). If the executable is an // interpreter script rather than an ELF, the binary of the corresponding // interpreter will be loaded. // @@ -234,37 +161,27 @@ const ( // * arch.Context matching the binary arch // * fs.Dirent of the binary file // * Possibly updated args.Argv -func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, *fs.Dirent, []string, error) { +func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) { for i := 0; i < maxLoaderAttempts; i++ { - var ( - d *fs.Dirent - err error - ) if args.File == nil { - d, args.File, err = openPath(ctx, args) - // We will return d in the successful case, but defer a DecRef for the - // sake of intermediate loops and failure cases. - if d != nil { - defer d.DecRef() - } - if args.File != nil { - defer args.File.DecRef() + var err error + args.File, err = openPath(ctx, args) + if err != nil { + ctx.Infof("Error opening %s: %v", args.Filename, err) + return loadedELF{}, nil, nil, nil, err } + // Ensure file is release in case the code loops or errors out. + defer args.File.DecRef() } else { - d = args.File.Dirent - d.IncRef() - defer d.DecRef() - err = checkFile(ctx, args.File, args.Filename) - } - if err != nil { - ctx.Infof("Error opening %s: %v", args.Filename, err) - return loadedELF{}, nil, nil, nil, err + if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { + return loadedELF{}, nil, nil, nil, err + } } // Check the header. Is this an ELF or interpreter script? var hdr [4]uint8 // N.B. We assume that reading from a regular file cannot block. - _, err = readFull(ctx, args.File, usermem.BytesIOSequence(hdr[:]), 0) + _, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0) // Allow unexpected EOF, as a valid executable could be only three bytes // (e.g., #!a). if err != nil && err != io.ErrUnexpectedEOF { @@ -281,9 +198,10 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context ctx.Infof("Error loading ELF: %v", err) return loadedELF{}, nil, nil, nil, err } - // An ELF is always terminal. Hold on to d. - d.IncRef() - return loaded, ac, d, args.Argv, err + // An ELF is always terminal. Hold on to file. + args.File.IncRef() + return loaded, ac, args.File, args.Argv, err + case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)): if args.CloseOnExec { return loadedELF{}, nil, nil, nil, syserror.ENOENT @@ -295,6 +213,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context } // Refresh the traversal limit for the interpreter. *args.RemainingTraversals = linux.MaxSymlinkTraversals + default: ctx.Infof("Unknown magic: %v", hdr) return loadedELF{}, nil, nil, nil, syserror.ENOEXEC @@ -317,11 +236,11 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context // * Load is called on the Task goroutine. func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) { // Load the executable itself. - loaded, ac, d, newArgv, err := loadExecutable(ctx, args) + loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer d.DecRef() + defer file.DecRef() // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -390,7 +309,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(d) + m.SetExecutable(file) ac.SetIP(uintptr(loaded.entry)) ac.SetStack(uintptr(stack.Bottom)) diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 52f446ed7..161b28c2c 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/anon" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -69,6 +70,8 @@ type byteReader struct { var _ fs.FileOperations = (*byteReader)(nil) // newByteReaderFile creates a fake file to read data from. +// +// TODO(gvisor.dev/issue/1623): Convert to VFS2. func newByteReaderFile(ctx context.Context, data []byte) *fs.File { // Create a fake inode. inode := fs.NewInode( @@ -123,7 +126,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq // * PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. -func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) { +func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) @@ -221,7 +224,7 @@ type VDSO struct { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) { - vdsoFile := newByteReaderFile(ctx, vdsoBin) + vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin)) // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index e5729ced5..73591dab7 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -105,8 +105,8 @@ go_library( "//pkg/safecopy", "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/fs", "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/futex", "//pkg/sentry/kernel/shm", diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index f550acae0..6a49334f4 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -16,7 +16,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" ) @@ -132,7 +132,7 @@ func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { // // An additional reference will be taken in the case of a non-nil executable, // which must be released by the caller. -func (mm *MemoryManager) Executable() *fs.Dirent { +func (mm *MemoryManager) Executable() fsbridge.File { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() @@ -147,15 +147,15 @@ func (mm *MemoryManager) Executable() *fs.Dirent { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(d *fs.Dirent) { +func (mm *MemoryManager) SetExecutable(file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. - d.IncRef() + file.IncRef() // Set the executable. orig := mm.executable - mm.executable = d + mm.executable = file mm.metadataMu.Unlock() diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 09e582dd3..637383c7a 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -37,7 +37,7 @@ package mm import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" @@ -215,7 +215,7 @@ type MemoryManager struct { // is not nil, it holds a reference on the Dirent. // // executable is protected by metadataMu. - executable *fs.Dirent + executable fsbridge.File // dumpability describes if and how this MemoryManager may be dumped to // userspace. diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index a796b2396..46cb2a1cc 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -141,6 +141,10 @@ func path(t *kernel.Task, addr usermem.Addr) string { } func fd(t *kernel.Task, fd int32) string { + if kernel.VFS2Enabled { + return fdVFS2(t, fd) + } + root := t.FSContext().RootDirectory() if root != nil { defer root.DecRef() @@ -169,6 +173,30 @@ func fd(t *kernel.Task, fd int32) string { return fmt.Sprintf("%#x %s", fd, name) } +func fdVFS2(t *kernel.Task, fd int32) string { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + + vfsObj := root.Mount().Filesystem().VirtualFilesystem() + if fd == linux.AT_FDCWD { + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, wd) + return fmt.Sprintf("AT_FDCWD %s", name) + } + + file := t.GetFileVFS2(fd) + if file == nil { + // Cast FD to uint64 to avoid printing negative hex. + return fmt.Sprintf("%#x (bad FD)", uint64(fd)) + } + defer file.DecRef() + + name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) + return fmt.Sprintf("%#x %s", fd, name) +} + func fdpair(t *kernel.Task, addr usermem.Addr) string { var fds [2]int32 _, err := t.CopyIn(addr, &fds) diff --git a/pkg/sentry/syscalls/linux/BUILD b/pkg/sentry/syscalls/linux/BUILD index be16ee686..0d24fd3c4 100644 --- a/pkg/sentry/syscalls/linux/BUILD +++ b/pkg/sentry/syscalls/linux/BUILD @@ -74,6 +74,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/epoll", diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index 98db32d77..9c6728530 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -135,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(file.Dirent) + t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 0c9e2255d..00915fdde 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" @@ -119,7 +120,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user defer root.DecRef() var wd *fs.Dirent - var executable *fs.File + var executable fsbridge.File var closeOnExec bool if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { // Even if the pathname is absolute, we may still need the wd @@ -136,7 +137,15 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { - executable = f + // TODO(gvisor.dev/issue/160): Linux requires only execute permission, + // not read. However, our backing filesystems may prevent us from reading + // the file without read permission. Additionally, a task with a + // non-readable executable has additional constraints on access via + // ptrace and procfs. + if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { + return 0, nil, err + } + executable = fsbridge.NewFSFile(f) } else { wd = f.Dirent wd.IncRef() @@ -152,9 +161,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user // Load the new TaskContext. remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ - Mounts: t.MountNamespace(), - Root: root, - WorkingDirectory: wd, + Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), RemainingTraversals: &remainingTraversals, ResolveFinal: resolveFinal, Filename: pathname, diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index c134714ee..e0ac32b33 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -22,4 +22,110 @@ import ( // Override syscall table to add syscalls implementations from this package. func Override(table map[uintptr]kernel.Syscall) { table[0] = syscalls.Supported("read", Read) + + // Remove syscalls that haven't been converted yet. It's better to get ENOSYS + // rather than a SIGSEGV deep in the stack. + delete(table, 1) // write + delete(table, 2) // open + delete(table, 3) // close + delete(table, 4) // stat + delete(table, 5) // fstat + delete(table, 6) // lstat + delete(table, 7) // poll + delete(table, 8) // lseek + delete(table, 9) // mmap + delete(table, 16) // ioctl + delete(table, 17) // pread64 + delete(table, 18) // pwrite64 + delete(table, 19) // readv + delete(table, 20) // writev + delete(table, 21) // access + delete(table, 22) // pipe + delete(table, 32) // dup + delete(table, 33) // dup2 + delete(table, 40) // sendfile + delete(table, 59) // execve + delete(table, 72) // fcntl + delete(table, 73) // flock + delete(table, 74) // fsync + delete(table, 75) // fdatasync + delete(table, 76) // truncate + delete(table, 77) // ftruncate + delete(table, 78) // getdents + delete(table, 79) // getcwd + delete(table, 80) // chdir + delete(table, 81) // fchdir + delete(table, 82) // rename + delete(table, 83) // mkdir + delete(table, 84) // rmdir + delete(table, 85) // creat + delete(table, 86) // link + delete(table, 87) // unlink + delete(table, 88) // symlink + delete(table, 89) // readlink + delete(table, 90) // chmod + delete(table, 91) // fchmod + delete(table, 92) // chown + delete(table, 93) // fchown + delete(table, 94) // lchown + delete(table, 133) // mknod + delete(table, 137) // statfs + delete(table, 138) // fstatfs + delete(table, 161) // chroot + delete(table, 162) // sync + delete(table, 165) // mount + delete(table, 166) // umount2 + delete(table, 172) // iopl + delete(table, 173) // ioperm + delete(table, 187) // readahead + delete(table, 188) // setxattr + delete(table, 189) // lsetxattr + delete(table, 190) // fsetxattr + delete(table, 191) // getxattr + delete(table, 192) // lgetxattr + delete(table, 193) // fgetxattr + delete(table, 206) // io_setup + delete(table, 207) // io_destroy + delete(table, 208) // io_getevents + delete(table, 209) // io_submit + delete(table, 210) // io_cancel + delete(table, 213) // epoll_create + delete(table, 214) // epoll_ctl_old + delete(table, 215) // epoll_wait_old + delete(table, 216) // remap_file_pages + delete(table, 217) // getdents64 + delete(table, 232) // epoll_wait + delete(table, 233) // epoll_ctl + delete(table, 253) // inotify_init + delete(table, 254) // inotify_add_watch + delete(table, 255) // inotify_rm_watch + delete(table, 257) // openat + delete(table, 258) // mkdirat + delete(table, 259) // mknodat + delete(table, 260) // fchownat + delete(table, 261) // futimesat + delete(table, 262) // fstatat + delete(table, 263) // unlinkat + delete(table, 264) // renameat + delete(table, 265) // linkat + delete(table, 266) // symlinkat + delete(table, 267) // readlinkat + delete(table, 268) // fchmodat + delete(table, 269) // faccessat + delete(table, 270) // pselect + delete(table, 271) // ppoll + delete(table, 285) // fallocate + delete(table, 291) // epoll_create1 + delete(table, 292) // dup3 + delete(table, 293) // pipe2 + delete(table, 294) // inotify_init1 + delete(table, 295) // preadv + delete(table, 296) // pwritev + delete(table, 306) // syncfs + delete(table, 316) // renameat2 + delete(table, 319) // memfd_create + delete(table, 322) // execveat + delete(table, 327) // preadv2 + delete(table, 328) // pwritev2 + delete(table, 332) // statx } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 14b39eb9d..0b4f18ab5 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index d97362b9a..82781e6d3 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -29,9 +29,10 @@ const ( CtxRoot ) -// MountNamespaceFromContext returns the MountNamespace used by ctx. It does -// not take a reference on the returned MountNamespace. If ctx is not -// associated with a MountNamespace, MountNamespaceFromContext returns nil. +// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is +// not associated with a MountNamespace, MountNamespaceFromContext returns nil. +// +// A reference is taken on the returned MountNamespace. func MountNamespaceFromContext(ctx context.Context) *MountNamespace { if v := ctx.Value(CtxMountNamespace); v != nil { return v.(*MountNamespace) diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1fbb420f9..ad2c9fcf4 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -114,6 +114,7 @@ type MountNamespace struct { func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) { rft := vfs.getFilesystemType(fsTypeName) if rft == nil { + ctx.Warningf("Unknown filesystem: %s", fsTypeName) return nil, syserror.ENODEV } fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts) @@ -231,9 +232,12 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti return syserror.EINVAL } vfs.mountMu.Lock() - if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns { - vfs.mountMu.Unlock() - return syserror.EINVAL + if mntns := MountNamespaceFromContext(ctx); mntns != nil { + defer mntns.DecRef() + if mntns != vd.mount.ns { + vfs.mountMu.Unlock() + return syserror.EINVAL + } } // TODO(jamieliu): Linux special-cases umount of the caller's root, which diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index fdf8be157..6af7fdac1 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -61,7 +61,7 @@ type MountOptions struct { type OpenOptions struct { // Flags contains access mode and flags as specified for open(2). // - // FilesystemImpls is reponsible for implementing the following flags: + // FilesystemImpls are responsible for implementing the following flags: // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9629afee9..51deae313 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -393,7 +393,8 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // be executed. return nil, syserror.EACCES } - if linux.FileMode(stat.Mode).FileType() != linux.ModeRegular { + if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular { + ctx.Infof("%q is not a regular file: %v", pop.Path, t) return nil, syserror.EACCES } } @@ -743,6 +744,8 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { // VirtualDentry methods require that a reference is held on the VirtualDentry. // // VirtualDentry is analogous to Linux's struct path. +// +// +stateify savable type VirtualDentry struct { mount *Mount dentry *Dentry diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9f0d5d7af..239ca5302 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -795,16 +795,19 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { return 0, fmt.Errorf("container %q not started", args.ContainerID) } + // TODO(gvisor.dev/issue/1623): Add VFS2 support + // Get the container MountNamespace from the Task. tg.Leader().WithMuLocked(func(t *kernel.Task) { - // task.MountNamespace() does not take a ref, so we must do so - // ourselves. + // task.MountNamespace() does not take a ref, so we must do so ourselves. args.MountNamespace = t.MountNamespace() args.MountNamespace.IncRef() }) - defer args.MountNamespace.DecRef() + if args.MountNamespace != nil { + defer args.MountNamespace.DecRef() + } - // Add the HOME enviroment varible if it is not already set. + // Add the HOME environment variable if it is not already set. root := args.MountNamespace.Root() defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) -- cgit v1.2.3 From e4c7f3e6f6c19f3259820a4c41b69e85c0454379 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 13:39:51 -0800 Subject: Inline vfs.VirtualFilesystem in Kernel struct This saves one pointer dereference per VFS access. Updates #1623 PiperOrigin-RevId: 295216176 --- pkg/sentry/control/proc.go | 2 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 5 +++- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 5 +++- pkg/sentry/fsimpl/ext/ext_test.go | 5 +++- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 5 +++- pkg/sentry/fsimpl/proc/tasks_test.go | 6 ++-- pkg/sentry/fsimpl/sys/sys_test.go | 6 ++-- pkg/sentry/fsimpl/testutil/kernel.go | 7 +++-- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 10 +++++-- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 5 +++- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 6 +++- pkg/sentry/kernel/kernel.go | 9 ++++-- pkg/sentry/vfs/dentry.go | 4 ++- pkg/sentry/vfs/device.go | 3 ++ pkg/sentry/vfs/file_description_impl_util_test.go | 10 +++++-- pkg/sentry/vfs/filesystem.go | 2 ++ pkg/sentry/vfs/filesystem_type.go | 1 + pkg/sentry/vfs/mount.go | 4 +++ pkg/sentry/vfs/mount_unsafe.go | 8 ++++-- pkg/sentry/vfs/vfs.go | 35 +++++++++++------------ 20 files changed, 94 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 8973754c8..5457ba5e7 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -199,7 +199,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } paths := fs.GetPath(initArgs.Envv) - vfsObj := proc.Kernel.VFS + vfsObj := proc.Kernel.VFS() file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index 73308a2b5..b6d52c015 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -29,7 +29,10 @@ func TestDevtmpfs(t *testing.T) { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } // Register tmpfs just so that we can have a root filesystem that isn't // devtmpfs. vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 2015a8871..89caee3df 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -52,7 +52,10 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, nil, nil, nil, err + } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 05f992826..ef6127f3c 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,10 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 96a16e654..0459fb305 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -45,7 +45,10 @@ type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) - v := vfs.New() + v := &vfs.VirtualFilesystem{} + if err := v.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 96c72cbc9..c5d531fe0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -90,7 +90,7 @@ func setup(t *testing.T) *testutil.System { ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - k.VFS.MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) fsOpts := vfs.GetFilesystemOptions{ @@ -101,11 +101,11 @@ func setup(t *testing.T) *testutil.System { }, }, } - mntns, err := k.VFS.NewMountNamespace(ctx, creds, "", Name, &fsOpts) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } - return testutil.NewSystem(ctx, t, k.VFS, mntns) + return testutil.NewSystem(ctx, t, k.VFS(), mntns) } func TestTasksEmpty(t *testing.T) { diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 5d1ba5867..4b3602d47 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -34,15 +34,15 @@ func newTestSystem(t *testing.T) *testutil.System { } ctx := k.SupervisorContext() creds := auth.CredentialsFromContext(ctx) - k.VFS.MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - mns, err := k.VFS.NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) + mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("Failed to create new mount namespace: %v", err) } - return testutil.NewSystem(ctx, t, k.VFS, mns) + return testutil.NewSystem(ctx, t, k.VFS(), mns) } func TestReadCPUFile(t *testing.T) { diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index a91b3ec4d..d0be32e72 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -102,12 +102,13 @@ func Boot() (*kernel.Kernel, error) { kernel.VFS2Enabled = true - vfsObj := vfs.New() - vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + if err := k.VFS().Init(); err != nil { + return nil, fmt.Errorf("VFS init: %v", err) + } + k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - k.VFS = vfsObj ls, err := limits.NewLinuxLimitSet() if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 9fce5e4b4..383133e44 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -175,7 +175,10 @@ func BenchmarkVFS2MemfsStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) @@ -366,7 +369,10 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) { creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + b.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 5ee7f2a72..1614f2c39 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -151,7 +151,10 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy creds := auth.CredentialsFromContext(ctx) // Create VFS. - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index e9f71e334..0399725cf 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -40,7 +40,11 @@ var nextFileID int64 func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { creds := auth.CredentialsFromContext(ctx) - vfsObj := vfs.New() + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2665f057c..ea21af33f 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -246,7 +246,7 @@ type Kernel struct { SpecialOpts // VFS keeps the filesystem state used across the kernel. - VFS *vfs.VirtualFilesystem + vfs vfs.VirtualFilesystem } // InitKernelArgs holds arguments to Init. @@ -815,7 +815,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, FollowFinalSymlink: true, } var err error - wd, err = k.VFS.GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ + wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { @@ -1506,3 +1506,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { Registers: t.Arch().StateData().Proto(), }) } + +// VFS returns the virtual filesystem for the kernel. +func (k *Kernel) VFS() *vfs.VirtualFilesystem { + return &k.vfs +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 486a76475..35b208721 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -71,6 +71,8 @@ import ( // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard // Dentries with zero references. +// +// +stateify savable type Dentry struct { // parent is this Dentry's parent in this Filesystem. If this Dentry is // independent, parent is nil. @@ -89,7 +91,7 @@ type Dentry struct { children map[string]*Dentry // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex + mu sync.Mutex `state:"nosave"` // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 3af2aa58d..bda5576fa 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -56,6 +56,7 @@ type Device interface { Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error) } +// +stateify savable type registeredDevice struct { dev Device opts RegisterDeviceOptions @@ -63,6 +64,8 @@ type registeredDevice struct { // RegisterDeviceOptions contains options to // VirtualFilesystem.RegisterDevice(). +// +// +stateify savable type RegisterDeviceOptions struct { // GroupName is the name shown for this device registration in // /proc/devices. If GroupName is empty, this registration will not be diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 8fa26418e..3a75d4d62 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -107,7 +107,10 @@ func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) defer fd.DecRef() @@ -162,7 +165,10 @@ func TestGenCountFD(t *testing.T) { func TestWritable(t *testing.T) { ctx := contexttest.Context(t) - vfsObj := New() // vfs.New() + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) defer fd.DecRef() diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a06a6caf3..556976d0b 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -29,6 +29,8 @@ import ( // Filesystem methods require that a reference is held. // // Filesystem is analogous to Linux's struct super_block. +// +// +stateify savable type Filesystem struct { // refs is the reference count. refs is accessed using atomic memory // operations. diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c58b70728..bb9cada81 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -44,6 +44,7 @@ type GetFilesystemOptions struct { InternalData interface{} } +// +stateify savable type registeredFilesystemType struct { fsType FilesystemType opts RegisterFilesystemTypeOptions diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index ad2c9fcf4..9912df799 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -38,6 +38,8 @@ import ( // // Mount is analogous to Linux's struct mount. (gVisor does not distinguish // between struct mount and struct vfsmount.) +// +// +stateify savable type Mount struct { // vfs, fs, and root are immutable. References are held on fs and root. // @@ -85,6 +87,8 @@ type Mount struct { // MountNamespace methods require that a reference is held. // // MountNamespace is analogous to Linux's struct mnt_namespace. +// +// +stateify savable type MountNamespace struct { // root is the MountNamespace's root mount. root is immutable. root *Mount diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bd90d36c4..1fe766a44 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -64,6 +64,8 @@ func (mnt *Mount) storeKey(vd VirtualDentry) { // (provided mutation is sufficiently uncommon). // // mountTable.Init() must be called on new mountTables before use. +// +// +stateify savable type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion @@ -75,8 +77,8 @@ type mountTable struct { // intrinsics and inline assembly, limiting the performance of this // approach.) - seq sync.SeqCount - seed uint32 // for hashing keys + seq sync.SeqCount `state:"nosave"` + seed uint32 // for hashing keys // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in @@ -89,7 +91,7 @@ type mountTable struct { // length and cap in separate uint32s) for ~free. size uint64 - slots unsafe.Pointer // []mountSlot; never nil after Init + slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } type mountSlot struct { diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 51deae313..8f29031b2 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -46,11 +46,13 @@ import ( // // There is no analogue to the VirtualFilesystem type in Linux, as the // equivalent state in Linux is global. +// +// +stateify savable type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. - mountMu sync.Mutex + mountMu sync.Mutex `state:"nosave"` // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly @@ -89,44 +91,42 @@ type VirtualFilesystem struct { // devices contains all registered Devices. devices is protected by // devicesMu. - devicesMu sync.RWMutex + devicesMu sync.RWMutex `state:"nosave"` devices map[devTuple]*registeredDevice // anonBlockDevMinor contains all allocated anonymous block device minor // numbers. anonBlockDevMinorNext is a lower bound for the smallest // unallocated anonymous block device number. anonBlockDevMinorNext and // anonBlockDevMinor are protected by anonBlockDevMinorMu. - anonBlockDevMinorMu sync.Mutex + anonBlockDevMinorMu sync.Mutex `state:"nosave"` anonBlockDevMinorNext uint32 anonBlockDevMinor map[uint32]struct{} // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. - fsTypesMu sync.RWMutex + fsTypesMu sync.RWMutex `state:"nosave"` fsTypes map[string]*registeredFilesystemType // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. - filesystemsMu sync.Mutex + filesystemsMu sync.Mutex `state:"nosave"` filesystems map[*Filesystem]struct{} } -// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. -func New() *VirtualFilesystem { - vfs := &VirtualFilesystem{ - mountpoints: make(map[*Dentry]map[*Mount]struct{}), - devices: make(map[devTuple]*registeredDevice), - anonBlockDevMinorNext: 1, - anonBlockDevMinor: make(map[uint32]struct{}), - fsTypes: make(map[string]*registeredFilesystemType), - filesystems: make(map[*Filesystem]struct{}), - } +// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. +func (vfs *VirtualFilesystem) Init() error { + vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) + vfs.devices = make(map[devTuple]*registeredDevice) + vfs.anonBlockDevMinorNext = 1 + vfs.anonBlockDevMinor = make(map[uint32]struct{}) + vfs.fsTypes = make(map[string]*registeredFilesystemType) + vfs.filesystems = make(map[*Filesystem]struct{}) vfs.mounts.Init() // Construct vfs.anonMount. anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() if err != nil { - panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err)) + return err } anonfs := anonFilesystem{ devMinor: anonfsDevMinor, @@ -137,8 +137,7 @@ func New() *VirtualFilesystem { fs: &anonfs.vfsfs, refs: 1, } - - return vfs + return nil } // PathOperation specifies the path operated on by a VFS method. -- cgit v1.2.3 From 3557b2665198b57c04924ad4be8dbf9e42cedf71 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Fri, 14 Feb 2020 14:39:40 -0800 Subject: Allow vfs.IterDirentsCallback.Handle() to return an error. This is easier than storing errors from e.g. CopyOut in the callback. PiperOrigin-RevId: 295230021 --- pkg/sentry/fsimpl/ext/directory.go | 6 +++--- pkg/sentry/fsimpl/ext/ext_test.go | 4 ++-- pkg/sentry/fsimpl/gofer/directory.go | 4 ++-- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 12 ++++++------ pkg/sentry/fsimpl/proc/subtasks.go | 4 ++-- pkg/sentry/fsimpl/proc/tasks.go | 12 ++++++------ pkg/sentry/fsimpl/testutil/testutil.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/directory.go | 18 +++++++++--------- pkg/sentry/vfs/file_description.go | 10 +++++----- 9 files changed, 37 insertions(+), 37 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index ebb72b75e..bd6ede995 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -188,14 +188,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba childType = fs.ToInodeType(childInode.diskInode.Mode().FileType()) } - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: child.diskDirent.FileName(), Type: fs.ToDirentType(childType), Ino: uint64(child.diskDirent.Inode()), NextOff: fd.off + 1, - }) { + }); err != nil { dir.childList.InsertBefore(child, fd.iter) - return nil + return err } fd.off++ } diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index ef6127f3c..29bb73765 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -499,9 +499,9 @@ func newIterDirentCb() *iterDirentsCb { } // Handle implements vfs.IterDirentsCallback.Handle. -func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool { +func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error { cb.dirents = append(cb.dirents, dirent) - return true + return nil } // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality. diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 6d4ebc2bf..5dbfc6250 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -65,8 +65,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba } for fd.off < int64(len(fd.dirents)) { - if !cb.Handle(fd.dirents[fd.off]) { - return nil + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err } fd.off++ } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index eda781155..5650512e0 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -116,8 +116,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent Ino: stat.Ino, NextOff: 1, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } @@ -132,8 +132,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent Ino: stat.Ino, NextOff: 2, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } @@ -153,8 +153,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent Ino: stat.Ino, NextOff: fd.off + 1, } - if !cb.Handle(dirent) { - return nil + if err := cb.Handle(dirent); err != nil { + return err } fd.off++ } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 102af0e93..f3f4e49b4 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -105,8 +105,8 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index ebe21630c..ce08a7d53 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -151,8 +151,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -163,8 +163,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: offset + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } @@ -196,8 +196,8 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback Ino: i.inoGen.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } - if !cb.Handle(dirent) { - return offset, nil + if err := cb.Handle(dirent); err != nil { + return offset, err } offset++ } diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index b97e3534a..e16808c63 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -226,7 +226,7 @@ func (d *DirentCollector) SkipDotsChecks(value bool) { } // Handle implements vfs.IterDirentsCallback.Handle. -func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { +func (d *DirentCollector) Handle(dirent vfs.Dirent) error { d.mu.Lock() if d.dirents == nil { d.dirents = make(map[string]*vfs.Dirent) @@ -234,7 +234,7 @@ func (d *DirentCollector) Handle(dirent vfs.Dirent) bool { d.order = append(d.order, &dirent) d.dirents[dirent.Name] = &dirent d.mu.Unlock() - return true + return nil } // Count returns the number of dirents currently in the collector. diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index dc0d27cf9..b4380af38 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -74,25 +74,25 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba defer fs.mu.Unlock() if fd.off == 0 { - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: vfsd.Impl().(*dentry).inode.ino, NextOff: 1, - }) { - return nil + }); err != nil { + return err } fd.off++ } if fd.off == 1 { parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), Ino: parentInode.ino, NextOff: 2, - }) { - return nil + }); err != nil { + return err } fd.off++ } @@ -111,14 +111,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba for child != nil { // Skip other directoryFD iterators. if child.inode != nil { - if !cb.Handle(vfs.Dirent{ + if err := cb.Handle(vfs.Dirent{ Name: child.vfsd.Name(), Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, - }) { + }); err != nil { dir.childList.InsertBefore(child, fd.iter) - return nil + return err } fd.off++ } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5bac660c7..9a1ad630c 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -435,11 +435,11 @@ type Dirent struct { // IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. type IterDirentsCallback interface { - // Handle handles the given iterated Dirent. It returns true if iteration - // should continue, and false if FileDescriptionImpl.IterDirents should - // terminate now and restart with the same Dirent the next time it is - // called. - Handle(dirent Dirent) bool + // Handle handles the given iterated Dirent. If Handle returns a non-nil + // error, FileDescriptionImpl.IterDirents must stop iteration and return + // the error; the next call to FileDescriptionImpl.IterDirents should + // restart with the same Dirent. + Handle(dirent Dirent) error } // OnClose is called when a file descriptor representing the FileDescription is -- cgit v1.2.3 From 10ed60e4778e01a2813eea6e3c201826f75982a5 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 20 Feb 2020 09:57:06 -0800 Subject: VFS2: Support memory mapping in tmpfs. tmpfs.fileDescription now implements ConfigureMMap. And tmpfs.regularFile implement memmap.Mappable. The methods are mostly unchanged from VFS1 tmpfs. PiperOrigin-RevId: 296234557 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 8 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 233 +++++++++++++++++++++++++++----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 16 ++- 3 files changed, 213 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 7f7b791c4..e1b551422 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -16,7 +16,6 @@ package tmpfs import ( "fmt" - "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -347,10 +346,9 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, err } if opts.Flags&linux.O_TRUNC != 0 { - impl.mu.Lock() - impl.data.Truncate(0, impl.memFile) - atomic.StoreUint64(&impl.size, 0) - impl.mu.Unlock() + if _, err := impl.truncate(0); err != nil { + return nil, err + } } return &fd.vfsfd, nil case *directory: diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index dab346a41..711442424 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -15,6 +15,7 @@ package tmpfs import ( + "fmt" "io" "math" "sync/atomic" @@ -22,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -34,25 +36,53 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// regularFile is a regular (=S_IFREG) tmpfs file. type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile - // mu protects the fields below. - mu sync.RWMutex + // mapsMu protects mappings. + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the file into memmap.MappingSpaces. + // + // Protected by mapsMu. + mappings memmap.MappingSet + + // writableMappingPages tracks how many pages of virtual memory are mapped + // as potentially writable from this file. If a page has multiple mappings, + // each mapping is counted separately. + // + // This counter is susceptible to overflow as we can potentially count + // mappings from many VMAs. We count pages rather than bytes to slightly + // mitigate this. + // + // Protected by mapsMu. + writableMappingPages uint64 + + // dataMu protects the fields below. + dataMu sync.RWMutex // data maps offsets into the file to offsets into memFile that store // the file's data. + // + // Protected by dataMu. data fsutil.FileRangeSet - // size is the size of data, but accessed using atomic memory - // operations to avoid locking in inode.stat(). - size uint64 - // seals represents file seals on this inode. + // + // Protected by dataMu. seals uint32 + + // size is the size of data. + // + // Protected by both dataMu and inode.mu; reading it requires holding + // either mutex, while writing requires holding both AND using atomics. + // Readers that do not require consistency (like Stat) may read the + // value atomically without holding either lock. + size uint64 } func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { @@ -66,39 +96,170 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. -func (rf *regularFile) truncate(size uint64) (bool, error) { - rf.mu.Lock() - defer rf.mu.Unlock() +func (rf *regularFile) truncate(newSize uint64) (bool, error) { + rf.inode.mu.Lock() + defer rf.inode.mu.Unlock() + return rf.truncateLocked(newSize) +} - if size == rf.size { +// Preconditions: rf.inode.mu must be held. +func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { + oldSize := rf.size + if newSize == oldSize { // Nothing to do. return false, nil } - if size > rf.size { - // Growing the file. + // Need to hold inode.mu and dataMu while modifying size. + rf.dataMu.Lock() + if newSize > oldSize { + // Can we grow the file? if rf.seals&linux.F_SEAL_GROW != 0 { - // Seal does not allow growth. + rf.dataMu.Unlock() return false, syserror.EPERM } - rf.size = size + // We only need to update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() return true, nil } - // Shrinking the file + // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { - // Seal does not allow shrink. + rf.dataMu.Unlock() return false, syserror.EPERM } - // TODO(gvisor.dev/issues/1197): Invalidate mappings once we have - // mappings. + // Update the file size. + atomic.StoreUint64(&rf.size, newSize) + rf.dataMu.Unlock() + + // Invalidate past translations of truncated pages. + oldpgend := fs.OffsetPageEnd(int64(oldSize)) + newpgend := fs.OffsetPageEnd(int64(newSize)) + if newpgend < oldpgend { + rf.mapsMu.Lock() + rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/shmem.c:shmem_setattr() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + rf.mapsMu.Unlock() + } - rf.data.Truncate(size, rf.memFile) - rf.size = size + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them. + rf.dataMu.Lock() + rf.data.Truncate(newSize, rf.memFile) + rf.dataMu.Unlock() return true, nil } +// AddMapping implements memmap.Mappable.AddMapping. +func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + // Reject writable mapping if F_SEAL_WRITE is set. + if rf.seals&linux.F_SEAL_WRITE != 0 && writable { + return syserror.EPERM + } + + rf.mappings.AddMapping(ms, ar, offset, writable) + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages < pagesBefore { + panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } + + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + + rf.mappings.RemoveMapping(ms, ar, offset, writable) + + if writable { + pagesBefore := rf.writableMappingPages + + // ar is guaranteed to be page aligned per memmap.Mappable. + rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize) + + if rf.writableMappingPages > pagesBefore { + panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) + } + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return rf.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + rf.dataMu.Lock() + defer rf.dataMu.Unlock() + + // Constrain translations to f.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(int64(rf.size)) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + // Newly-allocated pages are zeroed, so we don't need to do anything. + return dsts.NumBytes(), nil + }) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: rf.memFile, + Offset: seg.FileRangeOf(segMR).Start, + Perms: usermem.AnyAccess, + }) + translatedEnd = segMR.End + } + + // Don't return the error returned by f.data.Fill if it occurred outside of + // required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (*regularFile) InvalidateUnsavable(context.Context) error { + return nil +} + type regularFileFD struct { fileDescription @@ -152,8 +313,10 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Overflow. return 0, syserror.EFBIG } + f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) + f.inode.mu.Unlock() putRegularFileReadWriter(rw) return n, err } @@ -215,6 +378,12 @@ func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng return nil } +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + file := fd.inode().impl.(*regularFile) + return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts) +} + // regularFileReadWriter implements safemem.Reader and Safemem.Writer. type regularFileReadWriter struct { file *regularFile @@ -244,14 +413,15 @@ func putRegularFileReadWriter(rw *regularFileReadWriter) { // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - rw.file.mu.RLock() + rw.file.dataMu.RLock() + defer rw.file.dataMu.RUnlock() + size := rw.file.size // Compute the range to read (limited by file size and overflow-checked). - if rw.off >= rw.file.size { - rw.file.mu.RUnlock() + if rw.off >= size { return 0, io.EOF } - end := rw.file.size + end := size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } @@ -265,7 +435,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er // Get internal mappings. ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -275,7 +444,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -291,7 +459,6 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { - rw.file.mu.RUnlock() return done, err } @@ -299,13 +466,16 @@ func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, er seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } - rw.file.mu.RUnlock() return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: inode.mu must be held. func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - rw.file.mu.Lock() + // Hold dataMu so we can modify size. + rw.file.dataMu.Lock() + defer rw.file.dataMu.Unlock() // Compute the range to write (overflow-checked). end := rw.off + srcs.NumBytes() @@ -316,7 +486,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, // Check if seals prevent either file growth or all writes. switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed - rw.file.mu.Unlock() return 0, syserror.EPERM case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would @@ -338,7 +507,6 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, } if end <= rw.off { // Truncation would result in no data being written. - rw.file.mu.Unlock() return 0, syserror.EPERM } } @@ -395,9 +563,8 @@ exitLoop: // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.off > rw.file.size { - atomic.StoreUint64(&rw.file.size, rw.off) + rw.file.size = rw.off } - rw.file.mu.Unlock() return done, retErr } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index c5bb17562..521206305 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -18,9 +18,10 @@ // Lock order: // // filesystem.mu -// regularFileFD.offMu -// regularFile.mu // inode.mu +// regularFileFD.offMu +// regularFile.mapsMu +// regularFile.dataMu package tmpfs import ( @@ -226,12 +227,15 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - // This is unnecessary; it's mostly to simulate what tmpfs would do. if regFile, ok := i.impl.(*regularFile); ok { - regFile.mu.Lock() + // Hold inode.mu and regFile.dataMu while mutating + // size. + i.mu.Lock() + regFile.dataMu.Lock() regFile.data.DropAll(regFile.memFile) atomic.StoreUint64(®File.size, 0) - regFile.mu.Unlock() + regFile.dataMu.Unlock() + i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") @@ -320,7 +324,7 @@ func (i *inode) setStat(stat linux.Statx) error { if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: - updated, err := impl.truncate(stat.Size) + updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } -- cgit v1.2.3 From 4a73bae269ae9f52a962ae3b08a17ccaacf7ba80 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Thu, 20 Feb 2020 15:19:40 -0800 Subject: Initial network namespace support. TCP/IP will work with netstack networking. hostinet doesn't work, and sockets will have the same behavior as it is now. Before the userspace is able to create device, the default loopback device can be used to test. /proc/net and /sys/net will still be connected to the root network stack; this is the same behavior now. Issue #1833 PiperOrigin-RevId: 296309389 --- pkg/sentry/fs/proc/net.go | 5 +- pkg/sentry/fs/proc/sys_net.go | 4 +- pkg/sentry/fsimpl/proc/tasks_net.go | 5 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 4 +- pkg/sentry/fsimpl/testutil/kernel.go | 1 + pkg/sentry/inet/BUILD | 1 + pkg/sentry/inet/namespace.go | 99 +++++++++++++++++++++++++ pkg/sentry/kernel/kernel.go | 26 ++++--- pkg/sentry/kernel/task.go | 9 +-- pkg/sentry/kernel/task_clone.go | 16 ++-- pkg/sentry/kernel/task_net.go | 19 +++-- pkg/sentry/kernel/task_start.go | 8 +- pkg/tcpip/time_unsafe.go | 2 + runsc/boot/BUILD | 2 +- runsc/boot/controller.go | 11 +-- runsc/boot/loader.go | 121 +++++++++++++++++++++---------- runsc/boot/network.go | 27 +++++++ runsc/boot/pprof.go | 18 ----- runsc/boot/pprof/BUILD | 11 +++ runsc/boot/pprof/pprof.go | 20 +++++ runsc/sandbox/network.go | 25 +------ test/syscalls/BUILD | 2 + test/syscalls/linux/BUILD | 17 +++++ test/syscalls/linux/network_namespace.cc | 121 +++++++++++++++++++++++++++++++ 24 files changed, 451 insertions(+), 123 deletions(-) create mode 100644 pkg/sentry/inet/namespace.go delete mode 100644 runsc/boot/pprof.go create mode 100644 runsc/boot/pprof/BUILD create mode 100644 runsc/boot/pprof/pprof.go create mode 100644 test/syscalls/linux/network_namespace.cc (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 6f2775344..95d5817ff 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -43,7 +43,10 @@ import ( // newNet creates a new proc net entry. func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 0772d4ae4..d4c4b533d 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -357,7 +357,9 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { var contents map[string]*fs.Inode - if s := p.k.NetworkStack(); s != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if s := p.k.RootNetworkNamespace().Stack(); s != nil { contents = map[string]*fs.Inode{ "ipv4": p.newSysNetIPv4Dir(ctx, msrc, s), "core": p.newSysNetCore(ctx, msrc, s), diff --git a/pkg/sentry/fsimpl/proc/tasks_net.go b/pkg/sentry/fsimpl/proc/tasks_net.go index 608fec017..d4e1812d8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_net.go +++ b/pkg/sentry/fsimpl/proc/tasks_net.go @@ -39,7 +39,10 @@ import ( func newNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. We should make this per-process, + // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. + if stack := k.RootNetworkNamespace().Stack(); stack != nil { const ( arp = "IP address HW type Flags HW address Mask Device\n" netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index c7ce74883..3d5dc463c 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -50,7 +50,9 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry - if stack := k.NetworkStack(); stack != nil { + // TODO(gvisor.dev/issue/1833): Support for using the network stack in the + // network namespace of the calling process. + if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index d0be32e72..488478e29 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -128,6 +128,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns ThreadGroup: tc, TaskContext: &kernel.TaskContext{Name: name}, Credentials: auth.CredentialsFromContext(ctx), + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), UTSNamespace: kernel.UTSNamespaceFromContext(ctx), IPCNamespace: kernel.IPCNamespaceFromContext(ctx), diff --git a/pkg/sentry/inet/BUILD b/pkg/sentry/inet/BUILD index 334432abf..07bf39fed 100644 --- a/pkg/sentry/inet/BUILD +++ b/pkg/sentry/inet/BUILD @@ -10,6 +10,7 @@ go_library( srcs = [ "context.go", "inet.go", + "namespace.go", "test_stack.go", ], deps = [ diff --git a/pkg/sentry/inet/namespace.go b/pkg/sentry/inet/namespace.go new file mode 100644 index 000000000..c16667e7f --- /dev/null +++ b/pkg/sentry/inet/namespace.go @@ -0,0 +1,99 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package inet + +// Namespace represents a network namespace. See network_namespaces(7). +// +// +stateify savable +type Namespace struct { + // stack is the network stack implementation of this network namespace. + stack Stack `state:"nosave"` + + // creator allows kernel to create new network stack for network namespaces. + // If nil, no networking will function if network is namespaced. + creator NetworkStackCreator + + // isRoot indicates whether this is the root network namespace. + isRoot bool +} + +// NewRootNamespace creates the root network namespace, with creator +// allowing new network namespaces to be created. If creator is nil, no +// networking will function if the network is namespaced. +func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace { + return &Namespace{ + stack: stack, + creator: creator, + isRoot: true, + } +} + +// NewNamespace creates a new network namespace from the root. +func NewNamespace(root *Namespace) *Namespace { + n := &Namespace{ + creator: root.creator, + } + n.init() + return n +} + +// Stack returns the network stack of n. Stack may return nil if no network +// stack is configured. +func (n *Namespace) Stack() Stack { + return n.stack +} + +// IsRoot returns whether n is the root network namespace. +func (n *Namespace) IsRoot() bool { + return n.isRoot +} + +// RestoreRootStack restores the root network namespace with stack. This should +// only be called when restoring kernel. +func (n *Namespace) RestoreRootStack(stack Stack) { + if !n.isRoot { + panic("RestoreRootStack can only be called on root network namespace") + } + if n.stack != nil { + panic("RestoreRootStack called after a stack has already been set") + } + n.stack = stack +} + +func (n *Namespace) init() { + // Root network namespace will have stack assigned later. + if n.isRoot { + return + } + if n.creator != nil { + var err error + n.stack, err = n.creator.CreateStack() + if err != nil { + panic(err) + } + } +} + +// afterLoad is invoked by stateify. +func (n *Namespace) afterLoad() { + n.init() +} + +// NetworkStackCreator allows new instances of a network stack to be created. It +// is used by the kernel to create new network namespaces when requested. +type NetworkStackCreator interface { + // CreateStack creates a new network stack for a network namespace. + CreateStack() (Stack, error) +} diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 7da0368f1..c62fd6eb1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -111,7 +111,7 @@ type Kernel struct { timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace - networkStack inet.Stack `state:"nosave"` + rootNetworkNamespace *inet.Namespace applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry @@ -260,8 +260,9 @@ type InitKernelArgs struct { // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace - // NetworkStack is the TCP/IP network stack. NetworkStack may be nil. - NetworkStack inet.Stack + // RootNetworkNamespace is the root network namespace. If nil, no networking + // will be available. + RootNetworkNamespace *inet.Namespace // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus @@ -320,7 +321,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace - k.networkStack = args.NetworkStack + k.rootNetworkNamespace = args.RootNetworkNamespace + if k.rootNetworkNamespace == nil { + k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) + } k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true @@ -543,8 +547,6 @@ func (ts *TaskSet) unregisterEpollWaiters() { func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error { loadStart := time.Now() - k.networkStack = net - initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. @@ -575,6 +577,10 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) log.Infof("Kernel load stats: %s", &stats) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) + // rootNetworkNamespace should be populated after loading the state file. + // Restore the root network stack. + k.rootNetworkNamespace.RestoreRootStack(net) + // Load the memory file's state. memoryStart := time.Now() if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil { @@ -905,6 +911,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, + NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, @@ -1255,10 +1262,9 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } -// NetworkStack returns the network stack. NetworkStack may return nil if no -// network stack is available. -func (k *Kernel) NetworkStack() inet.Stack { - return k.networkStack +// RootNetworkNamespace returns the root network namespace, always non-nil. +func (k *Kernel) RootNetworkNamespace() *inet.Namespace { + return k.rootNetworkNamespace } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index a3443ff21..e37e23231 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -486,13 +486,10 @@ type Task struct { numaPolicy int32 numaNodeMask uint64 - // If netns is true, the task is in a non-root network namespace. Network - // namespaces aren't currently implemented in full; being in a network - // namespace simply prevents the task from observing any network devices - // (including loopback) or using abstract socket addresses (see unix(7)). + // netns is the task's network namespace. netns is never nil. // - // netns is protected by mu. netns is owned by the task goroutine. - netns bool + // netns is protected by mu. + netns *inet.Namespace // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index ba74b4c1c..78866f280 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -54,8 +55,7 @@ type SharingOptions struct { NewUserNamespace bool // If NewNetworkNamespace is true, the task should have an independent - // network namespace. (Note that network namespaces are not really - // implemented; see comment on Task.netns for details.) + // network namespace. NewNetworkNamespace bool // If NewFiles is true, the task should use an independent file descriptor @@ -199,6 +199,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { ipcns = NewIPCNamespace(userns) } + netns := t.NetworkNamespace() + if opts.NewNetworkNamespace { + netns = inet.NewNamespace(netns) + } + // TODO(b/63601033): Implement CLONE_NEWNS. mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { @@ -268,7 +273,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), - NetworkNamespaced: t.netns, + NetworkNamespace: netns, AllowedCPUMask: t.CPUMask(), UTSNamespace: utsns, IPCNamespace: ipcns, @@ -283,9 +288,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - if opts.NewNetworkNamespace { - cfg.NetworkNamespaced = true - } nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { @@ -482,7 +484,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { t.mu.Unlock() return syserror.EPERM } - t.netns = true + t.netns = inet.NewNamespace(t.netns) } if opts.NewUTSNamespace { if !haveCapSysAdmin { diff --git a/pkg/sentry/kernel/task_net.go b/pkg/sentry/kernel/task_net.go index 172a31e1d..f7711232c 100644 --- a/pkg/sentry/kernel/task_net.go +++ b/pkg/sentry/kernel/task_net.go @@ -22,14 +22,23 @@ import ( func (t *Task) IsNetworkNamespaced() bool { t.mu.Lock() defer t.mu.Unlock() - return t.netns + return !t.netns.IsRoot() } // NetworkContext returns the network stack used by the task. NetworkContext // may return nil if no network stack is available. +// +// TODO(gvisor.dev/issue/1833): Migrate callers of this method to +// NetworkNamespace(). func (t *Task) NetworkContext() inet.Stack { - if t.IsNetworkNamespaced() { - return nil - } - return t.k.networkStack + t.mu.Lock() + defer t.mu.Unlock() + return t.netns.Stack() +} + +// NetworkNamespace returns the network namespace observed by the task. +func (t *Task) NetworkNamespace() *inet.Namespace { + t.mu.Lock() + defer t.mu.Unlock() + return t.netns } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index f9236a842..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -17,6 +17,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" @@ -65,9 +66,8 @@ type TaskConfig struct { // Niceness is the niceness of the new task. Niceness int - // If NetworkNamespaced is true, the new task should observe a non-root - // network namespace. - NetworkNamespaced bool + // NetworkNamespace is the network namespace to be used for the new task. + NetworkNamespace *inet.Namespace // AllowedCPUMask contains the cpus that this task can run on. AllowedCPUMask sched.CPUSet @@ -133,7 +133,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { allowedCPUMask: cfg.AllowedCPUMask.Copy(), ioUsage: &usage.IO{}, niceness: cfg.Niceness, - netns: cfg.NetworkNamespaced, + netns: cfg.NetworkNamespace, utsns: cfg.UTSNamespace, ipcns: cfg.IPCNamespace, abstractSockets: cfg.AbstractSocketNamespace, diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 48764b978..2f98a996f 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -25,6 +25,8 @@ import ( ) // StdClock implements Clock with the time package. +// +// +stateify savable type StdClock struct{} var _ Clock = (*StdClock)(nil) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index ae4dd102a..26f68fe3d 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -19,7 +19,6 @@ go_library( "loader_amd64.go", "loader_arm64.go", "network.go", - "pprof.go", "strace.go", "user.go", ], @@ -91,6 +90,7 @@ go_library( "//pkg/usermem", "//runsc/boot/filter", "//runsc/boot/platforms", + "//runsc/boot/pprof", "//runsc/specutils", "@com_github_golang_protobuf//proto:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 9c9e94864..17e774e0c 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -32,6 +32,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/urpc" + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" ) @@ -142,7 +143,7 @@ func newController(fd int, l *Loader) (*controller, error) { } srv.Register(manager) - if eps, ok := l.k.NetworkStack().(*netstack.Stack); ok { + if eps, ok := l.k.RootNetworkNamespace().Stack().(*netstack.Stack); ok { net := &Network{ Stack: eps.Stack, } @@ -341,7 +342,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { return fmt.Errorf("creating memory file: %v", err) } k.SetMemoryFile(mf) - networkStack := cm.l.k.NetworkStack() + networkStack := cm.l.k.RootNetworkNamespace().Stack() cm.l.k = k // Set up the restore environment. @@ -365,9 +366,9 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { } if cm.l.conf.ProfileEnable { - // initializePProf opens /proc/self/maps, so has to be - // called before installing seccomp filters. - initializePProf() + // pprof.Initialize opens /proc/self/maps, so has to be called before + // installing seccomp filters. + pprof.Initialize() } // Seccomp filters have to be applied before parsing the state file. diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index eef43b9df..e7ca98134 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -49,6 +49,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/loopback" "gvisor.dev/gvisor/pkg/tcpip/link/sniffer" "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" @@ -60,6 +61,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/runsc/boot/filter" _ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms. + "gvisor.dev/gvisor/runsc/boot/pprof" "gvisor.dev/gvisor/runsc/specutils" // Include supported socket providers. @@ -230,11 +232,8 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("enabling strace: %v", err) } - // Create an empty network stack because the network namespace may be empty at - // this point. Netns is configured before Run() is called. Netstack is - // configured using a control uRPC message. Host network is configured inside - // Run(). - networkStack, err := newEmptyNetworkStack(args.Conf, k, k) + // Create root network namespace/stack. + netns, err := newRootNetworkNamespace(args.Conf, k, k) if err != nil { return nil, fmt.Errorf("creating network: %v", err) } @@ -277,7 +276,7 @@ func New(args Args) (*Loader, error) { FeatureSet: cpuid.HostFeatureSet(), Timekeeper: tk, RootUserNamespace: creds.UserNamespace, - NetworkStack: networkStack, + RootNetworkNamespace: netns, ApplicationCores: uint(args.NumCPU), Vdso: vdso, RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace), @@ -466,7 +465,7 @@ func (l *Loader) run() error { // Delay host network configuration to this point because network namespace // is configured after the loader is created and before Run() is called. log.Debugf("Configuring host network") - stack := l.k.NetworkStack().(*hostinet.Stack) + stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack) if err := stack.Configure(); err != nil { return err } @@ -485,7 +484,7 @@ func (l *Loader) run() error { // l.restore is set by the container manager when a restore call is made. if !l.restore { if l.conf.ProfileEnable { - initializePProf() + pprof.Initialize() } // Finally done with all configuration. Setup filters before user code @@ -908,48 +907,92 @@ func (l *Loader) WaitExit() kernel.ExitStatus { return l.k.GlobalInit().ExitStatus() } -func newEmptyNetworkStack(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { +func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) { + // Create an empty network stack because the network namespace may be empty at + // this point. Netns is configured before Run() is called. Netstack is + // configured using a control uRPC message. Host network is configured inside + // Run(). switch conf.Network { case NetworkHost: - return hostinet.NewStack(), nil + // No network namespacing support for hostinet yet, hence creator is nil. + return inet.NewRootNamespace(hostinet.NewStack(), nil), nil case NetworkNone, NetworkSandbox: - // NetworkNone sets up loopback using netstack. - netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} - transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} - s := netstack.Stack{stack.New(stack.Options{ - NetworkProtocols: netProtos, - TransportProtocols: transProtos, - Clock: clock, - Stats: netstack.Metrics, - HandleLocal: true, - // Enable raw sockets for users with sufficient - // privileges. - RawFactory: raw.EndpointFactory{}, - UniqueID: uniqueID, - })} - - // Enable SACK Recovery. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { - return nil, fmt.Errorf("failed to enable SACK: %v", err) + s, err := newEmptySandboxNetworkStack(clock, uniqueID) + if err != nil { + return nil, err } + creator := &sandboxNetstackCreator{ + clock: clock, + uniqueID: uniqueID, + } + return inet.NewRootNamespace(s, creator), nil - // Set default TTLs as required by socket/netstack. - s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) - s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + default: + panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + } - // Enable Receive Buffer Auto-Tuning. - if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) - } +} - s.FillDefaultIPTables() +func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) { + netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()} + transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()} + s := netstack.Stack{stack.New(stack.Options{ + NetworkProtocols: netProtos, + TransportProtocols: transProtos, + Clock: clock, + Stats: netstack.Metrics, + HandleLocal: true, + // Enable raw sockets for users with sufficient + // privileges. + RawFactory: raw.EndpointFactory{}, + UniqueID: uniqueID, + })} - return &s, nil + // Enable SACK Recovery. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil { + return nil, fmt.Errorf("failed to enable SACK: %v", err) + } - default: - panic(fmt.Sprintf("invalid network configuration: %v", conf.Network)) + // Set default TTLs as required by socket/netstack. + s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL)) + + // Enable Receive Buffer Auto-Tuning. + if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { + return nil, fmt.Errorf("SetTransportProtocolOption failed: %v", err) + } + + s.FillDefaultIPTables() + + return &s, nil +} + +// sandboxNetstackCreator implements kernel.NetworkStackCreator. +// +// +stateify savable +type sandboxNetstackCreator struct { + clock tcpip.Clock + uniqueID stack.UniqueID +} + +// CreateStack implements kernel.NetworkStackCreator.CreateStack. +func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) { + s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID) + if err != nil { + return nil, err } + + // Setup loopback. + n := &Network{Stack: s.(*netstack.Stack).Stack} + nicID := tcpip.NICID(f.uniqueID.UniqueID()) + link := DefaultLoopbackLink + linkEP := loopback.New() + if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil { + return nil, err + } + + return s, nil } // signal sends a signal to one or more processes in a container. If PID is 0, diff --git a/runsc/boot/network.go b/runsc/boot/network.go index 6a8765ec8..bee6ee336 100644 --- a/runsc/boot/network.go +++ b/runsc/boot/network.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "net" + "strings" "syscall" "gvisor.dev/gvisor/pkg/log" @@ -31,6 +32,32 @@ import ( "gvisor.dev/gvisor/pkg/urpc" ) +var ( + // DefaultLoopbackLink contains IP addresses and routes of "127.0.0.1/8" and + // "::1/8" on "lo" interface. + DefaultLoopbackLink = LoopbackLink{ + Name: "lo", + Addresses: []net.IP{ + net.IP("\x7f\x00\x00\x01"), + net.IPv6loopback, + }, + Routes: []Route{ + { + Destination: net.IPNet{ + IP: net.IPv4(0x7f, 0, 0, 0), + Mask: net.IPv4Mask(0xff, 0, 0, 0), + }, + }, + { + Destination: net.IPNet{ + IP: net.IPv6loopback, + Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), + }, + }, + }, + } +) + // Network exposes methods that can be used to configure a network stack. type Network struct { Stack *stack.Stack diff --git a/runsc/boot/pprof.go b/runsc/boot/pprof.go deleted file mode 100644 index 463362f02..000000000 --- a/runsc/boot/pprof.go +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -func initializePProf() { -} diff --git a/runsc/boot/pprof/BUILD b/runsc/boot/pprof/BUILD new file mode 100644 index 000000000..29cb42b2f --- /dev/null +++ b/runsc/boot/pprof/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "pprof", + srcs = ["pprof.go"], + visibility = [ + "//runsc:__subpackages__", + ], +) diff --git a/runsc/boot/pprof/pprof.go b/runsc/boot/pprof/pprof.go new file mode 100644 index 000000000..1ded20dee --- /dev/null +++ b/runsc/boot/pprof/pprof.go @@ -0,0 +1,20 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pprof provides a stub to initialize custom profilers. +package pprof + +// Initialize will be called at boot for initializing custom profilers. +func Initialize() { +} diff --git a/runsc/sandbox/network.go b/runsc/sandbox/network.go index 99e143696..bc093fba5 100644 --- a/runsc/sandbox/network.go +++ b/runsc/sandbox/network.go @@ -21,7 +21,6 @@ import ( "path/filepath" "runtime" "strconv" - "strings" "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" @@ -75,30 +74,8 @@ func setupNetwork(conn *urpc.Client, pid int, spec *specs.Spec, conf *boot.Confi } func createDefaultLoopbackInterface(conn *urpc.Client) error { - link := boot.LoopbackLink{ - Name: "lo", - Addresses: []net.IP{ - net.IP("\x7f\x00\x00\x01"), - net.IPv6loopback, - }, - Routes: []boot.Route{ - { - Destination: net.IPNet{ - - IP: net.IPv4(0x7f, 0, 0, 0), - Mask: net.IPv4Mask(0xff, 0, 0, 0), - }, - }, - { - Destination: net.IPNet{ - IP: net.IPv6loopback, - Mask: net.IPMask(strings.Repeat("\xff", net.IPv6len)), - }, - }, - }, - } if err := conn.Call(boot.NetworkCreateLinksAndRoutes, &boot.CreateLinksAndRoutesArgs{ - LoopbackLinks: []boot.LoopbackLink{link}, + LoopbackLinks: []boot.LoopbackLink{boot.DefaultLoopbackLink}, }, nil); err != nil { return fmt.Errorf("creating loopback link and routes: %v", err) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index d69ac8356..d1977d4de 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -258,6 +258,8 @@ syscall_test( syscall_test(test = "//test/syscalls/linux:munmap_test") +syscall_test(test = "//test/syscalls/linux:network_namespace_test") + syscall_test( add_overlay = True, test = "//test/syscalls/linux:open_create_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 05a818795..aa303af84 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3639,6 +3639,23 @@ cc_binary( ], ) +cc_binary( + name = "network_namespace_test", + testonly = 1, + srcs = ["network_namespace.cc"], + linkstatic = 1, + deps = [ + ":socket_test_util", + gtest, + "//test/util:capability_util", + "//test/util:memory_util", + "//test/util:test_main", + "//test/util:test_util", + "//test/util:thread_util", + "@com_google_absl//absl/synchronization", + ], +) + cc_binary( name = "semaphore_test", testonly = 1, diff --git a/test/syscalls/linux/network_namespace.cc b/test/syscalls/linux/network_namespace.cc new file mode 100644 index 000000000..6ea48c263 --- /dev/null +++ b/test/syscalls/linux/network_namespace.cc @@ -0,0 +1,121 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "absl/synchronization/notification.h" +#include "test/syscalls/linux/socket_test_util.h" +#include "test/util/capability_util.h" +#include "test/util/memory_util.h" +#include "test/util/test_util.h" +#include "test/util/thread_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +using TestFunc = std::function; +using RunFunc = std::function; + +struct NamespaceStrategy { + RunFunc run; + + static NamespaceStrategy Of(RunFunc run) { + NamespaceStrategy s; + s.run = run; + return s; + } +}; + +PosixError RunWithUnshare(TestFunc fn) { + PosixError err = PosixError(-1, "function did not return a value"); + ScopedThread t([&] { + if (unshare(CLONE_NEWNET) != 0) { + err = PosixError(errno); + return; + } + err = fn(); + }); + t.Join(); + return err; +} + +PosixError RunWithClone(TestFunc fn) { + struct Args { + absl::Notification n; + TestFunc fn; + PosixError err; + }; + Args args; + args.fn = fn; + args.err = PosixError(-1, "function did not return a value"); + + ASSIGN_OR_RETURN_ERRNO( + Mapping child_stack, + MmapAnon(kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE)); + pid_t child = clone( + +[](void *arg) { + Args *args = reinterpret_cast(arg); + args->err = args->fn(); + args->n.Notify(); + syscall(SYS_exit, 0); // Exit manually. No return address on stack. + return 0; + }, + reinterpret_cast(child_stack.addr() + kPageSize), + CLONE_NEWNET | CLONE_THREAD | CLONE_SIGHAND | CLONE_VM, &args); + if (child < 0) { + return PosixError(errno, "clone() failed"); + } + args.n.WaitForNotification(); + return args.err; +} + +class NetworkNamespaceTest + : public ::testing::TestWithParam {}; + +TEST_P(NetworkNamespaceTest, LoopbackExists) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); + + EXPECT_NO_ERRNO(GetParam().run([]() { + // TODO(gvisor.dev/issue/1833): Update this to test that only "lo" exists. + // Check loopback device exists. + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + return PosixError(errno, "socket() failed"); + } + struct ifreq ifr; + snprintf(ifr.ifr_name, IFNAMSIZ, "lo"); + if (ioctl(sock, SIOCGIFINDEX, &ifr) < 0) { + return PosixError(errno, "ioctl() failed, lo cannot be found"); + } + return NoError(); + })); +} + +INSTANTIATE_TEST_SUITE_P( + AllNetworkNamespaceTest, NetworkNamespaceTest, + ::testing::Values(NamespaceStrategy::Of(RunWithUnshare), + NamespaceStrategy::Of(RunWithClone))); + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 471b15b212831af31c2fe36cd42cea7ec7b7785b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 25 Feb 2020 13:25:36 -0800 Subject: Port most syscalls to VFS2. pipe and pipe2 aren't ported, pending a slight rework of pipe FDs for VFS2. mount and umount2 aren't ported out of temporary laziness. access and faccessat need additional FSImpl methods to implement properly, but are stubbed to prevent googletest from CHECK-failing. Other syscalls require additional plumbing. Updates #1623 PiperOrigin-RevId: 297188448 --- pkg/abi/linux/epoll_amd64.go | 2 + pkg/abi/linux/epoll_arm64.go | 2 + pkg/abi/linux/file.go | 2 + pkg/abi/linux/fs.go | 2 + pkg/abi/linux/signal.go | 2 + pkg/abi/linux/time.go | 6 + pkg/abi/linux/xattr.go | 1 + pkg/fspath/BUILD | 4 +- pkg/fspath/builder.go | 8 + pkg/fspath/builder_unsafe.go | 27 - pkg/fspath/fspath.go | 3 +- pkg/gohacks/BUILD | 11 + pkg/gohacks/gohacks_unsafe.go | 57 ++ pkg/sentry/fsbridge/vfs.go | 10 +- pkg/sentry/fsimpl/proc/tasks.go | 4 +- pkg/sentry/kernel/fd_table.go | 49 +- pkg/sentry/kernel/fs_context.go | 22 + pkg/sentry/kernel/task.go | 18 + pkg/sentry/syscalls/linux/sys_epoll.go | 4 + pkg/sentry/syscalls/linux/sys_file.go | 40 ++ pkg/sentry/syscalls/linux/sys_getdents.go | 4 + pkg/sentry/syscalls/linux/sys_lseek.go | 4 + pkg/sentry/syscalls/linux/sys_mmap.go | 4 + pkg/sentry/syscalls/linux/sys_read.go | 4 + pkg/sentry/syscalls/linux/sys_stat.go | 4 + pkg/sentry/syscalls/linux/sys_sync.go | 4 + pkg/sentry/syscalls/linux/sys_write.go | 4 + pkg/sentry/syscalls/linux/sys_xattr.go | 4 + pkg/sentry/syscalls/linux/vfs2/BUILD | 28 +- pkg/sentry/syscalls/linux/vfs2/epoll.go | 225 ++++++++ pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go | 44 ++ pkg/sentry/syscalls/linux/vfs2/execve.go | 137 +++++ pkg/sentry/syscalls/linux/vfs2/fd.go | 147 ++++++ pkg/sentry/syscalls/linux/vfs2/filesystem.go | 326 ++++++++++++ pkg/sentry/syscalls/linux/vfs2/fscontext.go | 131 +++++ pkg/sentry/syscalls/linux/vfs2/getdents.go | 149 ++++++ pkg/sentry/syscalls/linux/vfs2/ioctl.go | 35 ++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 216 ++++---- .../syscalls/linux/vfs2/linux64_override_arm64.go | 2 + pkg/sentry/syscalls/linux/vfs2/mmap.go | 92 ++++ pkg/sentry/syscalls/linux/vfs2/path.go | 94 ++++ pkg/sentry/syscalls/linux/vfs2/poll.go | 584 +++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 511 ++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/setstat.go | 380 ++++++++++++++ pkg/sentry/syscalls/linux/vfs2/stat.go | 346 ++++++++++++ pkg/sentry/syscalls/linux/vfs2/sync.go | 87 +++ pkg/sentry/syscalls/linux/vfs2/sys_read.go | 95 ---- pkg/sentry/syscalls/linux/vfs2/xattr.go | 353 +++++++++++++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/epoll.go | 3 + pkg/sentry/vfs/mount_unsafe.go | 12 +- pkg/sentry/vfs/resolving_path.go | 2 +- pkg/sentry/vfs/vfs.go | 10 +- pkg/usermem/BUILD | 2 +- pkg/usermem/usermem.go | 9 +- pkg/usermem/usermem_unsafe.go | 27 - runsc/boot/filter/config.go | 2 + 57 files changed, 4082 insertions(+), 274 deletions(-) delete mode 100644 pkg/fspath/builder_unsafe.go create mode 100644 pkg/gohacks/BUILD create mode 100644 pkg/gohacks/gohacks_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/execve.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/fd.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/filesystem.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/fscontext.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/getdents.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/ioctl.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/mmap.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/path.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/poll.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/read_write.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/setstat.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/stat.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/sync.go delete mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_read.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/xattr.go delete mode 100644 pkg/usermem/usermem_unsafe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/epoll_amd64.go b/pkg/abi/linux/epoll_amd64.go index 57041491c..34ff18009 100644 --- a/pkg/abi/linux/epoll_amd64.go +++ b/pkg/abi/linux/epoll_amd64.go @@ -15,6 +15,8 @@ package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal type EpollEvent struct { Events uint32 // Linux makes struct epoll_event::data a __u64. We represent it as diff --git a/pkg/abi/linux/epoll_arm64.go b/pkg/abi/linux/epoll_arm64.go index 62ef5821e..f86c35329 100644 --- a/pkg/abi/linux/epoll_arm64.go +++ b/pkg/abi/linux/epoll_arm64.go @@ -15,6 +15,8 @@ package linux // EpollEvent is equivalent to struct epoll_event from epoll(2). +// +// +marshal type EpollEvent struct { Events uint32 // Linux makes struct epoll_event a __u64, necessitating 4 bytes of padding diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index c3ab15a4f..e229ac21c 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -241,6 +241,8 @@ const ( ) // Statx represents struct statx. +// +// +marshal type Statx struct { Mask uint32 Blksize uint32 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 2c652baa2..158d2db5b 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -38,6 +38,8 @@ const ( ) // Statfs is struct statfs, from uapi/asm-generic/statfs.h. +// +// +marshal type Statfs struct { // Type is one of the filesystem magic values, defined above. Type uint64 diff --git a/pkg/abi/linux/signal.go b/pkg/abi/linux/signal.go index c69b04ea9..1c330e763 100644 --- a/pkg/abi/linux/signal.go +++ b/pkg/abi/linux/signal.go @@ -115,6 +115,8 @@ const ( ) // SignalSet is a signal mask with a bit corresponding to each signal. +// +// +marshal type SignalSet uint64 // SignalSetSize is the size in bytes of a SignalSet. diff --git a/pkg/abi/linux/time.go b/pkg/abi/linux/time.go index e562b46d9..e6860ed49 100644 --- a/pkg/abi/linux/time.go +++ b/pkg/abi/linux/time.go @@ -157,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec { const SizeOfTimeval = 16 // Timeval represents struct timeval in . +// +// +marshal type Timeval struct { Sec int64 Usec int64 @@ -230,6 +232,8 @@ type Tms struct { type TimerID int32 // StatxTimestamp represents struct statx_timestamp. +// +// +marshal type StatxTimestamp struct { Sec int64 Nsec uint32 @@ -258,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) { } // Utime represents struct utimbuf used by utimes(2). +// +// +marshal type Utime struct { Actime int64 Modtime int64 diff --git a/pkg/abi/linux/xattr.go b/pkg/abi/linux/xattr.go index a3b6406fa..99180b208 100644 --- a/pkg/abi/linux/xattr.go +++ b/pkg/abi/linux/xattr.go @@ -18,6 +18,7 @@ package linux const ( XATTR_NAME_MAX = 255 XATTR_SIZE_MAX = 65536 + XATTR_LIST_MAX = 65536 XATTR_CREATE = 1 XATTR_REPLACE = 2 diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD index ee84471b2..67dd1e225 100644 --- a/pkg/fspath/BUILD +++ b/pkg/fspath/BUILD @@ -8,9 +8,11 @@ go_library( name = "fspath", srcs = [ "builder.go", - "builder_unsafe.go", "fspath.go", ], + deps = [ + "//pkg/gohacks", + ], ) go_test( diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go index 7ddb36826..6318d3874 100644 --- a/pkg/fspath/builder.go +++ b/pkg/fspath/builder.go @@ -16,6 +16,8 @@ package fspath import ( "fmt" + + "gvisor.dev/gvisor/pkg/gohacks" ) // Builder is similar to strings.Builder, but is used to produce pathnames @@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) { copy(b.buf[b.start:], b.buf[oldStart:]) copy(b.buf[len(b.buf)-len(str):], str) } + +// String returns the accumulated string. No other methods should be called +// after String. +func (b *Builder) String() string { + return gohacks.StringFromImmutableBytes(b.buf[b.start:]) +} diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go deleted file mode 100644 index 75606808d..000000000 --- a/pkg/fspath/builder_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package fspath - -import ( - "unsafe" -) - -// String returns the accumulated string. No other methods should be called -// after String. -func (b *Builder) String() string { - bs := b.buf[b.start:] - // Compare strings.Builder.String(). - return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go index 9fb3fee24..4c983d5fd 100644 --- a/pkg/fspath/fspath.go +++ b/pkg/fspath/fspath.go @@ -67,7 +67,8 @@ func Parse(pathname string) Path { // Path contains the information contained in a pathname string. // -// Path is copyable by value. +// Path is copyable by value. The zero value for Path is equivalent to +// fspath.Parse(""), i.e. the empty path. type Path struct { // Begin is an iterator to the first path component in the relative part of // the path. diff --git a/pkg/gohacks/BUILD b/pkg/gohacks/BUILD new file mode 100644 index 000000000..798a65eca --- /dev/null +++ b/pkg/gohacks/BUILD @@ -0,0 +1,11 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "gohacks", + srcs = [ + "gohacks_unsafe.go", + ], + visibility = ["//:sandbox"], +) diff --git a/pkg/gohacks/gohacks_unsafe.go b/pkg/gohacks/gohacks_unsafe.go new file mode 100644 index 000000000..aad675172 --- /dev/null +++ b/pkg/gohacks/gohacks_unsafe.go @@ -0,0 +1,57 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gohacks contains utilities for subverting the Go compiler. +package gohacks + +import ( + "reflect" + "unsafe" +) + +// Noescape hides a pointer from escape analysis. Noescape is the identity +// function but escape analysis doesn't think the output depends on the input. +// Noescape is inlined and currently compiles down to zero instructions. +// USE CAREFULLY! +// +// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().) +// +//go:nosplit +func Noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} + +// ImmutableBytesFromString is equivalent to []byte(s), except that it uses the +// same memory backing s instead of making a heap-allocated copy. This is only +// valid if the returned slice is never mutated. +func ImmutableBytesFromString(s string) []byte { + shdr := (*reflect.StringHeader)(unsafe.Pointer(&s)) + var bs []byte + bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs)) + bshdr.Data = shdr.Data + bshdr.Len = shdr.Len + bshdr.Cap = shdr.Len + return bs +} + +// StringFromImmutableBytes is equivalent to string(bs), except that it uses +// the same memory backing bs instead of making a heap-allocated copy. This is +// only valid if bs is never mutated after StringFromImmutableBytes returns. +func StringFromImmutableBytes(bs []byte) string { + // This is cheaper than messing with reflect.StringHeader and + // reflect.SliceHeader, which as of this writing produces many dead stores + // of zeroes. Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) +} diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index e657c39bc..6aa17bfc1 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) // default anyways. // // TODO(gvisor.dev/issue/1623): Check mount has read and exec permission. -func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { +func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) + path := fspath.Parse(pathname) pop := &vfs.PathOperation{ Root: l.root, - Start: l.root, - Path: fspath.Parse(path), + Start: l.workingDir, + Path: path, FollowFinalSymlink: resolveFinal, } + if path.Absolute { + pop.Start = l.root + } fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index ce08a7d53..10c08fa90 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), "net": newNetDir(root, inoGen, k), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), } inode := &tasksInode{ diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 23b88f7a6..58001d56c 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags return fds, nil } +// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for +// the given file description. If it succeeds, it takes a reference on file. +func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + if minfd < 0 { + // Don't accept negative FDs. + return -1, syscall.EINVAL + } + + // Default limit. + end := int32(math.MaxInt32) + + // Ensure we don't get past the provided limit. + if limitSet := limits.FromContext(ctx); limitSet != nil { + lim := limitSet.Get(limits.NumberOfFiles) + if lim.Cur != limits.Infinity { + end = int32(lim.Cur) + } + if minfd >= end { + return -1, syscall.EMFILE + } + } + + f.mu.Lock() + defer f.mu.Unlock() + + // From f.next to find available fd. + fd := minfd + if fd < f.next { + fd = f.next + } + for fd < end { + if d, _, _ := f.get(fd); d == nil { + f.setVFS2(fd, file, flags) + if fd == f.next { + // Update next search start position. + f.next = fd + 1 + } + return fd, nil + } + fd++ + } + return -1, syscall.EMFILE +} + // NewFDAt sets the file reference for the given FD. If there is an active // reference for that FD, the ref count for that existing reference is // decremented. @@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 return syscall.EBADF } - f.mu.Lock() - defer f.mu.Unlock() - // Check the limit for the provided file. if limitSet := limits.FromContext(ctx); limitSet != nil { if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur { @@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 } // Install the entry. + f.mu.Lock() + defer f.mu.Unlock() f.setAll(fd, file, fileVFS2, flags) return nil } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 7218aa24e..47f78df9a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old.DecRef() } +// SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. +// +// This is not a valid call after free. +func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { + if !vd.Ok() { + panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") + } + + f.mu.Lock() + + if !f.rootVFS2.Ok() { + f.mu.Unlock() + panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd)) + } + + old := f.rootVFS2 + vd.IncRef() + f.rootVFS2 = vd + f.mu.Unlock() + old.DecRef() +} + // Umask returns the current umask. func (f *FSContext) Umask() uint { f.mu.Lock() diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e37e23231..2cee2e6ed 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) return fds[0], nil } +// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable.Get. +func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { + return t.fdTable.NewFDVFS2(t, fd, file, flags) +} + // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. // // This automatically passes the task as the context. @@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { return t.fdTable.NewFDAt(t, fd, file, flags) } +// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2. +// +// This automatically passes the task as the context. +// +// Precondition: same as FDTable. +func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error { + return t.fdTable.NewFDAtVFS2(t, fd, file, flags) +} + // WithMuLocked executes f with t.mu locked. func (t *Task) WithMuLocked(f func(*Task)) { t.mu.Lock() diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index fbef5b376..3ab93fbde 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // EpollCreate1 implements the epoll_create1(2) linux syscall. func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() @@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return EpollWait(t, args) } + +// LINT.ThenChange(vfs2/epoll.go) diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 421845ebb..c21f14dc0 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string return path, dirPath, nil } +// LINT.IfChange + func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Ioctl implements linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/ioctl.go) + +// LINT.IfChange + // Getcwd implements the linux syscall getcwd(2). func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, nil } +// LINT.ThenChange(vfs2/fscontext.go) + +// LINT.IfChange + // Close implements linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } } +// LINT.ThenChange(vfs2/fd.go) + const ( _FADV_NORMAL = 0 _FADV_RANDOM = 1 @@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.IfChange + func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error { path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return n, nil, err } +// LINT.ThenChange(vfs2/stat.go) + +// LINT.IfChange + func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error { path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, unlinkAt(t, dirFD, addr) } +// LINT.ThenChange(vfs2/filesystem.go) + +// LINT.IfChange + // Truncate implements linux syscall truncate(2). func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() @@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, nil } +// LINT.ThenChange(vfs2/setstat.go) + // Umask implements linux syscall umask(2). func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mask := args[0].ModeT() @@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(mask), nil, nil } +// LINT.IfChange + // Change ownership of a file. // // uid and gid may be -1, in which case they will not be changed. @@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true) } +// LINT.ThenChange(vfs2/setstat.go) + +// LINT.IfChange + func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error { newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */) if err != nil { @@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr) } +// LINT.ThenChange(vfs2/filesystem.go) + // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index f66f4ffde..b126fecc0 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -27,6 +27,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Getdents implements linux syscall getdents(2) for 64bit systems. func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error { func (ds *direntSerializer) Written() int { return ds.written } + +// LINT.ThenChange(vfs2/getdents.go) diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 297e920c4..3f7691eae 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // Lseek implements linux syscall lseek(2). func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() @@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } return uintptr(offset), nil, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 9959f6e61..91694d374 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo return uintptr(addr), nil, nil } +// LINT.IfChange + // Mmap implements linux syscall mmap(2). func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() @@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC return uintptr(rv), nil, err } +// LINT.ThenChange(vfs2/mmap.go) + // Munmap implements linux syscall munmap(2). func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 227692f06..78a2cb750 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskRead contains events that can be triggered on reads. EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr @@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 11f25e00d..701b27b4a 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat { return linux.Stat{ Dev: sattr.DeviceID, @@ -297,3 +299,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error { _, err = t.CopyOut(addr, &statfs) return err } + +// LINT.ThenChange(vfs2/stat.go) diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 3e55235bd..5ad465ae3 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -22,6 +22,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // Sync implements linux system call sync(2). func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.MountNamespace().SyncAll(t) @@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } + +// LINT.ThenChange(vfs2/sync.go) diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index aba892939..506ee54ce 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -28,6 +28,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // EventMaskWrite contains events that can be triggered on writes. // @@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) ( return total, err } + +// LINT.ThenChange(vfs2/read_write.go) diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index 9d8140b8a..2de5e3422 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // GetXattr implements linux syscall getxattr(2). func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getXattrFromPath(t, args, true) @@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error { return d.Inode.RemoveXattr(t, d, name) } + +// LINT.ThenChange(vfs2/xattr.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 6b8a00b6e..f51761e81 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -5,18 +5,44 @@ package(licenses = ["notice"]) go_library( name = "vfs2", srcs = [ + "epoll.go", + "epoll_unsafe.go", + "execve.go", + "fd.go", + "filesystem.go", + "fscontext.go", + "getdents.go", + "ioctl.go", "linux64.go", "linux64_override_amd64.go", "linux64_override_arm64.go", - "sys_read.go", + "mmap.go", + "path.go", + "poll.go", + "read_write.go", + "setstat.go", + "stat.go", + "sync.go", + "xattr.go", ], + marshal = True, visibility = ["//:sandbox"], deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/gohacks", "//pkg/sentry/arch", + "//pkg/sentry/fsbridge", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/loader", + "//pkg/sentry/memmap", "//pkg/sentry/syscalls", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", + "//pkg/sync", "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go new file mode 100644 index 000000000..d6cb0e79a --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -0,0 +1,225 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "math" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EpollCreate1 implements Linux syscall epoll_create1(2). +func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^linux.EPOLL_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCreate implements Linux syscall epoll_create(2). +func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + size := args[0].Int() + + // "Since Linux 2.6.8, the size argument is ignored, but must be greater + // than zero" - epoll_create(2) + if size <= 0 { + return 0, nil, syserror.EINVAL + } + + file, err := t.Kernel().VFS().NewEpollInstanceFD() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// EpollCtl implements Linux syscall epoll_ctl(2). +func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + op := args[1].Int() + fd := args[2].Int() + eventAddr := args[3].Pointer() + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + if epfile == file { + return 0, nil, syserror.EINVAL + } + + var event linux.EpollEvent + switch op { + case linux.EPOLL_CTL_ADD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.AddInterest(file, fd, event) + case linux.EPOLL_CTL_DEL: + return 0, nil, ep.DeleteInterest(file, fd) + case linux.EPOLL_CTL_MOD: + if err := event.CopyIn(t, eventAddr); err != nil { + return 0, nil, err + } + return 0, nil, ep.ModifyInterest(file, fd, event) + default: + return 0, nil, syserror.EINVAL + } +} + +// EpollWait implements Linux syscall epoll_wait(2). +func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + epfd := args[0].Int() + eventsAddr := args[1].Pointer() + maxEvents := int(args[2].Int()) + timeout := int(args[3].Int()) + + const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS + if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { + return 0, nil, syserror.EINVAL + } + + epfile := t.GetFileVFS2(epfd) + if epfile == nil { + return 0, nil, syserror.EBADF + } + defer epfile.DecRef() + ep, ok := epfile.Impl().(*vfs.EpollInstance) + if !ok { + return 0, nil, syserror.EINVAL + } + + // Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent, + // maxEvents), so that the buffer can be allocated on the stack. + var ( + events [16]linux.EpollEvent + total int + ch chan struct{} + haveDeadline bool + deadline ktime.Time + ) + for { + batchEvents := len(events) + if batchEvents > maxEvents { + batchEvents = maxEvents + } + n := ep.ReadEvents(events[:batchEvents]) + maxEvents -= n + if n != 0 { + // Copy what we read out. + copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n]) + eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent) + total += copiedEvents + if err != nil { + if total != 0 { + return uintptr(total), nil, nil + } + return 0, nil, err + } + // If we've filled the application's event buffer, we're done. + if maxEvents == 0 { + return uintptr(total), nil, nil + } + // Loop if we read a full batch, under the expectation that there + // may be more events to read. + if n == batchEvents { + continue + } + } + // We get here if n != batchEvents. If we read any number of events + // (just now, or in a previous iteration of this loop), or if timeout + // is 0 (such that epoll_wait should be non-blocking), return the + // events we've read so far to the application. + if total != 0 || timeout == 0 { + return uintptr(total), nil, nil + } + // In the first iteration of this loop, register with the epoll + // instance for readability events, but then immediately continue the + // loop since we need to retry ReadEvents() before blocking. In all + // subsequent iterations, block until events are available, the timeout + // expires, or an interrupt arrives. + if ch == nil { + var w waiter.Entry + w, ch = waiter.NewChannelEntry(nil) + epfile.EventRegister(&w, waiter.EventIn) + defer epfile.EventUnregister(&w) + } else { + // Set up the timer if a timeout was specified. + if timeout > 0 && !haveDeadline { + timeoutDur := time.Duration(timeout) * time.Millisecond + deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) + haveDeadline = true + } + if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + // total must be 0 since otherwise we would have returned + // above. + return 0, nil, err + } + } + } +} + +// EpollPwait implements Linux syscall epoll_pwait(2). +func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + maskAddr := args[4].Pointer() + maskSize := uint(args[5].Uint()) + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + return EpollWait(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go new file mode 100644 index 000000000..825f325bf --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/epoll_unsafe.go @@ -0,0 +1,44 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "reflect" + "runtime" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{})) + +func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) { + if len(events) == 0 { + return 0, nil + } + // Cast events to a byte slice for copying. + var eventBytes []byte + eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes)) + eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0]))) + eventBytesHdr.Len = len(events) * sizeofEpollEvent + eventBytesHdr.Cap = len(events) * sizeofEpollEvent + copiedBytes, err := t.CopyOutBytes(addr, eventBytes) + runtime.KeepAlive(events) + copiedEvents := copiedBytes / sizeofEpollEvent // rounded down + return copiedEvents, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go new file mode 100644 index 000000000..aef0078a8 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -0,0 +1,137 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/loader" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Execve implements linux syscall execve(2). +func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathnameAddr := args[0].Pointer() + argvAddr := args[1].Pointer() + envvAddr := args[2].Pointer() + return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */) +} + +// Execveat implements linux syscall execveat(2). +func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathnameAddr := args[1].Pointer() + argvAddr := args[2].Pointer() + envvAddr := args[3].Pointer() + flags := args[4].Int() + return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags) +} + +func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) + if err != nil { + return 0, nil, err + } + var argv, envv []string + if argvAddr != 0 { + var err error + argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + if envvAddr != 0 { + var err error + envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize) + if err != nil { + return 0, nil, err + } + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + var executable fsbridge.File + closeOnExec := false + if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { + // We must open the executable ourselves since dirfd is used as the + // starting point while resolving path, but the task working directory + // is used as the starting point while resolving interpreters (Linux: + // fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() => + // do_open_execat(fd=AT_FDCWD)), and the loader package is currently + // incapable of handling this correctly. + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + start := dirfile.VirtualDentry() + start.IncRef() + dirfile.DecRef() + closeOnExec = dirfileFlags.CloseOnExec + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + FileExec: true, + }) + start.DecRef() + if err != nil { + return 0, nil, err + } + defer file.DecRef() + executable = fsbridge.NewVFSFile(file) + } + + // Load the new TaskContext. + mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change + defer mntns.DecRef() + wd := t.FSContext().WorkingDirectoryVFS2() + defer wd.DecRef() + remainingTraversals := uint(linux.MaxSymlinkTraversals) + loadArgs := loader.LoadArgs{ + Opener: fsbridge.NewVFSLookup(mntns, root, wd), + RemainingTraversals: &remainingTraversals, + ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + Filename: pathname, + File: executable, + CloseOnExec: closeOnExec, + Argv: argv, + Envv: envv, + Features: t.Arch().FeatureSet(), + } + + tc, se := t.Kernel().LoadTaskImage(t, loadArgs) + if se != nil { + return 0, nil, se.ToError() + } + + ctrl, err := t.Execve(tc) + return 0, ctrl, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go new file mode 100644 index 000000000..3afcea665 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -0,0 +1,147 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Close implements Linux syscall close(2). +func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + // Note that Remove provides a reference on the file that we may use to + // flush. It is still active until we drop the final reference below + // (and other reference-holding operations complete). + _, file := t.FDTable().Remove(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := file.OnClose(t) + return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) +} + +// Dup implements Linux syscall dup(2). +func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) + if err != nil { + return 0, nil, syserror.EMFILE + } + return uintptr(newFD), nil, nil +} + +// Dup2 implements Linux syscall dup2(2). +func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + + if oldfd == newfd { + // As long as oldfd is valid, dup2() does nothing and returns newfd. + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + file.DecRef() + return uintptr(newfd), nil, nil + } + + return dup3(t, oldfd, newfd, 0) +} + +// Dup3 implements Linux syscall dup3(2). +func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldfd := args[0].Int() + newfd := args[1].Int() + flags := args[2].Uint() + + if oldfd == newfd { + return 0, nil, syserror.EINVAL + } + + return dup3(t, oldfd, newfd, flags) +} + +func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { + if flags&^linux.O_CLOEXEC != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(oldfd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(newfd), nil, nil +} + +// Fcntl implements linux syscall fcntl(2). +func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + cmd := args[1].Int() + + file, flags := t.FDTable().GetVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + switch cmd { + case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: + minfd := args[2].Int() + fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{ + CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil + case linux.F_GETFD: + return uintptr(flags.ToLinuxFDFlags()), nil, nil + case linux.F_SETFD: + flags := args[2].Uint() + t.FDTable().SetFlags(fd, kernel.FDFlags{ + CloseOnExec: flags&linux.FD_CLOEXEC != 0, + }) + return 0, nil, nil + case linux.F_GETFL: + return uintptr(file.StatusFlags()), nil, nil + case linux.F_SETFL: + return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + default: + // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. + return 0, nil, syserror.EINVAL + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go new file mode 100644 index 000000000..fc5ceea4c --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -0,0 +1,326 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Link implements Linux syscall link(2). +func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Linkat implements Linux syscall linkat(2). +func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Int() + return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { + return syserror.EINVAL + } + if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return syserror.ENOENT + } + + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) +} + +// Mkdir implements Linux syscall mkdir(2). +func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) +} + +// Mkdirat implements Linux syscall mkdirat(2). +func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, mkdirat(t, dirfd, addr, mode) +} + +func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ + Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) +} + +// Mknod implements Linux syscall mknod(2). +func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + dev := args[2].Uint() + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev) +} + +// Mknodat implements Linux syscall mknodat(2). +func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + dev := args[3].Uint() + return 0, nil, mknodat(t, dirfd, addr, mode, dev) +} + +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error { + path, err := copyInPath(t, addr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + major, minor := linux.DecodeDeviceID(dev) + return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ + Mode: linux.FileMode(mode &^ t.FSContext().Umask()), + DevMajor: uint32(major), + DevMinor: minor, + }) +} + +// Open implements Linux syscall open(2). +func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + mode := args[2].ModeT() + return openat(t, linux.AT_FDCWD, addr, flags, mode) +} + +// Openat implements Linux syscall openat(2). +func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + addr := args[1].Pointer() + flags := args[2].Uint() + mode := args[3].ModeT() + return openat(t, dirfd, addr, flags, mode) +} + +// Creat implements Linux syscall creat(2). +func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + mode := args[1].ModeT() + return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) +} + +func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ + Flags: flags, + Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), + }) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + return uintptr(fd), nil, err +} + +// Rename implements Linux syscall rename(2). +func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + oldpathAddr := args[0].Pointer() + newpathAddr := args[1].Pointer() + return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) +} + +// Renameat implements Linux syscall renameat(2). +func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) +} + +// Renameat2 implements Linux syscall renameat2(2). +func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + olddirfd := args[0].Int() + oldpathAddr := args[1].Pointer() + newdirfd := args[2].Int() + newpathAddr := args[3].Pointer() + flags := args[4].Uint() + return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) +} + +func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error { + oldpath, err := copyInPath(t, oldpathAddr) + if err != nil { + return err + } + // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) + oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer oldtpop.Release() + + newpath, err := copyInPath(t, newpathAddr) + if err != nil { + return err + } + newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer newtpop.Release() + + return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ + Flags: flags, + }) +} + +// Rmdir implements Linux syscall rmdir(2). +func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) +} + +func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) +} + +// Unlink implements Linux syscall unlink(2). +func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) +} + +func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) +} + +// Unlinkat implements Linux syscall unlinkat(2). +func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + + if flags&^linux.AT_REMOVEDIR != 0 { + return 0, nil, syserror.EINVAL + } + + if flags&linux.AT_REMOVEDIR != 0 { + return 0, nil, rmdirat(t, dirfd, pathAddr) + } + return 0, nil, unlinkat(t, dirfd, pathAddr) +} + +// Symlink implements Linux syscall symlink(2). +func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + linkpathAddr := args[1].Pointer() + return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) +} + +// Symlinkat implements Linux syscall symlinkat(2). +func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + targetAddr := args[0].Pointer() + newdirfd := args[1].Int() + linkpathAddr := args[2].Pointer() + return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) +} + +func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error { + target, err := t.CopyInString(targetAddr, linux.PATH_MAX) + if err != nil { + return err + } + linkpath, err := copyInPath(t, linkpathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go new file mode 100644 index 000000000..317409a18 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Getcwd implements Linux syscall getcwd(2). +func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + size := args[1].SizeT() + + root := t.FSContext().RootDirectoryVFS2() + wd := t.FSContext().WorkingDirectoryVFS2() + s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) + root.DecRef() + wd.DecRef() + if err != nil { + return 0, nil, err + } + + // Note this is >= because we need a terminator. + if uint(len(s)) >= size { + return 0, nil, syserror.ERANGE + } + + // Construct a byte slice containing a NUL terminator. + buf := t.CopyScratchBuffer(len(s) + 1) + copy(buf, s) + buf[len(buf)-1] = 0 + + // Write the pathname slice. + n, err := t.CopyOutBytes(addr, buf) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Chdir implements Linux syscall chdir(2). +func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Fchdir implements Linux syscall fchdir(2). +func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetWorkingDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} + +// Chroot implements Linux syscall chroot(2). +func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + + if !t.HasCapability(linux.CAP_SYS_CHROOT) { + return 0, nil, syserror.EPERM + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + return 0, nil, err + } + t.FSContext().SetRootDirectoryVFS2(vd) + vd.DecRef() + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go new file mode 100644 index 000000000..ddc140b65 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -0,0 +1,149 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Getdents implements Linux syscall getdents(2). +func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, false /* isGetdents64 */) +} + +// Getdents64 implements Linux syscall getdents64(2). +func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getdents(t, args, true /* isGetdents64 */) +} + +func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := int(args[2].Uint()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + cb := getGetdentsCallback(t, addr, size, isGetdents64) + err := file.IterDirents(t, cb) + n := size - cb.remaining + putGetdentsCallback(cb) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +type getdentsCallback struct { + t *kernel.Task + addr usermem.Addr + remaining int + isGetdents64 bool +} + +var getdentsCallbackPool = sync.Pool{ + New: func() interface{} { + return &getdentsCallback{} + }, +} + +func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback { + cb := getdentsCallbackPool.Get().(*getdentsCallback) + *cb = getdentsCallback{ + t: t, + addr: addr, + remaining: size, + isGetdents64: isGetdents64, + } + return cb +} + +func putGetdentsCallback(cb *getdentsCallback) { + cb.t = nil + getdentsCallbackPool.Put(cb) +} + +// Handle implements vfs.IterDirentsCallback.Handle. +func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { + var buf []byte + if cb.isGetdents64 { + // struct linux_dirent64 { + // ino64_t d_ino; /* 64-bit inode number */ + // off64_t d_off; /* 64-bit offset to next structure */ + // unsigned short d_reclen; /* Size of this dirent */ + // unsigned char d_type; /* File type */ + // char d_name[]; /* Filename (null-terminated) */ + // }; + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + buf[18] = dirent.Type + copy(buf[19:], dirent.Name) + buf[size-1] = 0 // NUL terminator + } else { + // struct linux_dirent { + // unsigned long d_ino; /* Inode number */ + // unsigned long d_off; /* Offset to next linux_dirent */ + // unsigned short d_reclen; /* Length of this linux_dirent */ + // char d_name[]; /* Filename (null-terminated) */ + // /* length is actually (d_reclen - 2 - + // offsetof(struct linux_dirent, d_name)) */ + // /* + // char pad; // Zero padding byte + // char d_type; // File type (only since Linux + // // 2.6.4); offset is (d_reclen - 1) + // */ + // }; + if cb.t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) + } + size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name) + if size < cb.remaining { + return syserror.EINVAL + } + buf = cb.t.CopyScratchBuffer(size) + usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino) + usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) + usermem.ByteOrder.PutUint16(buf[16:18], uint16(size)) + copy(buf[18:], dirent.Name) + buf[size-3] = 0 // NUL terminator + buf[size-2] = 0 // zero padding byte + buf[size-1] = dirent.Type + } + n, err := cb.t.CopyOutBytes(cb.addr, buf) + if err != nil { + // Don't report partially-written dirents by advancing cb.addr or + // cb.remaining. + return err + } + cb.addr += usermem.Addr(n) + cb.remaining -= n + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go new file mode 100644 index 000000000..5a2418da9 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -0,0 +1,35 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Ioctl implements Linux syscall ioctl(2). +func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + ret, err := file.Ioctl(t, t.MemoryManager(), args) + return ret, nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index e0ac32b33..7d220bc20 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build amd64 + package vfs2 import ( @@ -22,110 +24,142 @@ import ( // Override syscall table to add syscalls implementations from this package. func Override(table map[uintptr]kernel.Syscall) { table[0] = syscalls.Supported("read", Read) - - // Remove syscalls that haven't been converted yet. It's better to get ENOSYS - // rather than a SIGSEGV deep in the stack. - delete(table, 1) // write - delete(table, 2) // open - delete(table, 3) // close - delete(table, 4) // stat - delete(table, 5) // fstat - delete(table, 6) // lstat - delete(table, 7) // poll - delete(table, 8) // lseek - delete(table, 9) // mmap - delete(table, 16) // ioctl - delete(table, 17) // pread64 - delete(table, 18) // pwrite64 - delete(table, 19) // readv - delete(table, 20) // writev - delete(table, 21) // access - delete(table, 22) // pipe - delete(table, 32) // dup - delete(table, 33) // dup2 - delete(table, 40) // sendfile - delete(table, 59) // execve - delete(table, 72) // fcntl - delete(table, 73) // flock - delete(table, 74) // fsync - delete(table, 75) // fdatasync - delete(table, 76) // truncate - delete(table, 77) // ftruncate - delete(table, 78) // getdents - delete(table, 79) // getcwd - delete(table, 80) // chdir - delete(table, 81) // fchdir - delete(table, 82) // rename - delete(table, 83) // mkdir - delete(table, 84) // rmdir - delete(table, 85) // creat - delete(table, 86) // link - delete(table, 87) // unlink - delete(table, 88) // symlink - delete(table, 89) // readlink - delete(table, 90) // chmod - delete(table, 91) // fchmod - delete(table, 92) // chown - delete(table, 93) // fchown - delete(table, 94) // lchown - delete(table, 133) // mknod - delete(table, 137) // statfs - delete(table, 138) // fstatfs - delete(table, 161) // chroot - delete(table, 162) // sync + table[1] = syscalls.Supported("write", Write) + table[2] = syscalls.Supported("open", Open) + table[3] = syscalls.Supported("close", Close) + table[4] = syscalls.Supported("stat", Stat) + table[5] = syscalls.Supported("fstat", Fstat) + table[6] = syscalls.Supported("lstat", Lstat) + table[7] = syscalls.Supported("poll", Poll) + table[8] = syscalls.Supported("lseek", Lseek) + table[9] = syscalls.Supported("mmap", Mmap) + table[16] = syscalls.Supported("ioctl", Ioctl) + table[17] = syscalls.Supported("pread64", Pread64) + table[18] = syscalls.Supported("pwrite64", Pwrite64) + table[19] = syscalls.Supported("readv", Readv) + table[20] = syscalls.Supported("writev", Writev) + table[21] = syscalls.Supported("access", Access) + delete(table, 22) // pipe + table[23] = syscalls.Supported("select", Select) + table[32] = syscalls.Supported("dup", Dup) + table[33] = syscalls.Supported("dup2", Dup2) + delete(table, 40) // sendfile + delete(table, 41) // socket + delete(table, 42) // connect + delete(table, 43) // accept + delete(table, 44) // sendto + delete(table, 45) // recvfrom + delete(table, 46) // sendmsg + delete(table, 47) // recvmsg + delete(table, 48) // shutdown + delete(table, 49) // bind + delete(table, 50) // listen + delete(table, 51) // getsockname + delete(table, 52) // getpeername + delete(table, 53) // socketpair + delete(table, 54) // setsockopt + delete(table, 55) // getsockopt + table[59] = syscalls.Supported("execve", Execve) + table[72] = syscalls.Supported("fcntl", Fcntl) + delete(table, 73) // flock + table[74] = syscalls.Supported("fsync", Fsync) + table[75] = syscalls.Supported("fdatasync", Fdatasync) + table[76] = syscalls.Supported("truncate", Truncate) + table[77] = syscalls.Supported("ftruncate", Ftruncate) + table[78] = syscalls.Supported("getdents", Getdents) + table[79] = syscalls.Supported("getcwd", Getcwd) + table[80] = syscalls.Supported("chdir", Chdir) + table[81] = syscalls.Supported("fchdir", Fchdir) + table[82] = syscalls.Supported("rename", Rename) + table[83] = syscalls.Supported("mkdir", Mkdir) + table[84] = syscalls.Supported("rmdir", Rmdir) + table[85] = syscalls.Supported("creat", Creat) + table[86] = syscalls.Supported("link", Link) + table[87] = syscalls.Supported("unlink", Unlink) + table[88] = syscalls.Supported("symlink", Symlink) + table[89] = syscalls.Supported("readlink", Readlink) + table[90] = syscalls.Supported("chmod", Chmod) + table[91] = syscalls.Supported("fchmod", Fchmod) + table[92] = syscalls.Supported("chown", Chown) + table[93] = syscalls.Supported("fchown", Fchown) + table[94] = syscalls.Supported("lchown", Lchown) + table[132] = syscalls.Supported("utime", Utime) + table[133] = syscalls.Supported("mknod", Mknod) + table[137] = syscalls.Supported("statfs", Statfs) + table[138] = syscalls.Supported("fstatfs", Fstatfs) + table[161] = syscalls.Supported("chroot", Chroot) + table[162] = syscalls.Supported("sync", Sync) delete(table, 165) // mount delete(table, 166) // umount2 - delete(table, 172) // iopl - delete(table, 173) // ioperm delete(table, 187) // readahead - delete(table, 188) // setxattr - delete(table, 189) // lsetxattr - delete(table, 190) // fsetxattr - delete(table, 191) // getxattr - delete(table, 192) // lgetxattr - delete(table, 193) // fgetxattr + table[188] = syscalls.Supported("setxattr", Setxattr) + table[189] = syscalls.Supported("lsetxattr", Lsetxattr) + table[190] = syscalls.Supported("fsetxattr", Fsetxattr) + table[191] = syscalls.Supported("getxattr", Getxattr) + table[192] = syscalls.Supported("lgetxattr", Lgetxattr) + table[193] = syscalls.Supported("fgetxattr", Fgetxattr) + table[194] = syscalls.Supported("listxattr", Listxattr) + table[195] = syscalls.Supported("llistxattr", Llistxattr) + table[196] = syscalls.Supported("flistxattr", Flistxattr) + table[197] = syscalls.Supported("removexattr", Removexattr) + table[198] = syscalls.Supported("lremovexattr", Lremovexattr) + table[199] = syscalls.Supported("fremovexattr", Fremovexattr) delete(table, 206) // io_setup delete(table, 207) // io_destroy delete(table, 208) // io_getevents delete(table, 209) // io_submit delete(table, 210) // io_cancel - delete(table, 213) // epoll_create - delete(table, 214) // epoll_ctl_old - delete(table, 215) // epoll_wait_old - delete(table, 216) // remap_file_pages - delete(table, 217) // getdents64 - delete(table, 232) // epoll_wait - delete(table, 233) // epoll_ctl + table[213] = syscalls.Supported("epoll_create", EpollCreate) + table[217] = syscalls.Supported("getdents64", Getdents64) + delete(table, 221) // fdavise64 + table[232] = syscalls.Supported("epoll_wait", EpollWait) + table[233] = syscalls.Supported("epoll_ctl", EpollCtl) + table[235] = syscalls.Supported("utimes", Utimes) delete(table, 253) // inotify_init delete(table, 254) // inotify_add_watch delete(table, 255) // inotify_rm_watch - delete(table, 257) // openat - delete(table, 258) // mkdirat - delete(table, 259) // mknodat - delete(table, 260) // fchownat - delete(table, 261) // futimesat - delete(table, 262) // fstatat - delete(table, 263) // unlinkat - delete(table, 264) // renameat - delete(table, 265) // linkat - delete(table, 266) // symlinkat - delete(table, 267) // readlinkat - delete(table, 268) // fchmodat - delete(table, 269) // faccessat - delete(table, 270) // pselect - delete(table, 271) // ppoll + table[257] = syscalls.Supported("openat", Openat) + table[258] = syscalls.Supported("mkdirat", Mkdirat) + table[259] = syscalls.Supported("mknodat", Mknodat) + table[260] = syscalls.Supported("fchownat", Fchownat) + table[261] = syscalls.Supported("futimens", Futimens) + table[262] = syscalls.Supported("newfstatat", Newfstatat) + table[263] = syscalls.Supported("unlinkat", Unlinkat) + table[264] = syscalls.Supported("renameat", Renameat) + table[265] = syscalls.Supported("linkat", Linkat) + table[266] = syscalls.Supported("symlinkat", Symlinkat) + table[267] = syscalls.Supported("readlinkat", Readlinkat) + table[268] = syscalls.Supported("fchmodat", Fchmodat) + table[269] = syscalls.Supported("faccessat", Faccessat) + table[270] = syscalls.Supported("pselect", Pselect) + table[271] = syscalls.Supported("ppoll", Ppoll) + delete(table, 275) // splice + delete(table, 276) // tee + table[277] = syscalls.Supported("sync_file_range", SyncFileRange) + table[280] = syscalls.Supported("utimensat", Utimensat) + table[281] = syscalls.Supported("epoll_pwait", EpollPwait) + delete(table, 282) // signalfd + delete(table, 283) // timerfd_create + delete(table, 284) // eventfd delete(table, 285) // fallocate - delete(table, 291) // epoll_create1 - delete(table, 292) // dup3 + delete(table, 286) // timerfd_settime + delete(table, 287) // timerfd_gettime + delete(table, 288) // accept4 + delete(table, 289) // signalfd4 + delete(table, 290) // eventfd2 + table[291] = syscalls.Supported("epoll_create1", EpollCreate1) + table[292] = syscalls.Supported("dup3", Dup3) delete(table, 293) // pipe2 delete(table, 294) // inotify_init1 - delete(table, 295) // preadv - delete(table, 296) // pwritev - delete(table, 306) // syncfs - delete(table, 316) // renameat2 + table[295] = syscalls.Supported("preadv", Preadv) + table[296] = syscalls.Supported("pwritev", Pwritev) + delete(table, 299) // recvmmsg + table[306] = syscalls.Supported("syncfs", Syncfs) + delete(table, 307) // sendmmsg + table[316] = syscalls.Supported("renameat2", Renameat2) delete(table, 319) // memfd_create - delete(table, 322) // execveat - delete(table, 327) // preadv2 - delete(table, 328) // pwritev2 - delete(table, 332) // statx + table[322] = syscalls.Supported("execveat", Execveat) + table[327] = syscalls.Supported("preadv2", Preadv2) + table[328] = syscalls.Supported("pwritev2", Pwritev2) + table[332] = syscalls.Supported("statx", Statx) } diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go index 6af5c400f..a6b367468 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_arm64.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build arm64 + package vfs2 import ( diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go new file mode 100644 index 000000000..60a43f0a0 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -0,0 +1,92 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mmap implements Linux syscall mmap(2). +func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + prot := args[2].Int() + flags := args[3].Int() + fd := args[4].Int() + fixed := flags&linux.MAP_FIXED != 0 + private := flags&linux.MAP_PRIVATE != 0 + shared := flags&linux.MAP_SHARED != 0 + anon := flags&linux.MAP_ANONYMOUS != 0 + map32bit := flags&linux.MAP_32BIT != 0 + + // Require exactly one of MAP_PRIVATE and MAP_SHARED. + if private == shared { + return 0, nil, syserror.EINVAL + } + + opts := memmap.MMapOpts{ + Length: args[1].Uint64(), + Offset: args[5].Uint64(), + Addr: args[0].Pointer(), + Fixed: fixed, + Unmap: fixed, + Map32Bit: map32bit, + Private: private, + Perms: usermem.AccessType{ + Read: linux.PROT_READ&prot != 0, + Write: linux.PROT_WRITE&prot != 0, + Execute: linux.PROT_EXEC&prot != 0, + }, + MaxPerms: usermem.AnyAccess, + GrowsDown: linux.MAP_GROWSDOWN&flags != 0, + Precommit: linux.MAP_POPULATE&flags != 0, + } + if linux.MAP_LOCKED&flags != 0 { + opts.MLockMode = memmap.MLockEager + } + defer func() { + if opts.MappingIdentity != nil { + opts.MappingIdentity.DecRef() + } + }() + + if !anon { + // Convert the passed FD to a file reference. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // mmap unconditionally requires that the FD is readable. + if !file.IsReadable() { + return 0, nil, syserror.EACCES + } + // MAP_SHARED requires that the FD be writable for PROT_WRITE. + if shared && !file.IsWritable() { + opts.MaxPerms.Write = false + } + + if err := file.ConfigureMMap(t, &opts); err != nil { + return 0, nil, err + } + } + + rv, err := t.MemoryManager().MMap(t, opts) + return uintptr(rv), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go new file mode 100644 index 000000000..97da6c647 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -0,0 +1,94 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) { + pathname, err := t.CopyInString(addr, linux.PATH_MAX) + if err != nil { + return fspath.Path{}, err + } + return fspath.Parse(pathname), nil +} + +type taskPathOperation struct { + pop vfs.PathOperation + haveStartRef bool +} + +func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { + root := t.FSContext().RootDirectoryVFS2() + start := root + haveStartRef := false + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + root.DecRef() + return taskPathOperation{}, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + haveStartRef = true + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + root.DecRef() + return taskPathOperation{}, syserror.EBADF + } + start = dirfile.VirtualDentry() + start.IncRef() + haveStartRef = true + dirfile.DecRef() + } + } + return taskPathOperation{ + pop: vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, + haveStartRef: haveStartRef, + }, nil +} + +func (tpop *taskPathOperation) Release() { + tpop.pop.Root.DecRef() + if tpop.haveStartRef { + tpop.pop.Start.DecRef() + tpop.haveStartRef = false + } +} + +type shouldAllowEmptyPath bool + +const ( + disallowEmptyPath shouldAllowEmptyPath = false + allowEmptyPath shouldAllowEmptyPath = true +) + +type shouldFollowFinalSymlink bool + +const ( + nofollowFinalSymlink shouldFollowFinalSymlink = false + followFinalSymlink shouldFollowFinalSymlink = true +) diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go new file mode 100644 index 000000000..dbf4882da --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -0,0 +1,584 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/limits" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// fileCap is the maximum allowable files for poll & select. This has no +// equivalent in Linux; it exists in gVisor since allocation failure in Go is +// unrecoverable. +const fileCap = 1024 * 1024 + +// Masks for "readable", "writable", and "exceptional" events as defined by +// select(2). +const ( + // selectReadEvents is analogous to the Linux kernel's + // fs/select.c:POLLIN_SET. + selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR + + // selectWriteEvents is analogous to the Linux kernel's + // fs/select.c:POLLOUT_SET. + selectWriteEvents = linux.POLLOUT | linux.POLLERR + + // selectExceptEvents is analogous to the Linux kernel's + // fs/select.c:POLLEX_SET. + selectExceptEvents = linux.POLLPRI +) + +// pollState tracks the associated file description and waiter of a PollFD. +type pollState struct { + file *vfs.FileDescription + waiter waiter.Entry +} + +// initReadiness gets the current ready mask for the file represented by the FD +// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is +// used to register with the file for event notifications, and a reference to +// the file is stored in "state". +func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { + if pfd.FD < 0 { + pfd.REvents = 0 + return + } + + file := t.GetFileVFS2(pfd.FD) + if file == nil { + pfd.REvents = linux.POLLNVAL + return + } + + if ch == nil { + defer file.DecRef() + } else { + state.file = file + state.waiter, _ = waiter.NewChannelEntry(ch) + file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) + } + + r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) + pfd.REvents = int16(r.ToLinux()) & pfd.Events +} + +// releaseState releases all the pollState in "state". +func releaseState(state []pollState) { + for i := range state { + if state[i].file != nil { + state[i].file.EventUnregister(&state[i].waiter) + state[i].file.DecRef() + } + } +} + +// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" +// when "timeout" is greater than zero. +// +// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or +// positive if interrupted by a signal. +func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { + var ch chan struct{} + if timeout != 0 { + ch = make(chan struct{}, 1) + } + + // Register for event notification in the files involved if we may + // block (timeout not zero). Once we find a file that has a non-zero + // result, we stop registering for events but still go through all files + // to get their ready masks. + state := make([]pollState, len(pfd)) + defer releaseState(state) + n := uintptr(0) + for i := range pfd { + initReadiness(t, &pfd[i], &state[i], ch) + if pfd[i].REvents != 0 { + n++ + ch = nil + } + } + + if timeout == 0 { + return timeout, n, nil + } + + haveTimeout := timeout >= 0 + + for n == 0 { + var err error + // Wait for a notification. + timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) + if err != nil { + if err == syserror.ETIMEDOUT { + err = nil + } + return timeout, 0, err + } + + // We got notified, count how many files are ready. If none, + // then this was a spurious notification, and we just go back + // to sleep with the remaining timeout. + for i := range state { + if state[i].file == nil { + continue + } + + r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) + rl := int16(r.ToLinux()) & pfd[i].Events + if rl != 0 { + pfd[i].REvents = rl + n++ + } + } + } + + return timeout, n, nil +} + +// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. +func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) { + if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { + return nil, syserror.EINVAL + } + + pfd := make([]linux.PollFD, nfds) + if nfds > 0 { + if _, err := t.CopyIn(addr, &pfd); err != nil { + return nil, err + } + } + + return pfd, nil +} + +func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { + pfd, err := copyInPollFDs(t, addr, nfds) + if err != nil { + return timeout, 0, err + } + + // Compatibility warning: Linux adds POLLHUP and POLLERR just before + // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after + // polling, changing event masks here is an application-visible difference. + // (Linux also doesn't copy out event masks at all, only revents.) + for i := range pfd { + pfd[i].Events |= linux.POLLHUP | linux.POLLERR + } + remainingTimeout, n, err := pollBlock(t, pfd, timeout) + err = syserror.ConvertIntr(err, syserror.EINTR) + + // The poll entries are copied out regardless of whether + // any are set or not. This aligns with the Linux behavior. + if nfds > 0 && err == nil { + if _, err := t.CopyOut(addr, pfd); err != nil { + return remainingTimeout, 0, err + } + } + + return remainingTimeout, n, err +} + +// CopyInFDSet copies an fd set from select(2)/pselect(2). +func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { + set := make([]byte, nBytes) + + if addr != 0 { + if _, err := t.CopyIn(addr, &set); err != nil { + return nil, err + } + // If we only use part of the last byte, mask out the extraneous bits. + // + // N.B. This only works on little-endian architectures. + if nBitsInLastPartialByte != 0 { + set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte + } + } + return set, nil +} + +func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) { + if nfds < 0 || nfds > fileCap { + return 0, syserror.EINVAL + } + + // Calculate the size of the fd sets (one bit per fd). + nBytes := (nfds + 7) / 8 + nBitsInLastPartialByte := nfds % 8 + + // Capture all the provided input vectors. + r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) + if err != nil { + return 0, err + } + + // Count how many FDs are actually being requested so that we can build + // a PollFD array. + fdCount := 0 + for i := 0; i < nBytes; i++ { + v := r[i] | w[i] | e[i] + for v != 0 { + v &= (v - 1) + fdCount++ + } + } + + // Build the PollFD array. + pfd := make([]linux.PollFD, 0, fdCount) + var fd int32 + for i := 0; i < nBytes; i++ { + rV, wV, eV := r[i], w[i], e[i] + v := rV | wV | eV + m := byte(1) + for j := 0; j < 8; j++ { + if (v & m) != 0 { + // Make sure the fd is valid and decrement the reference + // immediately to ensure we don't leak. Note, another thread + // might be about to close fd. This is racy, but that's + // OK. Linux is racy in the same way. + file := t.GetFileVFS2(fd) + if file == nil { + return 0, syserror.EBADF + } + file.DecRef() + + var mask int16 + if (rV & m) != 0 { + mask |= selectReadEvents + } + + if (wV & m) != 0 { + mask |= selectWriteEvents + } + + if (eV & m) != 0 { + mask |= selectExceptEvents + } + + pfd = append(pfd, linux.PollFD{ + FD: fd, + Events: mask, + }) + } + + fd++ + m <<= 1 + } + } + + // Do the syscall, then count the number of bits set. + if _, _, err = pollBlock(t, pfd, timeout); err != nil { + return 0, syserror.ConvertIntr(err, syserror.EINTR) + } + + // r, w, and e are currently event mask bitsets; unset bits corresponding + // to events that *didn't* occur. + bitSetCount := uintptr(0) + for idx := range pfd { + events := pfd[idx].REvents + i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) + m := byte(1) << j + if r[i]&m != 0 { + if (events & selectReadEvents) != 0 { + bitSetCount++ + } else { + r[i] &^= m + } + } + if w[i]&m != 0 { + if (events & selectWriteEvents) != 0 { + bitSetCount++ + } else { + w[i] &^= m + } + } + if e[i]&m != 0 { + if (events & selectExceptEvents) != 0 { + bitSetCount++ + } else { + e[i] &^= m + } + } + } + + // Copy updated vectors back. + if readFDs != 0 { + if _, err := t.CopyOut(readFDs, r); err != nil { + return 0, err + } + } + + if writeFDs != 0 { + if _, err := t.CopyOut(writeFDs, w); err != nil { + return 0, err + } + } + + if exceptFDs != 0 { + if _, err := t.CopyOut(exceptFDs, e); err != nil { + return 0, err + } + } + + return bitSetCount, nil +} + +// timeoutRemaining returns the amount of time remaining for the specified +// timeout or 0 if it has elapsed. +// +// startNs must be from CLOCK_MONOTONIC. +func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { + now := t.Kernel().MonotonicClock().Now() + remaining := timeout - now.Sub(startNs) + if remaining < 0 { + remaining = 0 + } + return remaining +} + +// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) + return tsRemaining.CopyOut(t, timespecAddr) +} + +// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. +// +// startNs must be from CLOCK_MONOTONIC. +func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error { + if timeout <= 0 { + return nil + } + remaining := timeoutRemaining(t, startNs, timeout) + tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) + return tvRemaining.CopyOut(t, timevalAddr) +} + +// pollRestartBlock encapsulates the state required to restart poll(2) via +// restart_syscall(2). +// +// +stateify savable +type pollRestartBlock struct { + pfdAddr usermem.Addr + nfds uint + timeout time.Duration +} + +// Restart implements kernel.SyscallRestartBlock.Restart. +func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { + return poll(t, p.pfdAddr, p.nfds, p.timeout) +} + +func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) { + remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) + // On an interrupt poll(2) is restarted with the remaining timeout. + if err == syserror.EINTR { + t.SetSyscallRestartBlock(&pollRestartBlock{ + pfdAddr: pfdAddr, + nfds: nfds, + timeout: remainingTimeout, + }) + return 0, kernel.ERESTART_RESTARTBLOCK + } + return n, err +} + +// Poll implements linux syscall poll(2). +func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timeout := time.Duration(args[2].Int()) * time.Millisecond + n, err := poll(t, pfdAddr, nfds, timeout) + return n, nil, err +} + +// Ppoll implements linux syscall ppoll(2). +func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pfdAddr := args[0].Pointer() + nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. + timespecAddr := args[2].Pointer() + maskAddr := args[3].Pointer() + maskSize := uint(args[4].Uint()) + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { + return 0, nil, err + } + + _, n, err := doPoll(t, pfdAddr, nfds, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // doPoll returns EINTR if interrupted, but ppoll is normally restartable + // if interrupted by something other than a signal handled by the + // application (i.e. returns ERESTARTNOHAND). However, if + // copyOutTimespecRemaining failed, then the restarted ppoll would use the + // wrong timeout, so the error should be left as EINTR. + // + // Note that this means that if err is nil but copyErr is not, copyErr is + // ignored. This is consistent with Linux. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Select implements linux syscall select(2). +func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timevalAddr := args[4].Pointer() + + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timevalAddr != 0 { + var timeval linux.Timeval + if err := timeval.CopyIn(t, timevalAddr); err != nil { + return 0, nil, err + } + if timeval.Sec < 0 || timeval.Usec < 0 { + return 0, nil, syserror.EINVAL + } + timeout = time.Duration(timeval.ToNsecCapped()) + } + startNs := t.Kernel().MonotonicClock().Now() + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// Pselect implements linux syscall pselect(2). +func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + nfds := int(args[0].Int()) // select(2) uses an int. + readFDs := args[1].Pointer() + writeFDs := args[2].Pointer() + exceptFDs := args[3].Pointer() + timespecAddr := args[4].Pointer() + maskWithSizeAddr := args[5].Pointer() + + timeout, err := copyTimespecInToDuration(t, timespecAddr) + if err != nil { + return 0, nil, err + } + + var startNs ktime.Time + if timeout > 0 { + startNs = t.Kernel().MonotonicClock().Now() + } + + if maskWithSizeAddr != 0 { + if t.Arch().Width() != 8 { + panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) + } + var maskStruct sigSetWithSize + if err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { + return 0, nil, err + } + if err := setTempSignalSet(t, usermem.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { + return 0, nil, err + } + } + + n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) + copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) + // See comment in Ppoll. + if err == syserror.EINTR && copyErr == nil { + err = kernel.ERESTARTNOHAND + } + return n, nil, err +} + +// +marshal +type sigSetWithSize struct { + sigsetAddr uint64 + sizeofSigset uint64 +} + +// copyTimespecInToDuration copies a Timespec from the untrusted app range, +// validates it and converts it to a Duration. +// +// If the Timespec is larger than what can be represented in a Duration, the +// returned value is the maximum that Duration will allow. +// +// If timespecAddr is NULL, the returned value is negative. +func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) { + // Use a negative Duration to indicate "no timeout". + timeout := time.Duration(-1) + if timespecAddr != 0 { + var timespec linux.Timespec + if err := timespec.CopyIn(t, timespecAddr); err != nil { + return 0, err + } + if !timespec.Valid() { + return 0, syserror.EINVAL + } + timeout = time.Duration(timespec.ToNsecCapped()) + } + return timeout, nil +} + +func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error { + if maskAddr == 0 { + return nil + } + if maskSize != linux.SignalSetSize { + return syserror.EINVAL + } + var mask linux.SignalSet + if err := mask.CopyIn(t, maskAddr); err != nil { + return err + } + mask &^= kernel.UnblockableSignals + oldmask := t.SignalMask() + t.SetSignalMask(mask) + t.SetSavedSignalMask(oldmask) + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go new file mode 100644 index 000000000..35f6308d6 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -0,0 +1,511 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr + eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr +) + +// Read implements Linux syscall read(2). +func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) +} + +// Readv implements Linux syscall readv(2). +func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := read(t, file, dst, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "readv", file) +} + +func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + n, err := file.Read(t, dst, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Read(t, dst, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pread64 implements Linux syscall pread64(2). +func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pread64", file) +} + +// Preadv implements Linux syscall preadv(2). +func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv", file) +} + +// Preadv2 implements Linux syscall preadv2(2). +func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the destination of the read. + dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.ReadOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = read(t, file, dst, opts) + } else { + n, err = pread(t, file, dst, offset, opts) + } + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "preadv2", file) +} + +func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + n, err := file.PRead(t, dst, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskRead) + + total := n + for { + // Shorten dst to reflect bytes previously read. + dst = dst.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.PRead(t, dst, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Write implements Linux syscall write(2). +func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "write", file) +} + +// Writev implements Linux syscall writev(2). +func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := write(t, file, src, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "writev", file) +} + +func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + n, err := file.Write(t, src, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.Write(t, src, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Pwrite64 implements Linux syscall pwrite64(2). +func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + size := args[2].SizeT() + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Check that the size is legitimate. + si := int(size) + if si < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwrite64", file) +} + +// Pwritev implements Linux syscall pwritev(2). +func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < 0 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) + t.IOUsage().AccountReadSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file) +} + +// Pwritev2 implements Linux syscall pwritev2(2). +func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // While the glibc signature is + // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) + // the actual syscall + // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) + // splits the offset argument into a high/low value for compatibility with + // 32-bit architectures. The flags argument is the 6th argument (index 5). + fd := args[0].Int() + addr := args[1].Pointer() + iovcnt := int(args[2].Int()) + offset := args[3].Int64() + flags := args[5].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Check that the offset is legitimate. + if offset < -1 { + return 0, nil, syserror.EINVAL + } + + // Get the source of the write. + src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, nil, err + } + + opts := vfs.WriteOptions{ + Flags: uint32(flags), + } + var n int64 + if offset == -1 { + n, err = write(t, file, src, opts) + } else { + n, err = pwrite(t, file, src, offset, opts) + } + t.IOUsage().AccountWriteSyscall(n) + return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev2", file) +} + +func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, err := file.PWrite(t, src, offset, opts) + if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 { + return n, err + } + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&w, eventMaskWrite) + + total := n + for { + // Shorten src to reflect bytes previously written. + src = src.DropFirst(int(n)) + + // Issue the request and break out if it completes with anything other than + // "would block". + n, err := file.PWrite(t, src, offset+total, opts) + total += n + if err != syserror.ErrWouldBlock { + break + } + if err := t.Block(ch); err != nil { + break + } + } + file.EventUnregister(&w) + + return total, err +} + +// Lseek implements Linux syscall lseek(2). +func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + whence := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + newoff, err := file.Seek(t, offset, whence) + return uintptr(newoff), nil, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go new file mode 100644 index 000000000..9250659ff --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -0,0 +1,380 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX + +// Chmod implements Linux syscall chmod(2). +func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + mode := args[1].ModeT() + return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) +} + +// Fchmodat implements Linux syscall fchmodat(2). +func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + mode := args[2].ModeT() + return 0, nil, fchmodat(t, dirfd, pathAddr, mode) +} + +func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Fchmod implements Linux syscall fchmod(2). +func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].ModeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE, + Mode: uint16(mode & chmodMask), + }, + }) +} + +// Chown implements Linux syscall chown(2). +func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) +} + +// Lchown implements Linux syscall lchown(2). +func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + owner := args[1].Int() + group := args[2].Int() + return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) +} + +// Fchownat implements Linux syscall fchownat(2). +func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + owner := args[2].Int() + group := args[3].Int() + flags := args[4].Int() + return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) +} + +func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return err + } + + return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) +} + +func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { + userns := t.UserNamespace() + if owner != -1 { + kuid := userns.MapToKUID(auth.UID(owner)) + if !kuid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_UID + opts.Stat.UID = uint32(kuid) + } + if group != -1 { + kgid := userns.MapToKGID(auth.GID(group)) + if !kgid.Ok() { + return syserror.EINVAL + } + opts.Stat.Mask |= linux.STATX_GID + opts.Stat.GID = uint32(kgid) + } + return nil +} + +// Fchown implements Linux syscall fchown(2). +func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + owner := args[1].Int() + group := args[2].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { + return 0, nil, err + } + return 0, nil, file.SetStat(t, opts) +} + +// Truncate implements Linux syscall truncate(2). +func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Ftruncate implements Linux syscall ftruncate(2). +func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + length := args[1].Int64() + + if length < 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_SIZE, + Size: uint64(length), + }, + }) +} + +// Utime implements Linux syscall utime(2). +func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times linux.Utime + if err := times.CopyIn(t, timesAddr); err != nil { + return 0, nil, err + } + opts.Stat.Atime.Sec = times.Actime + opts.Stat.Mtime.Sec = times.Modtime + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimes implements Linux syscall utimes(2). +func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + timesAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + opts := vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_ATIME | linux.STATX_MTIME, + }, + } + if timesAddr == 0 { + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + } else { + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return 0, nil, err + } + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + } + + return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Utimensat implements Linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts) +} + +// Futimens implements Linux syscall futimens(2). +func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + timesAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + return 0, nil, err + } + + return 0, nil, file.SetStat(t, opts) +} + +func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil + } + var times [2]linux.Timespec + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_ATIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Nsec), + } + } + if times[1].Nsec != linux.UTIME_OMIT { + opts.Stat.Mask |= linux.STATX_MTIME + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Nsec), + } + } + return nil +} + +func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && !bool(shouldAllowEmptyPath) { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.SetStat() instead of + // VirtualFilesystem.SetStatAt(), since the former may be able + // to use opened file state to expedite the SetStat. + err := dirfile.SetStat(t, *opts) + dirfile.DecRef() + return err + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: bool(shouldFollowFinalSymlink), + }, opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go new file mode 100644 index 000000000..dca8d7011 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -0,0 +1,346 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Stat implements Linux syscall stat(2). +func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */) +} + +// Lstat implements Linux syscall lstat(2). +func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + statAddr := args[1].Pointer() + return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW) +} + +// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2). +func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + statAddr := args[2].Pointer() + flags := args[3].Int() + return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) +} + +func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return stat.CopyOut(t, statAddr) +} + +// This takes both input and output as pointer arguments to avoid copying large +// structs. +func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) { + // Linux just copies fields from struct kstat without regard to struct + // kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too. + userns := t.UserNamespace() + *stat = linux.Stat{ + Dev: uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)), + Ino: statx.Ino, + Nlink: uint64(statx.Nlink), + Mode: uint32(statx.Mode), + UID: uint32(auth.KUID(statx.UID).In(userns).OrOverflow()), + GID: uint32(auth.KGID(statx.GID).In(userns).OrOverflow()), + Rdev: uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)), + Size: int64(statx.Size), + Blksize: int64(statx.Blksize), + Blocks: int64(statx.Blocks), + ATime: timespecFromStatxTimestamp(statx.Atime), + MTime: timespecFromStatxTimestamp(statx.Mtime), + CTime: timespecFromStatxTimestamp(statx.Ctime), + } +} + +func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { + return linux.Timespec{ + Sec: sxts.Sec, + Nsec: int64(sxts.Nsec), + } +} + +// Fstat implements Linux syscall fstat(2). +func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + statAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + statx, err := file.Stat(t, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + return 0, nil, err + } + var stat linux.Stat + convertStatxToUserStat(t, &statx, &stat) + return 0, nil, stat.CopyOut(t, statAddr) +} + +// Statx implements Linux syscall statx(2). +func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + flags := args[2].Int() + mask := args[3].Uint() + statxAddr := args[4].Pointer() + + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + return 0, nil, syserror.EINVAL + } + + opts := vfs.StatOptions{ + Mask: mask, + Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + if !path.Absolute { + if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { + return 0, nil, syserror.ENOENT + } + if dirfd == linux.AT_FDCWD { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } else { + dirfile := t.GetFileVFS2(dirfd) + if dirfile == nil { + return 0, nil, syserror.EBADF + } + if !path.HasComponents() { + // Use FileDescription.Stat() instead of + // VirtualFilesystem.StatAt() for statx(fd, ""), since the + // former may be able to use opened file state to expedite the + // Stat. + statx, err := dirfile.Stat(t, opts) + dirfile.DecRef() + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) + } + start = dirfile.VirtualDentry() + start.IncRef() + defer start.DecRef() + dirfile.DecRef() + } + } + + statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, + }, &opts) + if err != nil { + return 0, nil, err + } + userifyStatx(t, &statx) + return 0, nil, statx.CopyOut(t, statxAddr) +} + +func userifyStatx(t *kernel.Task, statx *linux.Statx) { + userns := t.UserNamespace() + statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) + statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) +} + +// Readlink implements Linux syscall readlink(2). +func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + size := args[2].SizeT() + return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) +} + +// Access implements Linux syscall access(2). +func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Faccessat implements Linux syscall access(2). +func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // FIXME(jamieliu): actually implement + return 0, nil, nil +} + +// Readlinkat implements Linux syscall mknodat(2). +func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + bufAddr := args[2].Pointer() + size := args[3].SizeT() + return readlinkat(t, dirfd, pathAddr, bufAddr, size) +} + +func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { + if int(size) <= 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + // "Since Linux 2.6.39, pathname can be an empty string, in which case the + // call operates on the symbolic link referred to by dirfd ..." - + // readlinkat(2) + tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + if len(target) > int(size) { + target = target[:size] + } + n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Statfs implements Linux syscall statfs(2). +func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + bufAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} + +// Fstatfs implements Linux syscall fstatfs(2). +func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + bufAddr := args[1].Pointer() + + tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + + return 0, nil, statfs.CopyOut(t, bufAddr) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go new file mode 100644 index 000000000..365250b0b --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -0,0 +1,87 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements Linux syscall sync(2). +func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) +} + +// Syncfs implements Linux syscall syncfs(2). +func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.SyncFS(t) +} + +// Fsync implements Linux syscall fsync(2). +func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + return 0, nil, file.Sync(t) +} + +// Fdatasync implements Linux syscall fdatasync(2). +func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. + return Fsync(t, args) +} + +// SyncFileRange implements Linux syscall sync_file_range(2). +func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + offset := args[1].Int64() + nbytes := args[2].Int64() + flags := args[3].Uint() + + if offset < 0 { + return 0, nil, syserror.EINVAL + } + if nbytes < 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of + // [offset, offset+nbytes). + return 0, nil, file.Sync(t) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_read.go b/pkg/sentry/syscalls/linux/vfs2/sys_read.go deleted file mode 100644 index 7667524c7..000000000 --- a/pkg/sentry/syscalls/linux/vfs2/sys_read.go +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs2 - -import ( - "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/kernel" - "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -const ( - // EventMaskRead contains events that can be triggered on reads. - EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr -) - -// Read implements linux syscall read(2). Note that we try to get a buffer that -// is exactly the size requested because some applications like qemu expect -// they can do large reads all at once. Bug for bug. Same for other read -// calls below. -func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - addr := args[1].Pointer() - size := args[2].SizeT() - - file := t.GetFileVFS2(fd) - if file == nil { - return 0, nil, syserror.EBADF - } - defer file.DecRef() - - // Check that the size is legitimate. - si := int(size) - if si < 0 { - return 0, nil, syserror.EINVAL - } - - // Get the destination of the read. - dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ - AddressSpaceActive: true, - }) - if err != nil { - return 0, nil, err - } - - n, err := read(t, file, dst, vfs.ReadOptions{}) - t.IOUsage().AccountReadSyscall(n) - return uintptr(n), nil, linux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "read", file) -} - -func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - n, err := file.Read(t, dst, opts) - if err != syserror.ErrWouldBlock { - return n, err - } - - // Register for notifications. - w, ch := waiter.NewChannelEntry(nil) - file.EventRegister(&w, EventMaskRead) - - total := n - for { - // Shorten dst to reflect bytes previously read. - dst = dst.DropFirst(int(n)) - - // Issue the request and break out if it completes with anything other than - // "would block". - n, err := file.Read(t, dst, opts) - total += n - if err != syserror.ErrWouldBlock { - break - } - if err := t.Block(ch); err != nil { - break - } - } - file.EventUnregister(&w) - - return total, err -} diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go new file mode 100644 index 000000000..89e9ff4d7 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -0,0 +1,353 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "bytes" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/gohacks" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Listxattr implements Linux syscall listxattr(2). +func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, followFinalSymlink) +} + +// Llistxattr implements Linux syscall llistxattr(2). +func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return listxattr(t, args, nofollowFinalSymlink) +} + +func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Flistxattr implements Linux syscall flistxattr(2). +func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + listAddr := args[1].Pointer() + size := args[2].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + names, err := file.Listxattr(t) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrNameList(t, listAddr, size, names) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Getxattr implements Linux syscall getxattr(2). +func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, followFinalSymlink) +} + +// Lgetxattr implements Linux syscall lgetxattr(2). +func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return getxattr(t, args, nofollowFinalSymlink) +} + +func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Fgetxattr implements Linux syscall fgetxattr(2). +func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + value, err := file.Getxattr(t, name) + if err != nil { + return 0, nil, err + } + n, err := copyOutXattrValue(t, valueAddr, size, value) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Setxattr implements Linux syscall setxattr(2). +func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, followFinalSymlink) +} + +// Lsetxattr implements Linux syscall lsetxattr(2). +func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, setxattr(t, args, nofollowFinalSymlink) +} + +func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return err + } + + return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Fsetxattr implements Linux syscall fsetxattr(2). +func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + valueAddr := args[2].Pointer() + size := args[3].SizeT() + flags := args[4].Int() + + if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + value, err := copyInXattrValue(t, valueAddr, size) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{ + Name: name, + Value: value, + Flags: uint32(flags), + }) +} + +// Removexattr implements Linux syscall removexattr(2). +func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, followFinalSymlink) +} + +// Lremovexattr implements Linux syscall lremovexattr(2). +func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + return 0, nil, removexattr(t, args, nofollowFinalSymlink) +} + +func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { + pathAddr := args[0].Pointer() + nameAddr := args[1].Pointer() + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) + if err != nil { + return err + } + defer tpop.Release() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return err + } + + return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, name) +} + +// Fremovexattr implements Linux syscall fremovexattr(2). +func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + nameAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + name, err := copyInXattrName(t, nameAddr) + if err != nil { + return 0, nil, err + } + + return 0, nil, file.Removexattr(t, name) +} + +func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) { + name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) + if err != nil { + if err == syserror.ENAMETOOLONG { + return "", syserror.ERANGE + } + return "", err + } + if len(name) == 0 { + return "", syserror.ERANGE + } + return name, nil +} + +func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) { + if size > linux.XATTR_LIST_MAX { + size = linux.XATTR_LIST_MAX + } + var buf bytes.Buffer + for _, name := range names { + buf.WriteString(name) + buf.WriteByte(0) + } + if size == 0 { + // Return the size that would be required to accomodate the list. + return buf.Len(), nil + } + if buf.Len() > int(size) { + if size >= linux.XATTR_LIST_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(listAddr, buf.Bytes()) +} + +func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) { + if size > linux.XATTR_SIZE_MAX { + return "", syserror.E2BIG + } + buf := make([]byte, size) + if _, err := t.CopyInBytes(valueAddr, buf); err != nil { + return "", err + } + return gohacks.StringFromImmutableBytes(buf), nil +} + +func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) { + if size > linux.XATTR_SIZE_MAX { + size = linux.XATTR_SIZE_MAX + } + if size == 0 { + // Return the size that would be required to accomodate the value. + return len(value), nil + } + if len(value) > int(size) { + if size >= linux.XATTR_SIZE_MAX { + return 0, syserror.E2BIG + } + return 0, syserror.ERANGE + } + return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 0b4f18ab5..07c8383e6 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -43,6 +43,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/gohacks", "//pkg/log", "//pkg/sentry/arch", "//pkg/sentry/fs/lock", diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index eed41139b..3da45d744 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin // Add epi to file.epolls so that it is removed when the last // FileDescription reference is dropped. file.epollMu.Lock() + if file.epolls == nil { + file.epolls = make(map[*epollInterest]struct{}) + } file.epolls[epi] = struct{}{} file.epollMu.Unlock() diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index 1fe766a44..bc7581698 100644 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -26,6 +26,7 @@ import ( "sync/atomic" "unsafe" + "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sync" ) @@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer { // Lookup may be called even if there are concurrent mutators of mt. func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} - hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) loop: for { @@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr //go:linkname rand32 runtime.fastrand func rand32() uint32 - -// This is copy/pasted from runtime.noescape(), and is needed because arguments -// apparently escape from all functions defined by linkname. -// -//go:nosplit -func noescape(p unsafe.Pointer) unsafe.Pointer { - x := uintptr(p) - return unsafe.Pointer(x ^ 0) -} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8a0b382f6..eb4ebb511 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() { rp.pit = next } else { // at end of path segment, continue with next one rp.curPart-- - rp.pit = rp.parts[rp.curPart-1] + rp.pit = rp.parts[rp.curPart] } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 8f29031b2..73f8043be 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -385,15 +385,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { + fd.DecRef() return nil, err } - if stat.Mask&linux.STATX_TYPE != 0 { - // This shouldn't happen, but if type can't be retrieved, file can't - // be executed. - return nil, syserror.EACCES - } - if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular { - ctx.Infof("%q is not a regular file: %v", pop.Path, t) + if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { + fd.DecRef() return nil, syserror.EACCES } } diff --git a/pkg/usermem/BUILD b/pkg/usermem/BUILD index ff8b9e91a..6c9ada9c7 100644 --- a/pkg/usermem/BUILD +++ b/pkg/usermem/BUILD @@ -25,7 +25,6 @@ go_library( "bytes_io_unsafe.go", "usermem.go", "usermem_arm64.go", - "usermem_unsafe.go", "usermem_x86.go", ], visibility = ["//:sandbox"], @@ -33,6 +32,7 @@ go_library( "//pkg/atomicbitops", "//pkg/binary", "//pkg/context", + "//pkg/gohacks", "//pkg/log", "//pkg/safemem", "//pkg/syserror", diff --git a/pkg/usermem/usermem.go b/pkg/usermem/usermem.go index 71fd4e155..d2f4403b0 100644 --- a/pkg/usermem/usermem.go +++ b/pkg/usermem/usermem.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/syserror" ) @@ -251,7 +252,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt } end, ok := addr.AddLength(uint64(readlen)) if !ok { - return stringFromImmutableBytes(buf[:done]), syserror.EFAULT + return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT } // Shorten the read to avoid crossing page boundaries, since faulting // in a page unnecessarily is expensive. This also ensures that partial @@ -272,16 +273,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt // Look for the terminating zero byte, which may have occurred before // hitting err. if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 { - return stringFromImmutableBytes(buf[:done+i]), nil + return gohacks.StringFromImmutableBytes(buf[:done+i]), nil } done += n if err != nil { - return stringFromImmutableBytes(buf[:done]), err + return gohacks.StringFromImmutableBytes(buf[:done]), err } addr = end } - return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG + return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG } // CopyOutVec copies bytes from src to the memory mapped at ars in uio. The diff --git a/pkg/usermem/usermem_unsafe.go b/pkg/usermem/usermem_unsafe.go deleted file mode 100644 index 876783e78..000000000 --- a/pkg/usermem/usermem_unsafe.go +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package usermem - -import ( - "unsafe" -) - -// stringFromImmutableBytes is equivalent to string(bs), except that it never -// copies even if escape analysis can't prove that bs does not escape. This is -// only valid if bs is never mutated after stringFromImmutableBytes returns. -func stringFromImmutableBytes(bs []byte) string { - // Compare strings.Builder.String(). - return *(*string)(unsafe.Pointer(&bs)) -} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index c69f4c602..a4627905e 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -229,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_NANOSLEEP: {}, syscall.SYS_PPOLL: {}, syscall.SYS_PREAD64: {}, + syscall.SYS_PREADV: {}, syscall.SYS_PWRITE64: {}, + syscall.SYS_PWRITEV: {}, syscall.SYS_READ: {}, syscall.SYS_RECVMSG: []seccomp.Rule{ { -- cgit v1.2.3 From 0f8a9e362337ee684042331c5bf24a3cb43d6fc4 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 28 Feb 2020 10:13:59 -0800 Subject: Change dup2 call to dup3 We changed syscalls to allow dup3 for ARM64. Updates #1198 PiperOrigin-RevId: 297870816 --- pkg/sentry/fsimpl/gofer/gofer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index d00850e25..c4a8f0b38 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1045,13 +1045,13 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // using the old file descriptor, preventing us from safely // closing it. We could handle this by invalidating existing // memmap.Translations, but this is expensive. Instead, use - // dup2() to make the old file descriptor refer to the new file + // dup3 to make the old file descriptor refer to the new file // description, then close the new file descriptor (which is no // longer needed). Racing callers may use the old or new file // description, but this doesn't matter since they refer to the // same file (unless d.fs.opts.overlayfsStaleRead is true, // which we handle separately). - if err := syscall.Dup2(int(h.fd), int(d.handle.fd)); err != nil { + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) h.close(ctx) -- cgit v1.2.3 From 122d47aed17abf4301596e19fc8ac9cdad8118d9 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 3 Mar 2020 15:27:23 -0800 Subject: Update cached file size when cache is skipped gofer.dentryReadWriter.WriteFromBlocks was not updating gofer.dentry.size after a write operation that skips the cache. Updates #1198 PiperOrigin-RevId: 298708646 --- pkg/sentry/fsimpl/gofer/regular_file.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 54c1031a7..e95209661 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -361,8 +361,15 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro rw.d.handleMu.RLock() if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) - rw.d.handleMu.RUnlock() rw.off += n + rw.d.dataMu.Lock() + if rw.off > rw.d.size { + atomic.StoreUint64(&rw.d.size, rw.off) + // The remote file's size will implicitly be extended to the correct + // value when we write back to it. + } + rw.d.dataMu.Unlock() + rw.d.handleMu.RUnlock() return n, err } -- cgit v1.2.3 From da48fc6cca23a38faef51c5b5f8ae609940773a0 Mon Sep 17 00:00:00 2001 From: Ian Lewis Date: Thu, 5 Mar 2020 18:21:39 -0800 Subject: Stub oom_score_adj and oom_score. Adds an oom_score_adj and oom_score proc file stub. oom_score_adj accepts writes of values -1000 to 1000 and persists the value with the task. New tasks inherit the parent's oom_score_adj. oom_score is a read-only stub that always returns the value '0'. Issue #202 PiperOrigin-RevId: 299245355 --- pkg/sentry/fs/proc/task.go | 126 ++++++++++++++++++++++++++----- pkg/sentry/fsimpl/proc/task.go | 12 +-- pkg/sentry/fsimpl/proc/task_files.go | 43 +++++++++++ pkg/sentry/fsimpl/proc/tasks_test.go | 32 ++++---- pkg/sentry/kernel/task.go | 33 ++++++++ pkg/sentry/kernel/task_clone.go | 6 ++ pkg/sentry/kernel/task_start.go | 4 + test/syscalls/BUILD | 8 +- test/syscalls/linux/BUILD | 13 ++++ test/syscalls/linux/proc.cc | 21 ++++++ test/syscalls/linux/proc_pid_oomscore.cc | 72 ++++++++++++++++++ 11 files changed, 330 insertions(+), 40 deletions(-) create mode 100644 test/syscalls/linux/proc_pid_oomscore.cc (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 8ab8d8a02..4e9b0fc00 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -72,24 +72,26 @@ var _ fs.InodeOperations = (*taskDir)(nil) // newTaskDir creates a new proc task entry. func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bool) *fs.Inode { contents := map[string]*fs.Inode{ - "auxv": newAuxvec(t, msrc), - "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), - "comm": newComm(t, msrc), - "environ": newExecArgInode(t, msrc, environExecArg), - "exe": newExe(t, msrc), - "fd": newFdDir(t, msrc), - "fdinfo": newFdInfoDir(t, msrc), - "gid_map": newGIDMap(t, msrc), - "io": newIO(t, msrc, isThreadGroup), - "maps": newMaps(t, msrc), - "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "ns": newNamespaceDir(t, msrc), - "smaps": newSmaps(t, msrc), - "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), - "statm": newStatm(t, msrc), - "status": newStatus(t, msrc, p.pidns), - "uid_map": newUIDMap(t, msrc), + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgInode(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgInode(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + "io": newIO(t, msrc, isThreadGroup), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "ns": newNamespaceDir(t, msrc), + "oom_score": newOOMScore(t, msrc), + "oom_score_adj": newOOMScoreAdj(t, msrc), + "smaps": newSmaps(t, msrc), + "stat": newTaskStat(t, msrc, isThreadGroup, p.pidns), + "statm": newStatm(t, msrc), + "status": newStatus(t, msrc, p.pidns), + "uid_map": newUIDMap(t, msrc), } if isThreadGroup { contents["task"] = p.newSubtasks(t, msrc) @@ -796,4 +798,92 @@ func (f *auxvecFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequenc return int64(n), err } +// newOOMScore returns a oom_score file. It is a stub that always returns 0. +// TODO(gvisor.dev/issue/1967) +func newOOMScore(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newStaticProcInode(t, msrc, []byte("0\n")) +} + +// oomScoreAdj is a file containing the oom_score adjustment for a task. +// +// +stateify savable +type oomScoreAdj struct { + fsutil.SimpleFileInode + + t *kernel.Task +} + +// +stateify savable +type oomScoreAdjFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + t *kernel.Task +} + +// newOOMScoreAdj returns a oom_score_adj file. +func newOOMScoreAdj(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + i := &oomScoreAdj{ + SimpleFileInode: *fsutil.NewSimpleFileInode(t, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + t: t, + } + return newProcInode(t, i, msrc, fs.SpecialFile, t) +} + +// Truncate implements fs.InodeOperations.Truncate. Truncate is called when +// O_TRUNC is specified for any kind of existing Dirent but is not called via +// (f)truncate for proc files. +func (*oomScoreAdj) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &oomScoreAdjFile{t: o.t}), nil +} + +// Read implements fs.FileOperations.Read. +func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + adj, err := f.t.OOMScoreAdj() + if err != nil { + return 0, err + } + adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n") + n, err := dst.CopyOut(ctx, adjBytes) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := f.t.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} + // LINT.ThenChange(../../fsimpl/proc/task.go|../../fsimpl/proc/task_files.go) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 2d814668a..18e5cd6f6 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -62,11 +62,13 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), }), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), + "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), + "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index efd3b3453..5a231ac86 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -525,3 +525,46 @@ func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) return nil } + +// oomScoreAdj is a stub of the /proc//oom_score_adj file. +// +// +stateify savable +type oomScoreAdj struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { + adj, err := o.task.OOMScoreAdj() + if err != nil { + return err + } + fmt.Fprintf(buf, "%d\n", adj) + return nil +} + +// Write implements vfs.WritableDynamicBytesSource.Write. +func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit input size so as not to impact performance if input size is large. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + + if err := o.task.SetOOMScoreAdj(v); err != nil { + return 0, err + } + + return n, nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index c5d531fe0..0eb401619 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -63,21 +63,23 @@ var ( "thread-self": threadSelfLink.NextOff, } taskStaticFiles = map[string]testutil.DirentType{ - "auxv": linux.DT_REG, - "cgroup": linux.DT_REG, - "cmdline": linux.DT_REG, - "comm": linux.DT_REG, - "environ": linux.DT_REG, - "gid_map": linux.DT_REG, - "io": linux.DT_REG, - "maps": linux.DT_REG, - "ns": linux.DT_DIR, - "smaps": linux.DT_REG, - "stat": linux.DT_REG, - "statm": linux.DT_REG, - "status": linux.DT_REG, - "task": linux.DT_DIR, - "uid_map": linux.DT_REG, + "auxv": linux.DT_REG, + "cgroup": linux.DT_REG, + "cmdline": linux.DT_REG, + "comm": linux.DT_REG, + "environ": linux.DT_REG, + "gid_map": linux.DT_REG, + "io": linux.DT_REG, + "maps": linux.DT_REG, + "ns": linux.DT_DIR, + "oom_score": linux.DT_REG, + "oom_score_adj": linux.DT_REG, + "smaps": linux.DT_REG, + "stat": linux.DT_REG, + "statm": linux.DT_REG, + "status": linux.DT_REG, + "task": linux.DT_DIR, + "uid_map": linux.DT_REG, } ) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 2cee2e6ed..c0dbbe890 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -37,6 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) @@ -554,6 +555,13 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time + + // oomScoreAdj is the task's OOM score adjustment. This is currently not + // used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is protected by mu, and is owned by the task goroutine. + oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -847,3 +855,28 @@ func (t *Task) AbstractSockets() *AbstractSocketNamespace { func (t *Task) ContainerID() string { return t.containerID } + +// OOMScoreAdj gets the task's OOM score adjustment. +func (t *Task) OOMScoreAdj() (int32, error) { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return 0, syserror.ESRCH + } + return t.oomScoreAdj, nil +} + +// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be +// between -1000 and 1000 inclusive. +func (t *Task) SetOOMScoreAdj(adj int32) error { + t.mu.Lock() + defer t.mu.Unlock() + if t.ExitState() == TaskExitDead { + return syserror.ESRCH + } + if adj > 1000 || adj < -1000 { + return syserror.EINVAL + } + t.oomScoreAdj = adj + return nil +} diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index 78866f280..dda502bb8 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -264,6 +264,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { rseqSignature = t.rseqSignature } + adj, err := t.OOMScoreAdj() + if err != nil { + return 0, nil, err + } + cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -282,6 +287,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), + OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index a5035bb7f..2bbf48bb8 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,6 +93,9 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string + + // oomScoreAdj is the task's OOM score adjustment. + OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -143,6 +146,7 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, + oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index a69b0ce13..9800a0cdf 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -318,10 +318,14 @@ syscall_test( test = "//test/syscalls/linux:proc_test", ) -syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test") - syscall_test(test = "//test/syscalls/linux:proc_net_test") +syscall_test(test = "//test/syscalls/linux:proc_pid_oomscore_test") + +syscall_test(test = "//test/syscalls/linux:proc_pid_smaps_test") + +syscall_test(test = "//test/syscalls/linux:proc_pid_uid_gid_map_test") + syscall_test( size = "medium", test = "//test/syscalls/linux:pselect_test", diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 0fbd556de..43455f1a3 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1631,6 +1631,19 @@ cc_binary( ], ) +cc_binary( + name = "proc_pid_oomscore_test", + testonly = 1, + srcs = ["proc_pid_oomscore.cc"], + linkstatic = 1, + deps = [ + "//test/util:fs_util", + "//test/util:test_main", + "//test/util:test_util", + "@com_google_absl//absl/strings", + ], +) + cc_binary( name = "proc_pid_smaps_test", testonly = 1, diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index f91187e75..5a70f6c3b 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1431,6 +1431,12 @@ TEST(ProcPidFile, SubprocessRunning) { EXPECT_THAT(ReadWhileRunning("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("oom_score", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileRunning("oom_score_adj", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); } // Test whether /proc/PID/ files can be read for a zombie process. @@ -1466,6 +1472,12 @@ TEST(ProcPidFile, SubprocessZombie) { EXPECT_THAT(ReadWhileZombied("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + EXPECT_THAT(ReadWhileZombied("oom_score", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + + EXPECT_THAT(ReadWhileZombied("oom_score_adj", buf, sizeof(buf)), + SyscallSucceedsWithValue(sizeof(buf))); + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux // on proc files. // @@ -1527,6 +1539,15 @@ TEST(ProcPidFile, SubprocessExited) { EXPECT_THAT(ReadWhileExited("uid_map", buf, sizeof(buf)), SyscallSucceedsWithValue(sizeof(buf))); + + if (!IsRunningOnGvisor()) { + // FIXME(gvisor.dev/issue/164): Succeeds on gVisor. + EXPECT_THAT(ReadWhileExited("oom_score", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); + } + + EXPECT_THAT(ReadWhileExited("oom_score_adj", buf, sizeof(buf)), + SyscallFailsWithErrno(ESRCH)); } PosixError DirContainsImpl(absl::string_view path, diff --git a/test/syscalls/linux/proc_pid_oomscore.cc b/test/syscalls/linux/proc_pid_oomscore.cc new file mode 100644 index 000000000..707821a3f --- /dev/null +++ b/test/syscalls/linux/proc_pid_oomscore.cc @@ -0,0 +1,72 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "test/util/fs_util.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +PosixErrorOr ReadProcNumber(std::string path) { + ASSIGN_OR_RETURN_ERRNO(std::string contents, GetContents(path)); + EXPECT_EQ(contents[contents.length() - 1], '\n'); + + int num; + if (!absl::SimpleAtoi(contents, &num)) { + return PosixError(EINVAL, "invalid value: " + contents); + } + + return num; +} + +TEST(ProcPidOomscoreTest, BasicRead) { + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score")); + EXPECT_LE(oom_score, 1000); + EXPECT_GE(oom_score, -1000); +} + +TEST(ProcPidOomscoreAdjTest, BasicRead) { + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj")); + + // oom_score_adj defaults to 0. + EXPECT_EQ(oom_score, 0); +} + +TEST(ProcPidOomscoreAdjTest, BasicWrite) { + constexpr int test_value = 7; + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/oom_score_adj", O_WRONLY)); + ASSERT_THAT( + RetryEINTR(write)(fd.get(), std::to_string(test_value).c_str(), 1), + SyscallSucceeds()); + + auto const oom_score = + ASSERT_NO_ERRNO_AND_VALUE(ReadProcNumber("/proc/self/oom_score_adj")); + EXPECT_EQ(oom_score, test_value); +} + +} // namespace + +} // namespace testing +} // namespace gvisor -- cgit v1.2.3 From 960f6a975b7e44c0efe8fd38c66b02017c4fe137 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 6 Mar 2020 12:58:45 -0800 Subject: Add plumbing for importing fds in VFS2, along with non-socket, non-TTY impl. In VFS2, imported file descriptors are stored in a kernfs-based filesystem. Upon calling ImportFD, the host fd can be accessed in two ways: 1. a FileDescription that can be added to the FDTable, and 2. a Dentry in the host.filesystem mount, which we will want to access through magic symlinks in /proc/[pid]/fd/. An implementation of the kernfs.Inode interface stores a unique host fd. This inode can be inserted into file descriptions as well as dentries. This change also plumbs in three FileDescriptionImpls corresponding to fds for sockets, TTYs, and other files (only the latter is implemented here). These implementations will mostly make corresponding syscalls to the host. Where possible, the logic is ported over from pkg/sentry/fs/host. Updates #1672 PiperOrigin-RevId: 299417263 --- pkg/sentry/fs/host/util.go | 12 +- pkg/sentry/fsimpl/host/BUILD | 27 +++ pkg/sentry/fsimpl/host/default_file.go | 233 ++++++++++++++++++++ pkg/sentry/fsimpl/host/host.go | 286 +++++++++++++++++++++++++ pkg/sentry/fsimpl/host/util.go | 86 ++++++++ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 18 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 6 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 54 ++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 7 +- pkg/sentry/fsimpl/proc/subtasks.go | 13 +- pkg/sentry/fsimpl/proc/task.go | 23 +- pkg/sentry/fsimpl/proc/tasks.go | 19 +- pkg/sentry/vfs/BUILD | 4 + pkg/sentry/vfs/file_description_impl_util.go | 4 +- 15 files changed, 731 insertions(+), 63 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/BUILD create mode 100644 pkg/sentry/fsimpl/host/default_file.go create mode 100644 pkg/sentry/fsimpl/host/host.go create mode 100644 pkg/sentry/fsimpl/host/util.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index e37e687c6..7c60dc1db 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -24,7 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -152,9 +152,9 @@ func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { Usage: s.Blocks * 512, Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), Owner: owner(mo, s), - AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), - ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), - StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + AccessTime: time.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), Links: uint64(s.Nlink), } } @@ -165,6 +165,8 @@ type dirInfo struct { bufp int // location of next record in buf. } +// LINT.IfChange + // isBlockError unwraps os errors and checks if they are caused by EAGAIN or // EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. func isBlockError(err error) bool { @@ -177,6 +179,8 @@ func isBlockError(err error) bool { return false } +// LINT.ThenChange(../../fsimpl/host/util.go) + func hostEffectiveKIDs() (uint32, []uint32, error) { gids, err := os.Getgroups() if err != nil { diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD new file mode 100644 index 000000000..731f192b3 --- /dev/null +++ b/pkg/sentry/fsimpl/host/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "host", + srcs = [ + "default_file.go", + "host.go", + "util.go", + ], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/log", + "//pkg/refs", + "//pkg/safemem", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go new file mode 100644 index 000000000..172cdb161 --- /dev/null +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -0,0 +1,233 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "math" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files. +type defaultFileFD struct { + fileDescription + + // canMap specifies whether we allow the file to be memory mapped. + canMap bool + + // mu protects the fields below. + mu sync.Mutex + + // offset specifies the current file offset. + offset int64 +} + +// TODO(gvisor.dev/issue/1672): Implement Waitable interface. + +// PRead implements FileDescriptionImpl. +func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags)) +} + +// Read implements FileDescriptionImpl. +func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + // These files can't be memory mapped, assert this. + if f.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + f.mu.Lock() + n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) + f.mu.Unlock() + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + f.mu.Lock() + n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags)) + f.offset += n + f.mu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { + if flags&^(linux.RWF_VALID) != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv2(fd, srcs, offset, flags) + return int64(n), err + }, + } + n, err := dst.CopyOutFrom(ctx, reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) +} + +// Write implements FileDescriptionImpl. +func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if f.inode.isStream { + // These files can't be memory mapped, assert this. + if f.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") + } + + f.mu.Lock() + n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) + f.mu.Unlock() + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + f.mu.Lock() + n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags)) + f.offset += n + f.mu.Unlock() + return n, err +} + +func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { + if flags&^(linux.RWF_VALID) != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev2(fd, srcs, offset, flags) + return int64(n), err + }, + } + n, err := src.CopyInTo(ctx, writer) + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. + if f.inode.isStream { + return 0, syserror.ESPIPE + } + + f.mu.Lock() + defer f.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW + } + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(f.inode.hostFD, &s); err != nil { + return f.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return f.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return f.offset, syserror.EINVAL + } + f.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(f.inode.hostFD, offset, int(whence)) + if err != nil { + return f.offset, err + } + f.offset = n + + default: + // Invalid whence. + return f.offset, syserror.EINVAL + } + + return f.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *defaultFileFD) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.canMap { + return syserror.ENODEV + } + // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. + return syserror.ENODEV +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go new file mode 100644 index 000000000..c205e6a0b --- /dev/null +++ b/pkg/sentry/fsimpl/host/host.go @@ -0,0 +1,286 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host provides a filesystem implementation for host files imported as +// file descriptors. +package host + +import ( + "errors" + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { + // Must be importing to a mount of host.filesystem. + fs, ok := mnt.Filesystem().Impl().(*filesystem) + if !ok { + return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) + } + + // Retrieve metadata. + var s syscall.Stat_t + if err := syscall.Fstat(hostFD, &s); err != nil { + return nil, err + } + + fileMode := linux.FileMode(s.Mode) + fileType := fileMode.FileType() + // Pipes, character devices, and sockets can return EWOULDBLOCK for + // operations that would block. + isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK + + i := &inode{ + hostFD: hostFD, + isStream: isStream, + isTTY: isTTY, + ino: fs.NextIno(), + mode: fileMode, + uid: ownerUID, + gid: ownerGID, + } + + d := &kernfs.Dentry{} + d.Init(i) + // i.open will take a reference on d. + defer d.DecRef() + + return i.open(d.VFSDentry(), mnt) +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // When the reference count reaches zero, the host fd is closed. + refs.AtomicRefCount + + // hostFD contains the host fd that this file was originally created from, + // which must be available at time of restore. + // + // This field is initialized at creation time and is immutable. + hostFD int + + // isStream is true if the host fd points to a file representing a stream, + // e.g. a socket or a pipe. Such files are not seekable and can return + // EWOULDBLOCK for I/O operations. + // + // This field is initialized at creation time and is immutable. + isStream bool + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool + + // ino is an inode number unique within this filesystem. + ino uint64 + + // mu protects the inode metadata below. + mu sync.Mutex + + // mode is the file mode of this inode. Note that this value may become out + // of date if the mode is changed on the host, e.g. with chmod. + mode linux.FileMode + + // uid and gid of the file owner. Note that these refer to the owner of the + // file created on import, not the fd on the host. + uid auth.KUID + gid auth.KGID +} + +// Note that these flags may become out of date, since they can be modified +// on the host, e.g. with fcntl. +func fileFlagsFromHostFD(fd int) (int, error) { + flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0) + if err != nil { + log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err) + return 0, err + } + // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. + flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND + return flags, nil +} + +// CheckPermissions implements kernfs.Inode. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid) +} + +// Mode implements kernfs.Inode. +func (i *inode) Mode() linux.FileMode { + return i.mode +} + +// Stat implements kernfs.Inode. +func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Statx_t + if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + return linux.Statx{}, err + } + ls := unixToLinuxStatx(s) + + // Use our own internal inode number and file owner. + // + // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. + // If we use the device number from the host, it may collide with another + // sentry-internal device number. We handle device/inode numbers without + // relying on the host to prevent collisions. + ls.Ino = i.ino + ls.UID = uint32(i.uid) + ls.GID = uint32(i.gid) + + // Update file mode from the host. + i.mode = linux.FileMode(ls.Mode) + + return ls, nil +} + +// SetStat implements kernfs.Inode. +func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { + s := opts.Stat + + m := s.Mask + if m == 0 { + return nil + } + if m&(linux.STATX_UID|linux.STATX_GID) != 0 { + return syserror.EPERM + } + if m&linux.STATX_MODE != 0 { + if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { + return err + } + i.mode = linux.FileMode(s.Mode) + } + if m&linux.STATX_SIZE != 0 { + if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { + return err + } + } + if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { + timestamps := []unix.Timespec{ + toTimespec(s.Atime, m&linux.STATX_ATIME == 0), + toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), + } + if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil { + return err + } + } + return nil +} + +// DecRef implements kernfs.Inode. +func (i *inode) DecRef() { + i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +} + +// Destroy implements kernfs.Inode. +func (i *inode) Destroy() { + if err := unix.Close(i.hostFD); err != nil { + log.Warningf("failed to close host fd %d: %v", i.hostFD, err) + } +} + +// Open implements kernfs.Inode. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.open(vfsd, rp.Mount()) +} + +func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { + + fileType := i.mode.FileType() + if fileType == syscall.S_IFSOCK { + if i.isTTY { + return nil, errors.New("cannot use host socket as TTY") + } + // TODO(gvisor.dev/issue/1672): support importing sockets. + return nil, errors.New("importing host sockets not supported") + } + + if i.isTTY { + // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. + return nil, errors.New("importing host fd as TTY not supported") + } + + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &defaultFileFD{ + fileDescription: fileDescription{ + inode: i, + }, + canMap: canMap(uint32(fileType)), + mu: sync.Mutex{}, + offset: 0, + } + + vfsfd := &fd.vfsfd + flags, err := fileFlagsFromHostFD(i.hostFD) + if err != nil { + return nil, err + } + + if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil +} + +// fileDescription is embedded by host fd implementations of FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but + // cached to reduce indirections and casting. fileDescription does not hold + // a reference on the inode through the inode field (since one is already + // held via the Dentry). + // + // inode is immutable after fileDescription creation. + inode *inode +} + +// SetStat implements vfs.FileDescriptionImpl. +func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error { + return f.inode.SetStat(nil, opts) +} + +// Stat implements vfs.FileDescriptionImpl. +func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return f.inode.Stat(nil, opts) +} + +// Release implements vfs.FileDescriptionImpl. +func (f *fileDescription) Release() { + // noop +} diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go new file mode 100644 index 000000000..e1ccacb4d --- /dev/null +++ b/pkg/sentry/fsimpl/host/util.go @@ -0,0 +1,86 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { + if omit { + return unix.Timespec{ + Sec: 0, + Nsec: unix.UTIME_OMIT, + } + } + return unix.Timespec{ + Sec: int64(ts.Sec), + Nsec: int64(ts.Nsec), + } +} + +func unixToLinuxStatx(s unix.Statx_t) linux.Statx { + return linux.Statx{ + Mask: s.Mask, + Blksize: s.Blksize, + Attributes: s.Attributes, + Nlink: s.Nlink, + UID: s.Uid, + GID: s.Gid, + Mode: s.Mode, + Ino: s.Ino, + Size: s.Size, + Blocks: s.Blocks, + AttributesMask: s.Attributes_mask, + Atime: unixToLinuxStatxTimestamp(s.Atime), + Btime: unixToLinuxStatxTimestamp(s.Btime), + Ctime: unixToLinuxStatxTimestamp(s.Ctime), + Mtime: unixToLinuxStatxTimestamp(s.Mtime), + RdevMajor: s.Rdev_major, + RdevMinor: s.Rdev_minor, + DevMajor: s.Dev_major, + DevMinor: s.Dev_minor, + } +} + +func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} +} + +// wouldBlock returns true for file types that can return EWOULDBLOCK +// for blocking operations, e.g. pipes, character devices, and sockets. +func wouldBlock(fileType uint32) bool { + return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK +} + +// canMap returns true if a file with fileType is allowed to be memory mapped. +// This is ported over from VFS1, but it's probably not the best way for us +// to check if a file can be memory mapped. +func canMap(fileType uint32) bool { + // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()). + // + // TODO(b/38213152): Some obscure character devices can be mapped. + return fileType == syscall.S_IFREG +} + +// isBlockError checks if an error is EAGAIN or EWOULDBLOCK. +// If so, they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK +} diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index d092ccb2a..1c026f4d8 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) Release() {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(fs), nil + return fd.inode.Stat(fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5650512e0..da821d524 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -107,9 +107,13 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent fs.mu.Lock() defer fs.mu.Unlock() + opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat := fd.inode().Stat(vfsFS) + stat, err := fd.inode().Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: ".", Type: linux.DT_DIR, @@ -125,7 +129,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode - stat := parentInode.Stat(vfsFS) + stat, err := parentInode.Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: "..", Type: linux.FileMode(stat.Mode).DirentType(), @@ -146,7 +153,10 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { inode := it.Dentry.Impl().(*Dentry).inode - stat := inode.Stat(vfsFS) + stat, err := inode.Stat(vfsFS, opts) + if err != nil { + return err + } dirent := vfs.Dirent{ Name: it.Name, Type: linux.FileMode(stat.Mode).DirentType(), @@ -190,7 +200,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() - return inode.Stat(fs), nil + return inode.Stat(fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 292f58afd..1d7e04ad4 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -// This file implements vfs.FilesystemImpl for kernfs. - package kernfs +// This file implements vfs.FilesystemImpl for kernfs. + import ( "fmt" @@ -634,7 +634,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } - return inode.Stat(fs.VFSFilesystem()), nil + return inode.Stat(fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 099d70a16..d50018b18 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -36,20 +36,20 @@ type InodeNoopRefCount struct { } // IncRef implements Inode.IncRef. -func (n *InodeNoopRefCount) IncRef() { +func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (n *InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef() { } // TryIncRef implements Inode.TryIncRef. -func (n *InodeNoopRefCount) TryIncRef() bool { +func (InodeNoopRefCount) TryIncRef() bool { return true } // Destroy implements Inode.Destroy. -func (n *InodeNoopRefCount) Destroy() { +func (InodeNoopRefCount) Destroy() { } // InodeDirectoryNoNewChildren partially implements the Inode interface. @@ -58,27 +58,27 @@ func (n *InodeNoopRefCount) Destroy() { type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. -func (*InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewDir implements Inode.NewDir. -func (*InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewLink implements Inode.NewLink. -func (*InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewSymlink implements Inode.NewSymlink. -func (*InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { return nil, syserror.EPERM } // NewNode implements Inode.NewNode. -func (*InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { return nil, syserror.EPERM } @@ -90,62 +90,62 @@ type InodeNotDirectory struct { } // HasChildren implements Inode.HasChildren. -func (*InodeNotDirectory) HasChildren() bool { +func (InodeNotDirectory) HasChildren() bool { return false } // NewFile implements Inode.NewFile. -func (*InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. -func (*InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLinkink. -func (*InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink. -func (*InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. -func (*InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { +func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. -func (*InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. -func (*InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { +func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. -func (*InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { +func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. -func (*InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { panic("Lookup called on non-directory inode") } // IterDirents implements Inode.IterDirents. -func (*InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { +func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { panic("IterDirents called on non-directory inode") } // Valid implements Inode.Valid. -func (*InodeNotDirectory) Valid(context.Context) bool { +func (InodeNotDirectory) Valid(context.Context) bool { return true } @@ -157,17 +157,17 @@ func (*InodeNotDirectory) Valid(context.Context) bool { type InodeNoDynamicLookup struct{} // Lookup implements Inode.Lookup. -func (*InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { +func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { return nil, syserror.ENOENT } // IterDirents implements Inode.IterDirents. -func (*InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { +func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return offset, nil } // Valid implements Inode.Valid. -func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { +func (InodeNoDynamicLookup) Valid(ctx context.Context) bool { return true } @@ -177,7 +177,7 @@ func (*InodeNoDynamicLookup) Valid(ctx context.Context) bool { type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. -func (*InodeNotSymlink) Readlink(context.Context) (string, error) { +func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } @@ -219,7 +219,7 @@ func (a *InodeAttrs) Mode() linux.FileMode { // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. -func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { +func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK stat.Ino = atomic.LoadUint64(&a.ino) @@ -230,7 +230,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem) linux.Statx { // TODO: Implement other stat fields like timestamps. - return stat + return stat, nil } // SetStat implements Inode.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index c74fa999b..a8ab2a2ba 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -176,8 +176,6 @@ type Dentry struct { vfsd vfs.Dentry inode Inode - refs uint64 - // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 @@ -302,7 +300,8 @@ type Inode interface { // this inode. The returned file description should hold a reference on the // inode for its lifetime. // - // Precondition: !rp.Done(). vfsd.Impl() must be a kernfs Dentry. + // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing + // the inode on which Open() is being called. Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } @@ -328,7 +327,7 @@ type inodeMetadata interface { // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. - Stat(fs *vfs.Filesystem) linux.Statx + Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index f3f4e49b4..611645f3f 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -121,8 +121,13 @@ func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.O } // Stat implements kernfs.Inode. -func (i *subtasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { - stat := i.InodeAttrs.Stat(vsfs) - stat.Nlink += uint32(i.task.ThreadGroup().Count()) - return stat +func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&linux.STATX_NLINK != 0 { + stat.Nlink += uint32(i.task.ThreadGroup().Count()) + } + return stat, nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 18e5cd6f6..c0d643f51 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -154,12 +154,21 @@ func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, childre } // Stat implements kernfs.Inode. -func (i *taskOwnedInode) Stat(fs *vfs.Filesystem) linux.Statx { - stat := i.Inode.Stat(fs) - uid, gid := i.getOwner(linux.FileMode(stat.Mode)) - stat.UID = uint32(uid) - stat.GID = uint32(gid) - return stat +func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.Inode.Stat(fs, opts) + if err != nil { + return linux.Statx{}, err + } + if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 { + uid, gid := i.getOwner(linux.FileMode(stat.Mode)) + if opts.Mask&linux.STATX_UID != 0 { + stat.UID = uint32(uid) + } + if opts.Mask&linux.STATX_GID != 0 { + stat.GID = uint32(gid) + } + } + return stat, nil } // CheckPermissions implements kernfs.Inode. @@ -236,7 +245,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr // member, there is one entry containing three colon-separated fields: // hierarchy-ID:controller-list:cgroup-path" func newCgroupData(controllers map[string]string) dynamicInode { - buf := bytes.Buffer{} + var buf bytes.Buffer // The hierarchy ids must be positive integers (for cgroup v1), but the // exact number does not matter, so long as they are unique. We can diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 10c08fa90..b1e39c82f 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -211,17 +211,22 @@ func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.Open return fd.VFSFileDescription(), nil } -func (i *tasksInode) Stat(vsfs *vfs.Filesystem) linux.Statx { - stat := i.InodeAttrs.Stat(vsfs) +func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(vsfs, opts) + if err != nil { + return linux.Statx{}, err + } - // Add dynamic children to link count. - for _, tg := range i.pidns.ThreadGroups() { - if leader := tg.Leader(); leader != nil { - stat.Nlink++ + if opts.Mask&linux.STATX_NLINK != 0 { + // Add dynamic children to link count. + for _, tg := range i.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + stat.Nlink++ + } } } - return stat + return stat, nil } func cpuInfoData(k *kernel.Kernel) string { diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 07c8383e6..cb4deb068 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -42,10 +42,13 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fd", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", + "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", @@ -53,6 +56,7 @@ go_library( "//pkg/syserror", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index c2a52ec1b..45191d1c3 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -33,8 +33,8 @@ import ( // implementations to adapt: // - Have a local fileDescription struct (containing FileDescription) which // embeds FileDescriptionDefaultImpl and overrides the default methods -// which are common to all fd implementations for that for that filesystem -// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. +// which are common to all fd implementations for that filesystem like +// StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc. // - This should be embedded in all file description implementations as the // first field by value. // - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. -- cgit v1.2.3 From b36de6e7be0542b410901d3cbcd1b3c0fc493cf5 Mon Sep 17 00:00:00 2001 From: Ting-Yu Wang Date: Mon, 9 Mar 2020 19:57:35 -0700 Subject: Move /proc/net to /proc/PID/net, and make /proc/net -> /proc/self/net. Issue #1833 PiperOrigin-RevId: 299998105 --- pkg/sentry/fs/proc/net.go | 53 +-- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/task.go | 1 + pkg/sentry/fsimpl/proc/BUILD | 2 +- pkg/sentry/fsimpl/proc/task.go | 1 + pkg/sentry/fsimpl/proc/task_net.go | 790 +++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/proc/tasks_net.go | 787 ---------------------------------- pkg/sentry/fsimpl/proc/tasks_test.go | 3 +- test/syscalls/linux/proc_net.cc | 25 ++ 10 files changed, 849 insertions(+), 817 deletions(-) create mode 100644 pkg/sentry/fsimpl/proc/task_net.go delete mode 100644 pkg/sentry/fsimpl/proc/tasks_net.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index 95d5817ff..bd18177d4 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -40,47 +40,48 @@ import ( // LINT.IfChange -// newNet creates a new proc net entry. -func (p *proc) newNetDir(ctx context.Context, k *kernel.Kernel, msrc *fs.MountSource) *fs.Inode { +// newNetDir creates a new proc net entry. +func newNetDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + k := t.Kernel() + var contents map[string]*fs.Inode - // TODO(gvisor.dev/issue/1833): Support for using the network stack in the - // network namespace of the calling process. We should make this per-process, - // a.k.a. /proc/PID/net, and make /proc/net a symlink to /proc/self/net. - if s := p.k.RootNetworkNamespace().Stack(); s != nil { + if s := t.NetworkNamespace().Stack(); s != nil { + // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task + // network namespace. contents = map[string]*fs.Inode{ - "dev": seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc), - "snmp": seqfile.NewSeqFileInode(ctx, &netSnmp{s: s}, msrc), + "dev": seqfile.NewSeqFileInode(t, &netDev{s: s}, msrc), + "snmp": seqfile.NewSeqFileInode(t, &netSnmp{s: s}, msrc), // The following files are simple stubs until they are // implemented in netstack, if the file contains a // header the stub is just the header otherwise it is // an empty file. - "arp": newStaticProcInode(ctx, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), + "arp": newStaticProcInode(t, msrc, []byte("IP address HW type Flags HW address Mask Device\n")), - "netlink": newStaticProcInode(ctx, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), - "netstat": newStaticProcInode(ctx, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), - "packet": newStaticProcInode(ctx, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), - "protocols": newStaticProcInode(ctx, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), + "netlink": newStaticProcInode(t, msrc, []byte("sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n")), + "netstat": newStaticProcInode(t, msrc, []byte("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")), + "packet": newStaticProcInode(t, msrc, []byte("sk RefCnt Type Proto Iface R Rmem User Inode\n")), + "protocols": newStaticProcInode(t, msrc, []byte("protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n")), // Linux sets psched values to: nsec per usec, psched // tick in ns, 1000000, high res timer ticks per sec // (ClockGetres returns 1ns resolution). - "psched": newStaticProcInode(ctx, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), - "ptype": newStaticProcInode(ctx, msrc, []byte("Type Device Function\n")), - "route": seqfile.NewSeqFileInode(ctx, &netRoute{s: s}, msrc), - "tcp": seqfile.NewSeqFileInode(ctx, &netTCP{k: k}, msrc), - "udp": seqfile.NewSeqFileInode(ctx, &netUDP{k: k}, msrc), - "unix": seqfile.NewSeqFileInode(ctx, &netUnix{k: k}, msrc), + "psched": newStaticProcInode(t, msrc, []byte(fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)))), + "ptype": newStaticProcInode(t, msrc, []byte("Type Device Function\n")), + "route": seqfile.NewSeqFileInode(t, &netRoute{s: s}, msrc), + "tcp": seqfile.NewSeqFileInode(t, &netTCP{k: k}, msrc), + "udp": seqfile.NewSeqFileInode(t, &netUDP{k: k}, msrc), + "unix": seqfile.NewSeqFileInode(t, &netUnix{k: k}, msrc), } if s.SupportsIPv6() { - contents["if_inet6"] = seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc) - contents["ipv6_route"] = newStaticProcInode(ctx, msrc, []byte("")) - contents["tcp6"] = seqfile.NewSeqFileInode(ctx, &netTCP6{k: k}, msrc) - contents["udp6"] = newStaticProcInode(ctx, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) + contents["if_inet6"] = seqfile.NewSeqFileInode(t, &ifinet6{s: s}, msrc) + contents["ipv6_route"] = newStaticProcInode(t, msrc, []byte("")) + contents["tcp6"] = seqfile.NewSeqFileInode(t, &netTCP6{k: k}, msrc) + contents["udp6"] = newStaticProcInode(t, msrc, []byte(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n")) } } - d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) - return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) + d := ramfs.NewDir(t, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newProcInode(t, d, msrc, fs.SpecialDirectory, t) } // ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. @@ -837,4 +838,4 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se return data, 0 } -// LINT.ThenChange(../../fsimpl/proc/tasks_net.go) +// LINT.ThenChange(../../fsimpl/proc/task_net.go) diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c8abb5052..c659224a7 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -70,6 +70,7 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), "mounts": newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/mounts"), msrc, fs.Symlink, nil), + "net": newProcInode(ctx, ramfs.NewSymlink(ctx, fs.RootOwner, "self/net"), msrc, fs.Symlink, nil), "self": newSelf(ctx, pidns, msrc), "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), "thread-self": newThreadSelf(ctx, pidns, msrc), @@ -86,7 +87,6 @@ func New(ctx context.Context, msrc *fs.MountSource, cgroupControllers map[string } // Add more contents that need proc to be initialized. - p.AddChild(ctx, "net", p.newNetDir(ctx, k, msrc)) p.AddChild(ctx, "sys", p.newSysDir(ctx, msrc)) return newProcInode(ctx, p, msrc, fs.SpecialDirectory, nil), nil diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4e9b0fc00..03cc788c8 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -84,6 +84,7 @@ func (p *proc) newTaskDir(t *kernel.Task, msrc *fs.MountSource, isThreadGroup bo "maps": newMaps(t, msrc), "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "net": newNetDir(t, msrc), "ns": newNamespaceDir(t, msrc), "oom_score": newOOMScore(t, msrc), "oom_score_adj": newOOMScoreAdj(t, msrc), diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index a83245866..bb609a305 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -9,9 +9,9 @@ go_library( "subtasks.go", "task.go", "task_files.go", + "task_net.go", "tasks.go", "tasks_files.go", - "tasks_net.go", "tasks_sys.go", ], visibility = ["//pkg/sentry:internal"], diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index c0d643f51..493acbd1b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -57,6 +57,7 @@ func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNames "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "net": newTaskNetDir(task, inoGen), "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go new file mode 100644 index 000000000..373a7b17d --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -0,0 +1,790 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "io" + "reflect" + "time" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/usermem" +) + +func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + k := task.Kernel() + pidns := task.PIDNamespace() + root := auth.NewRootCredentials(pidns.UserNamespace()) + + var contents map[string]*kernfs.Dentry + if stack := task.NetworkNamespace().Stack(); stack != nil { + const ( + arp = "IP address HW type Flags HW address Mask Device\n" + netlink = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n" + packet = "sk RefCnt Type Proto Iface R Rmem User Inode\n" + protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n" + ptype = "Type Device Function\n" + upd6 = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n" + ) + psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond)) + + // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task + // network namespace. + contents = map[string]*kernfs.Dentry{ + "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + + // The following files are simple stubs until they are implemented in + // netstack, if the file contains a header the stub is just the header + // otherwise it is an empty file. + "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), + "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), + "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), + "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), + "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + + // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, + // high res timer ticks per sec (ClockGetres returns 1ns resolution). + "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), + "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), + "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + } + + if stack.SupportsIPv6() { + contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + } + } + + return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents) +} + +// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. +// +// +stateify savable +type ifinet6 struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*ifinet6)(nil) + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.stack.Interfaces() + for id, naddrs := range n.stack.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { + for _, l := range n.contents() { + buf.WriteString(l) + } + return nil +} + +// netDevData implements vfs.DynamicBytesSource for /proc/net/dev. +// +// +stateify savable +type netDevData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netDevData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { + interfaces := n.stack.Interfaces() + buf.WriteString("Inter-| Receive | Transmit\n") + buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") + + for _, i := range interfaces { + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. + var stats inet.StatDev + if err := n.stack.Statistics(&stats, i.Name); err != nil { + log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) + continue + } + fmt.Fprintf( + buf, + "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + stats[0], // bytes + stats[1], // packets + stats[2], // errors + stats[3], // dropped + stats[4], // fifo + stats[5], // frame + stats[6], // compressed + stats[7], // multicast + // Transmitted + stats[8], // bytes + stats[9], // packets + stats[10], // errors + stats[11], // dropped + stats[12], // fifo + stats[13], // frame + stats[14], // compressed + stats[15], // multicast + ) + } + + return nil +} + +// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. +// +// +stateify savable +type netUnixData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netUnixData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") + for _, se := range n.kernel.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + continue + } + sfile := s.(*fs.File) + if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + s.DecRef() + // Not a unix socket. + continue + } + sops := sfile.FileOperations.(*unix.SocketOperations) + + addr, err := sops.Endpoint().GetLocalAddress() + if err != nil { + log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + addr.Addr = "" + } + + sockFlags := 0 + if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { + if ce.Listening() { + // For unix domain sockets, linux reports a single flag + // value if the socket is listening, of __SO_ACCEPTCON. + sockFlags = linux.SO_ACCEPTCON + } + } + + // In the socket entry below, the value for the 'Num' field requires + // some consideration. Linux prints the address to the struct + // unix_sock representing a socket in the kernel, but may redact the + // value for unprivileged users depending on the kptr_restrict + // sysctl. + // + // One use for this field is to allow a privileged user to + // introspect into the kernel memory to determine information about + // a socket not available through procfs, such as the socket's peer. + // + // In gvisor, returning a pointer to our internal structures would + // be pointless, as it wouldn't match the memory layout for struct + // unix_sock, making introspection difficult. We could populate a + // struct unix_sock with the appropriate data, but even that + // requires consideration for which kernel version to emulate, as + // the definition of this struct changes over time. + // + // For now, we always redact this pointer. + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. + sfile.ReadRefs()-1, // RefCount, don't count our own ref. + 0, // Protocol, always 0 for UDS. + sockFlags, // Flags. + sops.Endpoint().Type(), // Type. + sops.State(), // State. + sfile.InodeID(), // Inode. + ) + + // Path + if len(addr.Addr) != 0 { + if addr.Addr[0] == 0 { + // Abstract path. + fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) + } else { + fmt.Fprintf(buf, " %s", string(addr.Addr)) + } + } + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +func networkToHost16(n uint16) uint16 { + // n is in network byte order, so is big-endian. The most-significant byte + // should be stored in the lower address. + // + // We manually inline binary.BigEndian.Uint16() because Go does not support + // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to + // binary.BigEndian.Uint16() require a read of binary.BigEndian and an + // interface method call, defeating inlining. + buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} + return usermem.ByteOrder.Uint16(buf[:]) +} + +func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { + switch family { + case linux.AF_INET: + var a linux.SockAddrInet + if i != nil { + a = *i.(*linux.SockAddrInet) + } + + // linux.SockAddrInet.Port is stored in the network byte order and is + // printed like a number in host byte order. Note that all numbers in host + // byte order are printed with the most-significant byte first when + // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. + port := networkToHost16(a.Port) + + // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order + // (i.e. most-significant byte in index 0). Linux represents this as a + // __be32 which is a typedef for an unsigned int, and is printed with + // %X. This means that for a little-endian machine, Linux prints the + // least-significant byte of the address first. To emulate this, we first + // invert the byte order for the address using usermem.ByteOrder.Uint32, + // which makes it have the equivalent encoding to a __be32 on a little + // endian machine. Note that this operation is a no-op on a big endian + // machine. Then similar to Linux, we format it with %X, which will print + // the most-significant byte of the __be32 address first, which is now + // actually the least-significant byte of the original address in + // linux.SockAddrInet.Addr on little endian machines, due to the conversion. + addr := usermem.ByteOrder.Uint32(a.Addr[:]) + + fmt.Fprintf(w, "%08X:%04X ", addr, port) + case linux.AF_INET6: + var a linux.SockAddrInet6 + if i != nil { + a = *i.(*linux.SockAddrInet6) + } + + port := networkToHost16(a.Port) + addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4]) + addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8]) + addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12]) + addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16]) + fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) + } +} + +func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. + t := kernel.TaskFromContext(ctx) + + for _, se := range k.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { + s.DecRef() + // Not tcp4 sockets. + continue + } + + // Linux's documentation for the fields below can be found at + // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. + // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). + // Note that the header doesn't contain labels for all the fields. + + // Field: sl; entry number. + fmt.Fprintf(buf, "%4d: ", se.ID) + + // Field: local_adddress. + var localAddr linux.SockAddr + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = local + } + } + writeInetAddr(buf, family, localAddr) + + // Field: rem_address. + var remoteAddr linux.SockAddr + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = remote + } + } + writeInetAddr(buf, family, remoteAddr) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when; timer active state and number of jiffies + // until timer expires. Unimplemented. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt; number of unrecovered RTO timeouts. + // Unimplemented. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + } + + // Field: timeout; number of unanswered 0-window probes. + // Unimplemented. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: refcount. Don't count the ref we obtain while deferencing + // the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: retransmit timeout. Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: predicted tick of soft clock (delayed ACK control data). + // Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: sending congestion window, Unimplemented. + fmt.Fprintf(buf, "%d ", 0) + + // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. + // Unimplemented, report as large threshold. + fmt.Fprintf(buf, "%d", -1) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + + return nil +} + +// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. +// +// +stateify savable +type netTCPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCPData)(nil) + +func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) +} + +// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. +// +// +stateify savable +type netTCP6Data struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netTCP6Data)(nil) + +func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { + buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") + return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) +} + +// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. +// +// +stateify savable +type netUDPData struct { + kernfs.DynamicBytesFile + + kernel *kernel.Kernel +} + +var _ dynamicInode = (*netUDPData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // t may be nil here if our caller is not part of a task goroutine. This can + // happen for example if we're here for "sentryctl cat". When t is nil, + // degrade gracefully and retrieve what we can. + t := kernel.TaskFromContext(ctx) + + for _, se := range d.kernel.ListSockets() { + s := se.Sock.Get() + if s == nil { + log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + continue + } + sfile := s.(*fs.File) + sops, ok := sfile.FileOperations.(socket.Socket) + if !ok { + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + } + if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { + s.DecRef() + // Not udp4 socket. + continue + } + + // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). + + // Field: sl; entry number. + fmt.Fprintf(buf, "%5d: ", se.ID) + + // Field: local_adddress. + var localAddr linux.SockAddrInet + if t != nil { + if local, _, err := sops.GetSockName(t); err == nil { + localAddr = *local.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &localAddr) + + // Field: rem_address. + var remoteAddr linux.SockAddrInet + if t != nil { + if remote, _, err := sops.GetPeerName(t); err == nil { + remoteAddr = *remote.(*linux.SockAddrInet) + } + } + writeInetAddr(buf, linux.AF_INET, &remoteAddr) + + // Field: state; socket state. + fmt.Fprintf(buf, "%02X ", sops.State()) + + // Field: tx_queue, rx_queue; number of packets in the transmit and + // receive queue. Unimplemented. + fmt.Fprintf(buf, "%08X:%08X ", 0, 0) + + // Field: tr, tm->when. Always 0 for UDP. + fmt.Fprintf(buf, "%02X:%08X ", 0, 0) + + // Field: retrnsmt. Always 0 for UDP. + fmt.Fprintf(buf, "%08X ", 0) + + // Field: uid. + uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + fmt.Fprintf(buf, "%5d ", 0) + } else { + creds := auth.CredentialsFromContext(ctx) + fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + } + + // Field: timeout. Always 0 for UDP. + fmt.Fprintf(buf, "%8d ", 0) + + // Field: inode. + fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + + // Field: ref; reference count on the socket inode. Don't count the ref + // we obtain while deferencing the weakref to this socket. + fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + + // Field: Socket struct address. Redacted due to the same reason as + // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. + fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) + + // Field: drops; number of dropped packets. Unimplemented. + fmt.Fprintf(buf, "%d", 0) + + fmt.Fprintf(buf, "\n") + + s.DecRef() + } + return nil +} + +// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. +// +// +stateify savable +type netSnmpData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netSnmpData)(nil) + +type snmpLine struct { + prefix string + header string +} + +var snmp = []snmpLine{ + { + prefix: "Ip", + header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", + }, + { + prefix: "Icmp", + header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", + }, + { + prefix: "IcmpMsg", + }, + { + prefix: "Tcp", + header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", + }, + { + prefix: "Udp", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, + { + prefix: "UdpLite", + header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", + }, +} + +func toSlice(a interface{}) []uint64 { + v := reflect.Indirect(reflect.ValueOf(a)) + return v.Slice(0, v.Len()).Interface().([]uint64) +} + +func sprintSlice(s []uint64) string { + if len(s) == 0 { + return "" + } + r := fmt.Sprint(s) + return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. +} + +// Generate implements vfs.DynamicBytesSource. +func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { + types := []interface{}{ + &inet.StatSNMPIP{}, + &inet.StatSNMPICMP{}, + nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. + &inet.StatSNMPTCP{}, + &inet.StatSNMPUDP{}, + &inet.StatSNMPUDPLite{}, + } + for i, stat := range types { + line := snmp[i] + if stat == nil { + fmt.Fprintf(buf, "%s:\n", line.prefix) + fmt.Fprintf(buf, "%s:\n", line.prefix) + continue + } + if err := d.stack.Statistics(stat, line.prefix); err != nil { + if err == syserror.EOPNOTSUPP { + log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } else { + log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) + } + } + + fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) + + if line.prefix == "Tcp" { + tcp := stat.(*inet.StatSNMPTCP) + // "Tcp" needs special processing because MaxConn is signed. RFC 2012. + fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + } else { + fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + } + } + return nil +} + +// netRouteData implements vfs.DynamicBytesSource for /proc/net/route. +// +// +stateify savable +type netRouteData struct { + kernfs.DynamicBytesFile + + stack inet.Stack +} + +var _ dynamicInode = (*netRouteData)(nil) + +// Generate implements vfs.DynamicBytesSource. +// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. +func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { + fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") + + interfaces := d.stack.Interfaces() + for _, rt := range d.stack.RouteTable() { + // /proc/net/route only includes ipv4 routes. + if rt.Family != linux.AF_INET { + continue + } + + // /proc/net/route does not include broadcast or multicast routes. + if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { + continue + } + + iface, ok := interfaces[rt.OutputInterface] + if !ok || iface.Name == "lo" { + continue + } + + var ( + gw uint32 + prefix uint32 + flags = linux.RTF_UP + ) + if len(rt.GatewayAddr) == header.IPv4AddressSize { + flags |= linux.RTF_GATEWAY + gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) + } + if len(rt.DstAddr) == header.IPv4AddressSize { + prefix = usermem.ByteOrder.Uint32(rt.DstAddr) + } + l := fmt.Sprintf( + "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", + iface.Name, + prefix, + gw, + flags, + 0, // RefCnt. + 0, // Use. + 0, // Metric. + (uint32(1)<> 8 & 0xff), byte(n & 0xff)} - return usermem.ByteOrder.Uint16(buf[:]) -} - -func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { - switch family { - case linux.AF_INET: - var a linux.SockAddrInet - if i != nil { - a = *i.(*linux.SockAddrInet) - } - - // linux.SockAddrInet.Port is stored in the network byte order and is - // printed like a number in host byte order. Note that all numbers in host - // byte order are printed with the most-significant byte first when - // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux. - port := networkToHost16(a.Port) - - // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order - // (i.e. most-significant byte in index 0). Linux represents this as a - // __be32 which is a typedef for an unsigned int, and is printed with - // %X. This means that for a little-endian machine, Linux prints the - // least-significant byte of the address first. To emulate this, we first - // invert the byte order for the address using usermem.ByteOrder.Uint32, - // which makes it have the equivalent encoding to a __be32 on a little - // endian machine. Note that this operation is a no-op on a big endian - // machine. Then similar to Linux, we format it with %X, which will print - // the most-significant byte of the __be32 address first, which is now - // actually the least-significant byte of the original address in - // linux.SockAddrInet.Addr on little endian machines, due to the conversion. - addr := usermem.ByteOrder.Uint32(a.Addr[:]) - - fmt.Fprintf(w, "%08X:%04X ", addr, port) - case linux.AF_INET6: - var a linux.SockAddrInet6 - if i != nil { - a = *i.(*linux.SockAddrInet6) - } - - port := networkToHost16(a.Port) - addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4]) - addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8]) - addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12]) - addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16]) - fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) - } -} - -func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { - // t may be nil here if our caller is not part of a task goroutine. This can - // happen for example if we're here for "sentryctl cat". When t is nil, - // degrade gracefully and retrieve what we can. - t := kernel.TaskFromContext(ctx) - - for _, se := range k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) - continue - } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) - if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) - } - if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() - // Not tcp4 sockets. - continue - } - - // Linux's documentation for the fields below can be found at - // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. - // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). - // Note that the header doesn't contain labels for all the fields. - - // Field: sl; entry number. - fmt.Fprintf(buf, "%4d: ", se.ID) - - // Field: local_adddress. - var localAddr linux.SockAddr - if t != nil { - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = local - } - } - writeInetAddr(buf, family, localAddr) - - // Field: rem_address. - var remoteAddr linux.SockAddr - if t != nil { - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = remote - } - } - writeInetAddr(buf, family, remoteAddr) - - // Field: state; socket state. - fmt.Fprintf(buf, "%02X ", sops.State()) - - // Field: tx_queue, rx_queue; number of packets in the transmit and - // receive queue. Unimplemented. - fmt.Fprintf(buf, "%08X:%08X ", 0, 0) - - // Field: tr, tm->when; timer active state and number of jiffies - // until timer expires. Unimplemented. - fmt.Fprintf(buf, "%02X:%08X ", 0, 0) - - // Field: retrnsmt; number of unrecovered RTO timeouts. - // Unimplemented. - fmt.Fprintf(buf, "%08X ", 0) - - // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) - fmt.Fprintf(buf, "%5d ", 0) - } else { - creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) - } - - // Field: timeout; number of unanswered 0-window probes. - // Unimplemented. - fmt.Fprintf(buf, "%8d ", 0) - - // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) - - // Field: refcount. Don't count the ref we obtain while deferencing - // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) - - // Field: Socket struct address. Redacted due to the same reason as - // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. - fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) - - // Field: retransmit timeout. Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: predicted tick of soft clock (delayed ACK control data). - // Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: (ack.quick<<1)|ack.pingpong, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: sending congestion window, Unimplemented. - fmt.Fprintf(buf, "%d ", 0) - - // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. - // Unimplemented, report as large threshold. - fmt.Fprintf(buf, "%d", -1) - - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - - return nil -} - -// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. -// -// +stateify savable -type netTCPData struct { - kernfs.DynamicBytesFile - - kernel *kernel.Kernel -} - -var _ dynamicInode = (*netTCPData)(nil) - -func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { - buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") - return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) -} - -// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. -// -// +stateify savable -type netTCP6Data struct { - kernfs.DynamicBytesFile - - kernel *kernel.Kernel -} - -var _ dynamicInode = (*netTCP6Data)(nil) - -func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { - buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") - return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) -} - -// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. -// -// +stateify savable -type netUDPData struct { - kernfs.DynamicBytesFile - - kernel *kernel.Kernel -} - -var _ dynamicInode = (*netUDPData)(nil) - -// Generate implements vfs.DynamicBytesSource.Generate. -func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { - // t may be nil here if our caller is not part of a task goroutine. This can - // happen for example if we're here for "sentryctl cat". When t is nil, - // degrade gracefully and retrieve what we can. - t := kernel.TaskFromContext(ctx) - - for _, se := range d.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) - continue - } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) - if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) - } - if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() - // Not udp4 socket. - continue - } - - // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock(). - - // Field: sl; entry number. - fmt.Fprintf(buf, "%5d: ", se.ID) - - // Field: local_adddress. - var localAddr linux.SockAddrInet - if t != nil { - if local, _, err := sops.GetSockName(t); err == nil { - localAddr = *local.(*linux.SockAddrInet) - } - } - writeInetAddr(buf, linux.AF_INET, &localAddr) - - // Field: rem_address. - var remoteAddr linux.SockAddrInet - if t != nil { - if remote, _, err := sops.GetPeerName(t); err == nil { - remoteAddr = *remote.(*linux.SockAddrInet) - } - } - writeInetAddr(buf, linux.AF_INET, &remoteAddr) - - // Field: state; socket state. - fmt.Fprintf(buf, "%02X ", sops.State()) - - // Field: tx_queue, rx_queue; number of packets in the transmit and - // receive queue. Unimplemented. - fmt.Fprintf(buf, "%08X:%08X ", 0, 0) - - // Field: tr, tm->when. Always 0 for UDP. - fmt.Fprintf(buf, "%02X:%08X ", 0, 0) - - // Field: retrnsmt. Always 0 for UDP. - fmt.Fprintf(buf, "%08X ", 0) - - // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) - fmt.Fprintf(buf, "%5d ", 0) - } else { - creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) - } - - // Field: timeout. Always 0 for UDP. - fmt.Fprintf(buf, "%8d ", 0) - - // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) - - // Field: ref; reference count on the socket inode. Don't count the ref - // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) - - // Field: Socket struct address. Redacted due to the same reason as - // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. - fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) - - // Field: drops; number of dropped packets. Unimplemented. - fmt.Fprintf(buf, "%d", 0) - - fmt.Fprintf(buf, "\n") - - s.DecRef() - } - return nil -} - -// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp. -// -// +stateify savable -type netSnmpData struct { - kernfs.DynamicBytesFile - - stack inet.Stack -} - -var _ dynamicInode = (*netSnmpData)(nil) - -type snmpLine struct { - prefix string - header string -} - -var snmp = []snmpLine{ - { - prefix: "Ip", - header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", - }, - { - prefix: "Icmp", - header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", - }, - { - prefix: "IcmpMsg", - }, - { - prefix: "Tcp", - header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", - }, - { - prefix: "Udp", - header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", - }, - { - prefix: "UdpLite", - header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", - }, -} - -func toSlice(a interface{}) []uint64 { - v := reflect.Indirect(reflect.ValueOf(a)) - return v.Slice(0, v.Len()).Interface().([]uint64) -} - -func sprintSlice(s []uint64) string { - if len(s) == 0 { - return "" - } - r := fmt.Sprint(s) - return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. -} - -// Generate implements vfs.DynamicBytesSource. -func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { - types := []interface{}{ - &inet.StatSNMPIP{}, - &inet.StatSNMPICMP{}, - nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. - &inet.StatSNMPTCP{}, - &inet.StatSNMPUDP{}, - &inet.StatSNMPUDPLite{}, - } - for i, stat := range types { - line := snmp[i] - if stat == nil { - fmt.Fprintf(buf, "%s:\n", line.prefix) - fmt.Fprintf(buf, "%s:\n", line.prefix) - continue - } - if err := d.stack.Statistics(stat, line.prefix); err != nil { - if err == syserror.EOPNOTSUPP { - log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) - } else { - log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) - } - } - - fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) - - if line.prefix == "Tcp" { - tcp := stat.(*inet.StatSNMPTCP) - // "Tcp" needs special processing because MaxConn is signed. RFC 2012. - fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) - } else { - fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) - } - } - return nil -} - -// netRouteData implements vfs.DynamicBytesSource for /proc/net/route. -// -// +stateify savable -type netRouteData struct { - kernfs.DynamicBytesFile - - stack inet.Stack -} - -var _ dynamicInode = (*netRouteData)(nil) - -// Generate implements vfs.DynamicBytesSource. -// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. -func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") - - interfaces := d.stack.Interfaces() - for _, rt := range d.stack.RouteTable() { - // /proc/net/route only includes ipv4 routes. - if rt.Family != linux.AF_INET { - continue - } - - // /proc/net/route does not include broadcast or multicast routes. - if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { - continue - } - - iface, ok := interfaces[rt.OutputInterface] - if !ok || iface.Name == "lo" { - continue - } - - var ( - gw uint32 - prefix uint32 - flags = linux.RTF_UP - ) - if len(rt.GatewayAddr) == header.IPv4AddressSize { - flags |= linux.RTF_GATEWAY - gw = usermem.ByteOrder.Uint32(rt.GatewayAddr) - } - if len(rt.DstAddr) == header.IPv4AddressSize { - prefix = usermem.ByteOrder.Uint32(rt.DstAddr) - } - l := fmt.Sprintf( - "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", - iface.Name, - prefix, - gw, - flags, - 0, // RefCnt. - 0, // Use. - 0, // Metric. - (uint32(1)< Date: Fri, 13 Mar 2020 11:40:13 -0700 Subject: Implement access/faccessat for VFS2. Note that the raw faccessat system call does not actually take a flags argument; according to faccessat(2), the glibc wrapper implements the flags by using fstatat(2). Remove the flag argument that we try to extract from vfs1, which would just be a garbage value. Updates #1965 Fixes #2101 PiperOrigin-RevId: 300796067 --- pkg/sentry/fsimpl/ext/filesystem.go | 10 ++++++ pkg/sentry/fsimpl/gofer/filesystem.go | 13 ++++++++ pkg/sentry/fsimpl/kernfs/filesystem.go | 14 ++++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 12 +++++++ pkg/sentry/syscalls/linux/sys_file.go | 16 ++++++--- pkg/sentry/syscalls/linux/vfs2/stat.go | 60 +++++++++++++++++++++++++++++++--- pkg/sentry/vfs/anonfs.go | 25 +++++++++++--- pkg/sentry/vfs/filesystem.go | 4 +++ pkg/sentry/vfs/vfs.go | 17 ++++++++++ 9 files changed, 157 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index e05429d41..8497be615 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -255,6 +256,15 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID. } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + _, inode, err := fs.walk(rp, false) + if err != nil { + return err + } + return inode.checkPermissions(rp.Credentials(), ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { vfsd, inode, err := fs.walk(rp, false) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 5cfb0dc4c..38e4cdbc5 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -499,6 +500,18 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { putDentrySlice(*ds) } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats, d.isDir()) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 1d7e04ad4..3288de290 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -229,6 +230,19 @@ func (fs *Filesystem) Sync(ctx context.Context) error { return nil } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + defer fs.processDeferredDecRefs() + + _, inode, err := fs.walkExistingLocked(ctx, rp) + if err != nil { + return err + } + return inode.CheckPermissions(ctx, creds, ats) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e1b551422..02637fca6 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -154,6 +155,17 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return create(parent, name) } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return err + } + return d.inode.checkPermissions(creds, ats, d.inode.isDir()) +} + // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index d10a9bed8..35a98212a 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -514,7 +514,7 @@ func (ac accessContext) Value(key interface{}) interface{} { } } -func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode uint) error { +func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 @@ -529,7 +529,7 @@ func accessAt(t *kernel.Task, dirFD int32, addr usermem.Addr, resolve bool, mode return syserror.EINVAL } - return fileOpOn(t, dirFD, path, resolve, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { + return fileOpOn(t, dirFD, path, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error { // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. // @@ -564,17 +564,23 @@ func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() mode := args[1].ModeT() - return 0, nil, accessAt(t, linux.AT_FDCWD, addr, true, mode) + return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode) } // Faccessat implements linux syscall faccessat(2). +// +// Note that the faccessat() system call does not take a flags argument: +// "The raw faccessat() system call takes only the first three arguments. The +// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within +// the glibc wrapper function for faccessat(). If either of these flags is +// specified, then the wrapper function employs fstatat(2) to determine access +// permissions." - faccessat(2) func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() - flags := args[3].Int() - return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode) + return 0, nil, accessAt(t, dirFD, addr, mode) } // LINT.ThenChange(vfs2/filesystem.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index 12c532310..a74ea6fd5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -228,14 +228,64 @@ func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc // Access implements Linux syscall access(2). func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - // FIXME(jamieliu): actually implement - return 0, nil, nil + addr := args[0].Pointer() + mode := args[1].ModeT() + + return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode) } -// Faccessat implements Linux syscall access(2). +// Faccessat implements Linux syscall faccessat(2). +// +// Note that the faccessat() system call does not take a flags argument: +// "The raw faccessat() system call takes only the first three arguments. The +// AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within +// the glibc wrapper function for faccessat(). If either of these flags is +// specified, then the wrapper function employs fstatat(2) to determine access +// permissions." - faccessat(2) func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - // FIXME(jamieliu): actually implement - return 0, nil, nil + dirfd := args[0].Int() + addr := args[1].Pointer() + mode := args[2].ModeT() + + return 0, nil, accessAt(t, dirfd, addr, mode) +} + +func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error { + const rOK = 4 + const wOK = 2 + const xOK = 1 + + // Sanity check the mode. + if mode&^(rOK|wOK|xOK) != 0 { + return syserror.EINVAL + } + + path, err := copyInPath(t, pathAddr) + if err != nil { + return err + } + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + if err != nil { + return err + } + + // access(2) and faccessat(2) check permissions using real + // UID/GID, not effective UID/GID. + // + // "access() needs to use the real uid/gid, not the effective + // uid/gid. We do this by temporarily clearing all FS-related + // capabilities and switching the fsuid/fsgid around to the + // real ones." -fs/open.c:faccessat + creds := t.Credentials().Fork() + creds.EffectiveKUID = creds.RealKUID + creds.EffectiveKGID = creds.RealKGID + if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { + creds.EffectiveCaps = creds.PermittedCaps + } else { + creds.EffectiveCaps = 0 + } + + return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) } // Readlinkat implements Linux syscall mknodat(2). diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 2db25be49..925996517 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -41,7 +41,14 @@ func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry { } } -const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super() +const ( + anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super() + + // Mode, UID, and GID for a generic anonfs file. + anonFileMode = 0600 // no type is correct + anonFileUID = auth.RootKUID + anonFileGID = auth.RootKGID +) // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). @@ -69,6 +76,16 @@ func (fs *anonFilesystem) Sync(ctx context.Context) error { return nil } +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +// +// TODO(gvisor.dev/issue/1965): Implement access permissions. +func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { + if !rp.Done() { + return syserror.ENOTDIR + } + return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID) +} + // GetDentryAt implements FilesystemImpl.GetDentryAt. func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { if !rp.Done() { @@ -167,9 +184,9 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, Blksize: anonfsBlockSize, Nlink: 1, - UID: uint32(auth.RootKUID), - GID: uint32(auth.RootKGID), - Mode: 0600, // no type is correct + UID: uint32(anonFileUID), + GID: uint32(anonFileGID), + Mode: anonFileMode, Ino: 1, Size: 0, Blocks: 0, diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 556976d0b..c43dcff3d 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of @@ -144,6 +145,9 @@ type FilesystemImpl interface { // file data to be written to the underlying [filesystem]", as by syncfs(2). Sync(ctx context.Context) error + // AccessAt checks whether a user with creds can access the file at rp. + AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error + // GetDentryAt returns a Dentry representing the file at rp. A reference is // taken on the returned Dentry. // diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 365e8b30d..2e2880171 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -174,6 +174,23 @@ type PathOperation struct { FollowFinalSymlink bool } +// AccessAt checks whether a user with creds has access to the file at +// the given path. +func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { + rp := vfs.getResolvingPath(creds, pop) + for { + err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + // GetDentryAt returns a VirtualDentry representing the given path, at which a // file must exist. A reference is taken on the returned VirtualDentry. func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { -- cgit v1.2.3 From 1c0535297067179a822ba2dd9a6fe13a8be5a666 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 13 Mar 2020 13:17:59 -0700 Subject: Fix oom_score_adj. - Make oomScoreAdj a ThreadGroup field (Linux: signal_struct::oom_score_adj). - Avoid deadlock caused by Task.OOMScoreAdj()/SetOOMScoreAdj() locking Task.mu and TaskSet.mu in the wrong order (via Task.ExitState()). PiperOrigin-RevId: 300814698 --- pkg/sentry/fs/proc/task.go | 17 ++++++++++------- pkg/sentry/fsimpl/proc/task_files.go | 10 ++++++---- pkg/sentry/kernel/task.go | 29 ++++++----------------------- pkg/sentry/kernel/task_clone.go | 9 +++------ pkg/sentry/kernel/task_start.go | 4 ---- pkg/sentry/kernel/thread_group.go | 7 +++++++ 6 files changed, 32 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 03cc788c8..d6c5dd2c1 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -853,15 +853,15 @@ func (o *oomScoreAdj) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.F // Read implements fs.FileOperations.Read. func (f *oomScoreAdjFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - if offset != 0 { - return 0, io.EOF + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH } - adj, err := f.t.OOMScoreAdj() - if err != nil { - return 0, err + var buf bytes.Buffer + fmt.Fprintf(&buf, "%d\n", f.t.OOMScoreAdj()) + if offset >= int64(buf.Len()) { + return 0, io.EOF } - adjBytes := []byte(strconv.FormatInt(int64(adj), 10) + "\n") - n, err := dst.CopyOut(ctx, adjBytes) + n, err := dst.CopyOut(ctx, buf.Bytes()[offset:]) return int64(n), err } @@ -880,6 +880,9 @@ func (f *oomScoreAdjFile) Write(ctx context.Context, _ *fs.File, src usermem.IOS return 0, err } + if f.t.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } if err := f.t.SetOOMScoreAdj(v); err != nil { return 0, err } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 5a231ac86..4d3332771 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -539,11 +539,10 @@ var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error { - adj, err := o.task.OOMScoreAdj() - if err != nil { - return err + if o.task.ExitState() == kernel.TaskExitDead { + return syserror.ESRCH } - fmt.Fprintf(buf, "%d\n", adj) + fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj()) return nil } @@ -562,6 +561,9 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset return 0, err } + if o.task.ExitState() == kernel.TaskExitDead { + return 0, syserror.ESRCH + } if err := o.task.SetOOMScoreAdj(v); err != nil { return 0, err } diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c0dbbe890..8452ddf5b 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -555,13 +555,6 @@ type Task struct { // // startTime is protected by mu. startTime ktime.Time - - // oomScoreAdj is the task's OOM score adjustment. This is currently not - // used but is maintained for consistency. - // TODO(gvisor.dev/issue/1967) - // - // oomScoreAdj is protected by mu, and is owned by the task goroutine. - oomScoreAdj int32 } func (t *Task) savePtraceTracer() *Task { @@ -856,27 +849,17 @@ func (t *Task) ContainerID() string { return t.containerID } -// OOMScoreAdj gets the task's OOM score adjustment. -func (t *Task) OOMScoreAdj() (int32, error) { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return 0, syserror.ESRCH - } - return t.oomScoreAdj, nil +// OOMScoreAdj gets the task's thread group's OOM score adjustment. +func (t *Task) OOMScoreAdj() int32 { + return atomic.LoadInt32(&t.tg.oomScoreAdj) } -// SetOOMScoreAdj sets the task's OOM score adjustment. The value should be -// between -1000 and 1000 inclusive. +// SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The +// value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { - t.mu.Lock() - defer t.mu.Unlock() - if t.ExitState() == TaskExitDead { - return syserror.ESRCH - } if adj > 1000 || adj < -1000 { return syserror.EINVAL } - t.oomScoreAdj = adj + atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index dda502bb8..e1ecca99e 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -15,6 +15,8 @@ package kernel import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -260,15 +262,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) + tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } - adj, err := t.OOMScoreAdj() - if err != nil { - return 0, nil, err - } - cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, @@ -287,7 +285,6 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), - OOMScoreAdj: adj, } if opts.NewThreadGroup { cfg.Parent = t diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 2bbf48bb8..a5035bb7f 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -93,9 +93,6 @@ type TaskConfig struct { // ContainerID is the container the new task belongs to. ContainerID string - - // oomScoreAdj is the task's OOM score adjustment. - OOMScoreAdj int32 } // NewTask creates a new task defined by cfg. @@ -146,7 +143,6 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) { rseqSignature: cfg.RSeqSignature, futexWaiter: futex.NewWaiter(), containerID: cfg.ContainerID, - oomScoreAdj: cfg.OOMScoreAdj, } t.creds.Store(cfg.Credentials) t.endStopCond.L = &t.tg.signalHandlers.mu diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 268f62e9d..52849f5b3 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -254,6 +254,13 @@ type ThreadGroup struct { // // tty is protected by the signal mutex. tty *TTY + + // oomScoreAdj is the thread group's OOM score adjustment. This is + // currently not used but is maintained for consistency. + // TODO(gvisor.dev/issue/1967) + // + // oomScoreAdj is accessed using atomic memory operations. + oomScoreAdj int32 } // NewThreadGroup returns a new, empty thread group in PID namespace pidns. The -- cgit v1.2.3 From 45a8ae240dd180f1b8b4c56e77ac67e4cd3af96f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 13 Mar 2020 18:56:05 -0700 Subject: Add remaining procfs files Closes #1195 PiperOrigin-RevId: 300867055 --- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 8 + pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/proc/BUILD | 4 + pkg/sentry/fsimpl/proc/subtasks.go | 6 +- pkg/sentry/fsimpl/proc/task.go | 26 +-- pkg/sentry/fsimpl/proc/task_fds.go | 287 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/task_files.go | 251 +++++++++++++++++++++++- pkg/sentry/fsimpl/proc/tasks.go | 30 ++- pkg/sentry/fsimpl/proc/tasks_files.go | 52 +++-- pkg/sentry/fsimpl/proc/tasks_test.go | 87 +++++++-- pkg/sentry/fsimpl/testutil/BUILD | 2 + pkg/sentry/fsimpl/testutil/kernel.go | 27 ++- pkg/sentry/kernel/fd_table.go | 7 +- 13 files changed, 709 insertions(+), 80 deletions(-) create mode 100644 pkg/sentry/fsimpl/proc/task_fds.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index d50018b18..94ca3dbdd 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -554,3 +554,11 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) return fd.VFSFileDescription(), nil } + +// AlwaysValid partially implements kernfs.inodeDynamicLookup. +type AlwaysValid struct{} + +// Valid implements kernfs.inodeDynamicLookup. +func (*AlwaysValid) Valid(context.Context) bool { + return true +} diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a8ab2a2ba..18a34a590 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -319,7 +319,7 @@ type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). - CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error + CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index bb609a305..8156984eb 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -8,6 +8,7 @@ go_library( "filesystem.go", "subtasks.go", "task.go", + "task_fds.go", "task_files.go", "task_net.go", "tasks.go", @@ -19,8 +20,10 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/fs", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -53,6 +56,7 @@ go_test( "//pkg/fspath", "//pkg/sentry/contexttest", "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 611645f3f..a3a7c16a5 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -34,6 +34,7 @@ type subtasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid task *kernel.Task pidns *kernel.PIDNamespace @@ -61,11 +62,6 @@ func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenera return dentry } -// Valid implements kernfs.inodeDynamicLookup. -func (i *subtasksInode) Valid(ctx context.Context) bool { - return true -} - // Lookup implements kernfs.inodeDynamicLookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { tid, err := strconv.ParseUint(name, 10, 32) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 493acbd1b..4891caab6 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -45,19 +45,19 @@ var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - //"exe": newExe(t, msrc), - //"fd": newFdDir(t, msrc), - //"fdinfo": newFdInfoDir(t, msrc), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - //"mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), - //"mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), - "net": newTaskNetDir(task, inoGen), + "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), + "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": newComm(task, inoGen.NextIno(), 0444), + "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": newExeSymlink(task, inoGen.NextIno()), + "fd": newFDDirInode(task, inoGen), + "fdinfo": newFDInfoDirInode(task, inoGen), + "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), + "net": newTaskNetDir(task, inoGen), "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go new file mode 100644 index 000000000..76bfc5307 --- /dev/null +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -0,0 +1,287 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "sort" + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type fdDir struct { + inoGen InoGenerator + task *kernel.Task + + // When produceSymlinks is set, dirents produces for the FDs are reported + // as symlink. Otherwise, they are reported as regular files. + produceSymlink bool +} + +func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) { + fd, err := strconv.ParseUint(name, 10, 64) + if err != nil { + return nil, kernel.FDFlags{}, syserror.ENOENT + } + + var ( + file *vfs.FileDescription + flags kernel.FDFlags + ) + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + file, flags = fdTable.GetVFS2(int32(fd)) + } + }) + if file == nil { + return nil, kernel.FDFlags{}, syserror.ENOENT + } + return file, flags, nil +} + +// IterDirents implements kernfs.inodeDynamicLookup. +func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) { + var fds []int32 + i.task.WithMuLocked(func(t *kernel.Task) { + if fdTable := t.FDTable(); fdTable != nil { + fds = fdTable.GetFDs() + } + }) + + offset := absOffset + relOffset + typ := uint8(linux.DT_REG) + if i.produceSymlink { + typ = linux.DT_LNK + } + + // Find the appropriate starting point. + idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) }) + if idx >= len(fds) { + return offset, nil + } + for _, fd := range fds[idx:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(fd), 10), + Type: typ, + Ino: i.inoGen.NextIno(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} + +// fdDirInode represents the inode for /proc/[pid]/fd directory. +// +// +stateify savable +type fdDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdDirInode)(nil) + +func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + produceSymlink: true, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + file, _, err := i.lookup(name) + if err != nil { + return nil, err + } + taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno()) + return taskDentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + return fd.VFSFileDescription(), nil +} + +// CheckPermissions implements kernfs.Inode. +// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. +func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + err := i.InodeAttrs.CheckPermissions(ctx, creds, ats) + if err == nil { + // Access granted, no extra check needed. + return nil + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the thread group + // corresponding to this directory. + if i.task.ThreadGroup() == t.ThreadGroup() { + // Access granted (overridden). + return nil + } + } + return err +} + +// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file. +// +// +stateify savable +type fdSymlink struct { + refs.AtomicRefCount + kernfs.InodeAttrs + kernfs.InodeSymlink + + file *vfs.FileDescription +} + +var _ kernfs.Inode = (*fdSymlink)(nil) + +func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry { + file.IncRef() + inode := &fdSymlink{file: file} + inode.Init(creds, ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + root := vfs.RootFromContext(ctx) + defer root.DecRef() + + vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() + return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry()) +} + +func (s *fdSymlink) DecRef() { + s.AtomicRefCount.DecRefWithDestructor(func() { + s.Destroy() + }) +} + +func (s *fdSymlink) Destroy() { + s.file.DecRef() +} + +// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. +// +// +stateify savable +type fdInfoDirInode struct { + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeAttrs + kernfs.OrderedChildren + kernfs.AlwaysValid + fdDir +} + +var _ kernfs.Inode = (*fdInfoDirInode)(nil) + +func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { + inode := &fdInfoDirInode{ + fdDir: fdDir{ + inoGen: inoGen, + task: task, + }, + } + inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + + dentry := &kernfs.Dentry{} + dentry.Init(inode) + inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + + return dentry +} + +// Lookup implements kernfs.inodeDynamicLookup. +func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + file, flags, err := i.lookup(name) + if err != nil { + return nil, err + } + + data := &fdInfoData{file: file, flags: flags} + dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + return dentry.VFSDentry(), nil +} + +// Open implements kernfs.Inode. +func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + return fd.VFSFileDescription(), nil +} + +// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. +// +// +stateify savable +type fdInfoData struct { + kernfs.DynamicBytesFile + refs.AtomicRefCount + + file *vfs.FileDescription + flags kernel.FDFlags +} + +var _ dynamicInode = (*fdInfoData)(nil) + +func (d *fdInfoData) DecRef() { + d.AtomicRefCount.DecRefWithDestructor(d.destroy) +} + +func (d *fdInfoData) destroy() { + d.file.DecRef() +} + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + // TODO(b/121266871): Include pos, locks, and other data. For now we only + // have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags() + fmt.Fprintf(buf, "flags:\t0%o\n", flags) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 4d3332771..8c743df8d 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -18,10 +18,14 @@ import ( "bytes" "fmt" "io" + "sort" + "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -496,7 +500,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// ioUsage is the /proc//io and /proc//task//io data provider. +// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider. type ioUsage interface { // IOUsage returns the io usage data. IOUsage() *usage.IO @@ -570,3 +574,248 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset return n, nil } + +// exeSymlink is an symlink for the /proc/[pid]/exe file. +// +// +stateify savable +type exeSymlink struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeSymlink + + task *kernel.Task +} + +var _ kernfs.Inode = (*exeSymlink)(nil) + +func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { + inode := &exeSymlink{task: task} + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + + d := &kernfs.Dentry{} + d.Init(inode) + return d +} + +// Readlink implements kernfs.Inode. +func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return "", syserror.EACCES + } + + // Pull out the executable for /proc/[pid]/exe. + exec, err := s.executable() + if err != nil { + return "", err + } + defer exec.DecRef() + + return exec.PathnameWithDeleted(ctx), nil +} + +func (s *exeSymlink) executable() (file fsbridge.File, err error) { + s.task.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + // TODO(b/34851096): Check shouldn't allow Readlink once the + // Task is zombied. + err = syserror.EACCES + return + } + + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + file = mm.Executable() + if file == nil { + err = syserror.ENOENT + } + }) + return +} + +// forEachMountSource runs f for the process root mount and each mount that is +// a descendant of the root. +func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { + var fsctx *kernel.FSContext + t.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() + }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return + } + + // All mount points must be relative to the rootDir, and mounts outside + // will be excluded. + rootDir := fsctx.RootDirectory() + if rootDir == nil { + // The task has been destroyed. Nothing to show here. + return + } + defer rootDir.DecRef() + + mnt := t.MountNamespace().FindMount(rootDir) + if mnt == nil { + // Has it just been unmounted? + return + } + ms := t.MountNamespace().AllMountsUnder(mnt) + sort.Slice(ms, func(i, j int) bool { + return ms[i].ID < ms[j].ID + }) + for _, m := range ms { + mroot := m.Root() + if mroot == nil { + continue // No longer valid. + } + mountPath, desc := mroot.FullName(rootDir) + mroot.DecRef() + if !desc { + // MountSources that are not descendants of the chroot jail are ignored. + continue + } + fn(mountPath, m) + } +} + +// mountInfoData is used to implement /proc/[pid]/mountinfo. +// +// +stateify savable +type mountInfoData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + forEachMount(i.task, func(mountPath string, m *fs.Mount) { + mroot := m.Root() + if mroot == nil { + return // No longer valid. + } + defer mroot.DecRef() + + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) MountSource ID. + fmt.Fprintf(buf, "%d ", m.ID) + + // (2) Parent ID (or this ID if there is no parent). + pID := m.ID + if !m.IsRoot() && !m.IsUndo() { + pID = m.ParentID + } + fmt.Fprintf(buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + sa := mroot.Inode.StableAttr + fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE(b/78135857): This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(buf, "%s ", mountPath) + + // (6) Mount options. + flags := mroot.Inode.MountSource.Flags + opts := "rw" + if flags.ReadOnly { + opts = "ro" + } + if flags.NoAtime { + opts += ",noatime" + } + if flags.NoExec { + opts += ",noexec" + } + fmt.Fprintf(buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(buf, "- ") + + // (9) Filesystem type. + fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType) + + // (10) Mount source: filesystem-specific information or "none". + fmt.Fprintf(buf, "none ") + + // (11) Superblock options, and final newline. + fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource)) + }) + return nil +} + +func superBlockOpts(mountPath string, msrc *fs.MountSource) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if msrc.Flags.ReadOnly { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if msrc.FilesystemType == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} + +// mountsData is used to implement /proc/[pid]/mounts. +// +// +stateify savable +type mountsData struct { + kernfs.DynamicBytesFile + + task *kernel.Task +} + +var _ dynamicInode = (*mountInfoData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + forEachMount(i.task, func(mountPath string, m *fs.Mount) { + // Format: + // + // + // We use the filesystem name as the first field, since there + // is no real block device we can point to, and we also should + // not expose anything about the remote filesystem. + // + // Only ro/rw option is supported for now. + // + // The "needs dump"and fsck flags are always 0, which is allowed. + root := m.Root() + if root == nil { + return // No longer valid. + } + defer root.DecRef() + + flags := root.Inode.MountSource.Flags + opts := "rw" + if flags.ReadOnly { + opts = "ro" + } + fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) + }) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index d203cebd4..07115664c 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -46,6 +46,7 @@ type tasksInode struct { kernfs.InodeDirectoryNoNewChildren kernfs.InodeAttrs kernfs.OrderedChildren + kernfs.AlwaysValid inoGen InoGenerator pidns *kernel.PIDNamespace @@ -66,23 +67,23 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), - //"filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), + "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), + "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), + "sys": newSysDir(root, inoGen, k), + "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), + "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), + "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), + "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, inoGen: inoGen, - selfSymlink: newSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), 0444, pidns).VFSDentry(), + selfSymlink: newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) @@ -121,11 +122,6 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return taskDentry.VFSDentry(), nil } -// Valid implements kernfs.inodeDynamicLookup. -func (i *tasksInode) Valid(ctx context.Context) bool { - return true -} - // IterDirents implements kernfs.inodeDynamicLookup. func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 434998910..b99badba8 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -40,9 +40,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -72,9 +72,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, perm linux.FileMode, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|perm) + inode.Init(creds, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -138,21 +138,19 @@ func (c cpuStats) String() string { // +stateify savable type statData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(buf, "cpu %s\n", cpu) - for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + k := kernel.KernelFromContext(ctx) + for c, max := uint(0), k.ApplicationCores(); c < max; c++ { fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) } @@ -176,7 +174,7 @@ func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. - fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) // Total number of clones. // TODO(b/37226836): Count this. @@ -209,7 +207,7 @@ type loadavgData struct { var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { +func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods. // Column 4-5: currently running processes and the total number of processes. @@ -223,16 +221,14 @@ func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // +stateify savable type meminfoData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - mf := d.k.MemoryFile() +func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + mf := k.MemoryFile() mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) @@ -295,16 +291,14 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { // +stateify savable type versionData struct { kernfs.DynamicBytesFile - - // k is the owning Kernel. - k *kernel.Kernel } var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. -func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { - init := v.k.GlobalInit() +func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + init := k.GlobalInit() if init == nil { // Attempted to read before the init Task is created. This can // only occur during startup, which should never need to read @@ -335,3 +329,19 @@ func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) return nil } + +// filesystemsData backs /proc/filesystems. +// +// +stateify savable +type filesystemsData struct { + kernfs.DynamicBytesFile +} + +var _ dynamicInode = (*filesystemsData)(nil) + +// Generate implements vfs.DynamicBytesSource.Generate. +func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error { + k := kernel.KernelFromContext(ctx) + k.VFS().GenerateProcFilesystems(buf) + return nil +} diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 1bb9430c0..d0f97c137 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -47,6 +48,7 @@ var ( var ( tasksStaticFiles = map[string]testutil.DirentType{ "cpuinfo": linux.DT_REG, + "filesystems": linux.DT_REG, "loadavg": linux.DT_REG, "meminfo": linux.DT_REG, "mounts": linux.DT_LNK, @@ -68,9 +70,14 @@ var ( "cmdline": linux.DT_REG, "comm": linux.DT_REG, "environ": linux.DT_REG, + "exe": linux.DT_LNK, + "fd": linux.DT_DIR, + "fdinfo": linux.DT_DIR, "gid_map": linux.DT_REG, "io": linux.DT_REG, "maps": linux.DT_REG, + "mountinfo": linux.DT_REG, + "mounts": linux.DT_REG, "net": linux.DT_DIR, "ns": linux.DT_DIR, "oom_score": linux.DT_REG, @@ -96,17 +103,37 @@ func setup(t *testing.T) *testutil.System { k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, }) - fsOpts := vfs.GetFilesystemOptions{ - InternalData: &InternalData{ - Cgroups: map[string]string{ - "cpuset": "/foo/cpuset", - "memory": "/foo/memory", + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + pop := &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil { + t.Fatalf("MkDir(/proc): %v", err) + } + + pop = &vfs.PathOperation{ + Root: mntns.Root(), + Start: mntns.Root(), + Path: fspath.Parse("/proc"), + } + mntOpts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + InternalData: &InternalData{ + Cgroups: map[string]string{ + "cpuset": "/foo/cpuset", + "memory": "/foo/memory", + }, }, }, } - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", Name, &fsOpts) - if err != nil { - t.Fatalf("NewMountNamespace(): %v", err) + if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil { + t.Fatalf("MountAt(/proc): %v", err) } return testutil.NewSystem(ctx, t, k.VFS(), mntns) } @@ -115,7 +142,7 @@ func TestTasksEmpty(t *testing.T) { s := setup(t) defer s.Destroy() - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, tasksStaticFiles) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) } @@ -141,7 +168,7 @@ func TestTasks(t *testing.T) { expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR } - collector := s.ListDirents(s.PathOpAtRoot("/")) + collector := s.ListDirents(s.PathOpAtRoot("/proc")) s.AssertAllDirentTypes(collector, expectedDirents) s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs) @@ -181,7 +208,7 @@ func TestTasks(t *testing.T) { } // Test lookup. - for _, path := range []string{"/1", "/2"} { + for _, path := range []string{"/proc/1", "/proc/2"} { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, @@ -191,6 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } + defer fd.DecRef() buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -201,10 +229,10 @@ func TestTasks(t *testing.T) { if _, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/9999"), + s.PathOpAtRoot("/proc/9999"), &vfs.OpenOptions{}, ); err != syserror.ENOENT { - t.Fatalf("wrong error from vfsfs.OpenAt(/9999): %v", err) + t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err) } } @@ -302,12 +330,13 @@ func TestTasksOffset(t *testing.T) { fd, err := s.VFS.OpenAt( s.Ctx, s.Creds, - s.PathOpAtRoot("/"), + s.PathOpAtRoot("/proc"), &vfs.OpenOptions{}, ) if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } + defer fd.DecRef() if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -344,7 +373,7 @@ func TestTask(t *testing.T) { t.Fatalf("CreateTask(): %v", err) } - collector := s.ListDirents(s.PathOpAtRoot("/1")) + collector := s.ListDirents(s.PathOpAtRoot("/proc/1")) s.AssertAllDirentTypes(collector, taskStaticFiles) } @@ -362,14 +391,14 @@ func TestProcSelf(t *testing.T) { collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{ Root: s.Root, Start: s.Root, - Path: fspath.Parse("/self/"), + Path: fspath.Parse("/proc/self/"), FollowFinalSymlink: true, }) s.AssertAllDirentTypes(collector, taskStaticFiles) } func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) { - t.Logf("Iterating: /proc%s", fd.MappedName(ctx)) + t.Logf("Iterating: %s", fd.MappedName(ctx)) var collector testutil.DirentCollector if err := fd.IterDirents(ctx, &collector); err != nil { @@ -412,6 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) continue } + defer child.DecRef() stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", childPath, err) @@ -432,6 +462,22 @@ func TestTree(t *testing.T) { defer s.Destroy() k := kernel.KernelFromContext(s.Ctx) + + pop := &vfs.PathOperation{ + Root: s.Root, + Start: s.Root, + Path: fspath.Parse("test-file"), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + defer file.DecRef() + var tasks []*kernel.Task for i := 0; i < 5; i++ { tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -439,6 +485,8 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("CreateTask(): %v", err) } + // Add file to populate /proc/[pid]/fd and fdinfo directories. + task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{}) tasks = append(tasks, task) } @@ -446,11 +494,12 @@ func TestTree(t *testing.T) { fd, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(s.Ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/")}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")}, &vfs.OpenOptions{}, ) if err != nil { - t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) + t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) + fd.DecRef() } diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD index e4f36f4ae..0e4053a46 100644 --- a/pkg/sentry/fsimpl/testutil/BUILD +++ b/pkg/sentry/fsimpl/testutil/BUILD @@ -16,12 +16,14 @@ go_library( "//pkg/cpuid", "//pkg/fspath", "//pkg/memutil", + "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/sched", "//pkg/sentry/limits", "//pkg/sentry/loader", + "//pkg/sentry/mm", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/platform/kvm", diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 488478e29..c16a36cdb 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -23,13 +23,16 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/memutil" + "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" + "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/time" @@ -123,10 +126,17 @@ func Boot() (*kernel.Kernel, error) { // CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) + if err != nil { + return nil, err + } + m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) + m.SetExecutable(fsbridge.NewVFSFile(exe)) + config := &kernel.TaskConfig{ Kernel: k, ThreadGroup: tc, - TaskContext: &kernel.TaskContext{Name: name}, + TaskContext: &kernel.TaskContext{Name: name, MemoryManager: m}, Credentials: auth.CredentialsFromContext(ctx), NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.ApplicationCores()), @@ -135,10 +145,25 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(), MountNamespaceVFS2: mntns, FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), + FDTable: k.NewFDTable(), } return k.TaskSet().NewTask(config) } +func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { + const name = "executable" + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + } + opts := &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_CREAT, + Mode: 0777, + } + return vfsObj.OpenAt(ctx, creds, pop, opts) +} + func createMemoryFile() (*pgalloc.MemoryFile, error) { const memfileName = "test-memory" memfd, err := memutil.CreateMemFD(memfileName, 0) diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 00f914564..7de2e509e 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -191,7 +191,7 @@ func (f *FDTable) Size() int { return int(size) } -// forEach iterates over all non-nil files. +// forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { @@ -458,7 +458,10 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { } } -// GetFDs returns a list of valid fds. +// GetFDs returns a sorted list of valid fds. +// +// Precondition: The caller must be running on the task goroutine, or Task.mu +// must be locked. func (f *FDTable) GetFDs() []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { -- cgit v1.2.3 From 5e413cad10d2358a21dd08216953faee70e62a0b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 14 Mar 2020 07:13:15 -0700 Subject: Plumb VFS2 imported fds into virtual filesystem. - When setting up the virtual filesystem, mount a host.filesystem to contain all files that need to be imported. - Make read/preadv syscalls to the host in cases where preadv2 may not be supported yet (likewise for writing). - Make save/restore functions in kernel/kernel.go return early if vfs2 is enabled. PiperOrigin-RevId: 300922353 --- pkg/abi/linux/file.go | 3 + pkg/sentry/fs/host/control.go | 2 + pkg/sentry/fsimpl/host/BUILD | 2 + pkg/sentry/fsimpl/host/default_file.go | 45 +++++++----- pkg/sentry/fsimpl/host/host.go | 124 ++++++++++++++++++++++++++++++--- pkg/sentry/fsimpl/host/util.go | 28 ++------ pkg/sentry/kernel/kernel.go | 40 +++++++---- pkg/sentry/syscalls/linux/sys_stat.go | 5 +- pkg/sentry/syscalls/linux/vfs2/stat.go | 6 +- runsc/boot/filter/config.go | 1 + test/syscalls/linux/stat.cc | 60 ++++++++++++++-- 11 files changed, 246 insertions(+), 70 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index e229ac21c..dbe58acbe 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -266,6 +266,9 @@ type Statx struct { DevMinor uint32 } +// SizeOfStatx is the size of a Statx struct. +var SizeOfStatx = binary.Size(Statx{}) + // FileMode represents a mode_t. type FileMode uint16 diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 1658979fc..cd84e1337 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -32,6 +32,8 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. +// +// TODO(gvisor.dev/issue/2017): Port to VFS2. func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max var trunc bool diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 731f192b3..5d67f88e3 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -9,9 +9,11 @@ go_library( "host.go", "util.go", ], + visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fd", "//pkg/log", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go index 172cdb161..98682ba5e 100644 --- a/pkg/sentry/fsimpl/host/default_file.go +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -21,6 +21,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -64,9 +65,7 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read // rather than retrying until complete. @@ -86,16 +85,22 @@ func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts v return n, err } -func readFromHostFD(ctx context.Context, fd int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + if flags != 0 { return 0, syserror.EOPNOTSUPP } - reader := safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv2(fd, srcs, offset, flags) - return int64(n), err - }, + var reader safemem.Reader + if offset == -1 { + reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} + } else { + reader = safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := dst.CopyOutFrom(ctx, reader) return int64(n), err @@ -120,9 +125,7 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } - f.mu.Lock() n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - f.mu.Unlock() if isBlockError(err) { err = syserror.ErrWouldBlock } @@ -137,16 +140,22 @@ func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts return n, err } -func writeToHostFD(ctx context.Context, fd int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - if flags&^(linux.RWF_VALID) != 0 { +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. + if flags != 0 { return 0, syserror.EOPNOTSUPP } - writer := safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev2(fd, srcs, offset, flags) - return int64(n), err - }, + var writer safemem.Writer + if offset == -1 { + writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} + } else { + writer = safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev(hostFD, srcs, offset) + return int64(n), err + }, + } } n, err := src.CopyInTo(ctx, writer) return int64(n), err diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c205e6a0b..0be812d13 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -38,10 +38,19 @@ type filesystem struct { kernfs.Filesystem } +// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. +func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { + fs := &filesystem{} + fs.Init(vfsObj) + vfsfs := fs.VFSFilesystem() + // NewDisconnectedMount will take an additional reference on vfsfs. + defer vfsfs.DecRef() + return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{}) +} + // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { - // Must be importing to a mount of host.filesystem. - fs, ok := mnt.Filesystem().Impl().(*filesystem) + fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } @@ -54,8 +63,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets can return EWOULDBLOCK for - // operations that would block. + // Pipes, character devices, and sockets. isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK i := &inode{ @@ -143,11 +151,109 @@ func (i *inode) Mode() linux.FileMode { // Stat implements kernfs.Inode. func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + if opts.Mask&linux.STATX__RESERVED != 0 { + return linux.Statx{}, syserror.EINVAL + } + if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { + return linux.Statx{}, syserror.EINVAL + } + + // Limit our host call only to known flags. + mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t - if err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(opts.Mask), &s); err != nil { + err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + if err == syserror.ENOSYS { + return i.fstat(opts) + } else if err != nil { + return linux.Statx{}, err + } + + ls := linux.Statx{Mask: mask} + // Unconditionally fill blksize, attributes, and device numbers, as indicated + // by /include/uapi/linux/stat.h. + // + // RdevMajor/RdevMinor are left as zero, so as not to expose host device + // numbers. + // + // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined + // device numbers. If we use the device number from the host, it may collide + // with another sentry-internal device number. We handle device/inode + // numbers without relying on the host to prevent collisions. + ls.Blksize = s.Blksize + ls.Attributes = s.Attributes + ls.AttributesMask = s.Attributes_mask + + if mask|linux.STATX_TYPE != 0 { + ls.Mode |= s.Mode & linux.S_IFMT + } + if mask|linux.STATX_MODE != 0 { + ls.Mode |= s.Mode &^ linux.S_IFMT + } + if mask|linux.STATX_NLINK != 0 { + ls.Nlink = s.Nlink + } + if mask|linux.STATX_ATIME != 0 { + ls.Atime = unixToLinuxStatxTimestamp(s.Atime) + } + if mask|linux.STATX_BTIME != 0 { + ls.Btime = unixToLinuxStatxTimestamp(s.Btime) + } + if mask|linux.STATX_CTIME != 0 { + ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) + } + if mask|linux.STATX_MTIME != 0 { + ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) + } + if mask|linux.STATX_SIZE != 0 { + ls.Size = s.Size + } + if mask|linux.STATX_BLOCKS != 0 { + ls.Blocks = s.Blocks + } + + // Use our own internal inode number and file owner. + if mask|linux.STATX_INO != 0 { + ls.Ino = i.ino + } + if mask|linux.STATX_UID != 0 { + ls.UID = uint32(i.uid) + } + if mask|linux.STATX_GID != 0 { + ls.GID = uint32(i.gid) + } + + return ls, nil +} + +// fstat is a best-effort fallback for inode.Stat() if the host does not +// support statx(2). +// +// We ignore the mask and sync flags in opts and simply supply +// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification +// of a mask or sync flags. fstat(2) does not provide any metadata +// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so +// those fields remain empty. +func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { + var s unix.Stat_t + if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - ls := unixToLinuxStatx(s) + + // Note that rdev numbers are left as 0; do not expose host device numbers. + ls := linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + Mode: uint16(s.Mode), + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + } // Use our own internal inode number and file owner. // @@ -159,9 +265,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.UID = uint32(i.uid) ls.GID = uint32(i.gid) - // Update file mode from the host. - i.mode = linux.FileMode(ls.Mode) - return ls, nil } @@ -217,7 +320,6 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio } func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - fileType := i.mode.FileType() if fileType == syscall.S_IFSOCK { if i.isTTY { @@ -227,6 +329,8 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error return nil, errors.New("importing host sockets not supported") } + // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that + // we don't allow importing arbitrary file types without proper support. if i.isTTY { // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. return nil, errors.New("importing host fd as TTY not supported") diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index e1ccacb4d..d519feef5 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -35,34 +35,14 @@ func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { } } -func unixToLinuxStatx(s unix.Statx_t) linux.Statx { - return linux.Statx{ - Mask: s.Mask, - Blksize: s.Blksize, - Attributes: s.Attributes, - Nlink: s.Nlink, - UID: s.Uid, - GID: s.Gid, - Mode: s.Mode, - Ino: s.Ino, - Size: s.Size, - Blocks: s.Blocks, - AttributesMask: s.Attributes_mask, - Atime: unixToLinuxStatxTimestamp(s.Atime), - Btime: unixToLinuxStatxTimestamp(s.Btime), - Ctime: unixToLinuxStatxTimestamp(s.Ctime), - Mtime: unixToLinuxStatxTimestamp(s.Mtime), - RdevMajor: s.Rdev_major, - RdevMinor: s.Rdev_minor, - DevMajor: s.Dev_major, - DevMinor: s.Dev_minor, - } -} - func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp { return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec} } +func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp { + return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)} +} + // wouldBlock returns true for file types that can return EWOULDBLOCK // for blocking operations, e.g. pipes, character devices, and sockets. func wouldBlock(fileType uint32) bool { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 1d627564f..6feda8fa1 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -467,6 +467,11 @@ func (k *Kernel) flushMountSourceRefs() error { // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return nil + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -484,7 +489,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) } func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { - // TODO(gvisor.dev/issues/1663): Add save support for VFS2. + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil @@ -533,6 +538,11 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { } func (ts *TaskSet) unregisterEpollWaiters() { + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if VFS2Enabled { + return + } + ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { @@ -1005,11 +1015,14 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.PauseTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.PauseTimer() + } + }) + } } } k.timekeeper.PauseUpdates() @@ -1034,12 +1047,15 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { - tfd.ResumeTimer() - } - }) + // TODO(gvisor.dev/issue/1663): Add save support for VFS2. + if !VFS2Enabled { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + tfd.ResumeTimer() + } + }) + } } } } diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 9bd2df104..a11a87cd1 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -136,7 +136,10 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if mask&linux.STATX__RESERVED > 0 { + if mask&linux.STATX__RESERVED != 0 { + return 0, nil, syserror.EINVAL + } + if flags&^(linux.AT_SYMLINK_NOFOLLOW|linux.AT_EMPTY_PATH|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, syserror.EINVAL } if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE { diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index a74ea6fd5..97eaedd66 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -150,7 +150,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall mask := args[3].Uint() statxAddr := args[4].Pointer() - if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { + if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { + return 0, nil, syserror.EINVAL + } + + if mask&linux.STATX__RESERVED != 0 { return 0, nil, syserror.EINVAL } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index a4627905e..f459d1973 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -284,6 +284,7 @@ var allowedSyscalls = seccomp.SyscallRules{ {seccomp.AllowAny{}, seccomp.AllowValue(syscall.SHUT_RDWR)}, }, syscall.SYS_SIGALTSTACK: {}, + unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, syscall.SYS_TGKILL: []seccomp.Rule{ { diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index c951ac3b3..513b9cd1c 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -607,7 +607,7 @@ int statx(int dirfd, const char* pathname, int flags, unsigned int mask, } TEST_F(StatTest, StatxAbsPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -617,7 +617,7 @@ TEST_F(StatTest, StatxAbsPath) { } TEST_F(StatTest, StatxRelPathDirFD) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); struct kernel_statx stx; @@ -631,7 +631,7 @@ TEST_F(StatTest, StatxRelPathDirFD) { } TEST_F(StatTest, StatxRelPathCwd) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); ASSERT_THAT(chdir(GetAbsoluteTestTmpdir().c_str()), SyscallSucceeds()); @@ -643,7 +643,7 @@ TEST_F(StatTest, StatxRelPathCwd) { } TEST_F(StatTest, StatxEmptyPath) { - SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, 0) < 0 && + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && errno == ENOSYS); const auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDONLY)); @@ -653,6 +653,58 @@ TEST_F(StatTest, StatxEmptyPath) { EXPECT_TRUE(S_ISREG(stx.stx_mode)); } +TEST_F(StatTest, StatxDoesNotRejectExtraneousMaskBits) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set all mask bits except for STATX__RESERVED. + uint mask = 0xffffffff & ~0x80000000; + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, mask, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxRejectsReservedMaskBit) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + // Set STATX__RESERVED in the mask. + EXPECT_THAT(statx(-1, test_file_name_.c_str(), 0, 0x80000000, &stx), + SyscallFailsWithErrno(EINVAL)); +} + +TEST_F(StatTest, StatxSymlink) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + std::string parent_dir = "/tmp"; + TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent_dir, test_file_name_)); + std::string p = link.path(); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), AT_SYMLINK_NOFOLLOW, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISLNK(stx.stx_mode)); + EXPECT_THAT(statx(AT_FDCWD, p.c_str(), 0, STATX_ALL, &stx), + SyscallSucceeds()); + EXPECT_TRUE(S_ISREG(stx.stx_mode)); +} + +TEST_F(StatTest, StatxInvalidFlags) { + SKIP_IF(!IsRunningOnGvisor() && statx(-1, nullptr, 0, 0, nullptr) < 0 && + errno == ENOSYS); + + struct kernel_statx stx; + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx), + SyscallFailsWithErrno(EINVAL)); + EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), + 0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx), + SyscallFailsWithErrno(EINVAL)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 97127750289b49dd5e29f8ddb4209137e47fe52d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Sat, 14 Mar 2020 13:46:55 -0700 Subject: Disallow kernfs.Inode.SetStat for readonly inodes Updates #1195, #1193 PiperOrigin-RevId: 300950993 --- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 7 +++--- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 9 +++++-- pkg/sentry/fsimpl/kernfs/symlink.go | 7 ++++++ pkg/sentry/fsimpl/proc/subtasks.go | 5 ++++ pkg/sentry/fsimpl/proc/task.go | 10 +++----- pkg/sentry/fsimpl/proc/tasks.go | 16 ++++++++++++- pkg/sentry/fsimpl/proc/tasks_files.go | 33 ++++++++++++++++++++++---- pkg/sentry/fsimpl/sys/sys.go | 8 ++++--- 8 files changed, 74 insertions(+), 21 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1c026f4d8..0d27a8867 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -61,9 +61,10 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf return &fd.vfsfd, nil } -// SetStat implements Inode.SetStat. -func (f *DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { - // DynamicBytesFiles are immutable. +// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow +// inode attributes to be changed. Override SetStat() making it call +// f.InodeAttrs to allow it. +func (*DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 94ca3dbdd..4ed41326d 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -228,7 +228,7 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) stat.GID = atomic.LoadUint32(&a.gid) stat.Nlink = atomic.LoadUint32(&a.nlink) - // TODO: Implement other stat fields like timestamps. + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. return stat, nil } @@ -256,7 +256,7 @@ func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { // Note that not all fields are modifiable. For example, the file type and // inode numbers are immutable after node creation. - // TODO: Implement other stat fields like timestamps. + // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. return nil } @@ -555,6 +555,11 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs return fd.VFSFileDescription(), nil } +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*StaticDirectory) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM +} + // AlwaysValid partially implements kernfs.inodeDynamicLookup. type AlwaysValid struct{} diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0ee7eb9b7..41c5a3099 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -18,6 +18,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" ) // StaticSymlink provides an Inode implementation for symlinks that point to @@ -52,3 +54,8 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*StaticSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index a3a7c16a5..ea6d60f6e 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -127,3 +127,8 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux. } return stat, nil } + +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*subtasksInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 4891caab6..fae3fc5aa 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -107,13 +107,9 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO return fd.VFSFileDescription(), nil } -// SetStat implements kernfs.Inode. -func (i *taskInode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { - stat := opts.Stat - if stat.Mask&linux.STATX_MODE != 0 { - return syserror.EPERM - } - return nil +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*taskInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM } // taskOwnedInode implements kernfs.Inode and overrides inode owner with task diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 07115664c..9f2ef8200 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -67,7 +67,7 @@ var _ kernfs.Inode = (*tasksInode)(nil) func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(cpuInfoData(k))), + "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), "sys": newSysDir(root, inoGen, k), @@ -225,6 +225,20 @@ func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Sta return stat, nil } +// staticFileSetStat implements a special static file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. +type staticFileSetStat struct { + dynamicBytesFileSetAttr + vfs.StaticData +} + +var _ dynamicInode = (*staticFileSetStat)(nil) + +func newStaticFileSetStat(data string) *staticFileSetStat { + return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} +} + func cpuInfoData(k *kernel.Kernel) string { features := k.FeatureSet() if features == nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index b99badba8..20085bb39 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -62,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*selfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM +} + type threadSelfSymlink struct { kernfs.InodeAttrs kernfs.InodeNoopRefCount @@ -95,6 +101,23 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*threadSelfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { + return syserror.EPERM +} + +// dynamicBytesFileSetAttr implements a special file that allows inode +// attributes to be set. This is to support /proc files that are readonly, but +// allow attributes to be set. +type dynamicBytesFileSetAttr struct { + kernfs.DynamicBytesFile +} + +// SetStat implements Inode.SetStat. +func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { + return d.DynamicBytesFile.InodeAttrs.SetStat(fs, opts) +} + // cpuStats contains the breakdown of CPU time for /proc/stat. type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. @@ -137,7 +160,7 @@ func (c cpuStats) String() string { // // +stateify savable type statData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*statData)(nil) @@ -201,7 +224,7 @@ func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type loadavgData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*loadavgData)(nil) @@ -220,7 +243,7 @@ func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type meminfoData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*meminfoData)(nil) @@ -271,7 +294,7 @@ func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type uptimeData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*uptimeData)(nil) @@ -290,7 +313,7 @@ func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { // // +stateify savable type versionData struct { - kernfs.DynamicBytesFile + dynamicBytesFileSetAttr } var _ dynamicInode = (*versionData)(nil) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index c36c4fa11..3928ff2c8 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -94,15 +94,17 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &d.dentry } -// SetStat implements kernfs.Inode.SetStat. -func (d *dir) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { +// SetStat implements Inode.SetStat not allowing inode attributes to be changed. +func (*dir) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { return syserror.EPERM } // Open implements kernfs.Inode.Open. func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } -- cgit v1.2.3 From 0f60799a4f8c3db567973574147370fc900df55f Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 16 Mar 2020 13:28:00 -0700 Subject: Add calls to vfs.CheckSetStat to fsimpls Only gofer filesystem was calling vfs.CheckSetStat for vfs.FilesystemImpl.SetStatAt and vfs.FileDescriptionImpl.SetStat. Updates #1193, #1672, #1197 PiperOrigin-RevId: 301226522 --- pkg/sentry/fsimpl/host/host.go | 16 +++++++++++----- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 4 +++- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 14 ++++++++++++-- pkg/sentry/fsimpl/kernfs/kernfs.go | 6 ++++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 3 ++- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/tasks_files.go | 8 ++++---- pkg/sentry/fsimpl/sys/sys.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 11 +++++++++-- pkg/sentry/vfs/file_description.go | 3 ++- pkg/sentry/vfs/filesystem.go | 4 +++- 16 files changed, 57 insertions(+), 26 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 0be812d13..67c050c30 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -114,7 +114,8 @@ type inode struct { ino uint64 // mu protects the inode metadata below. - mu sync.Mutex + // TODO(gvisor.dev/issue/1672): actually protect fields below. + //mu sync.Mutex // mode is the file mode of this inode. Note that this value may become out // of date if the mode is changed on the host, e.g. with chmod. @@ -269,16 +270,20 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { } // SetStat implements kernfs.Inode. -func (i *inode) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { +func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := opts.Stat m := s.Mask if m == 0 { return nil } - if m&(linux.STATX_UID|linux.STATX_GID) != 0 { + if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } + if err := vfs.CheckSetStat(creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil { + return err + } + if m&linux.STATX_MODE != 0 { if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { return err @@ -375,8 +380,9 @@ type fileDescription struct { } // SetStat implements vfs.FileDescriptionImpl. -func (f *fileDescription) SetStat(_ context.Context, opts vfs.SetStatOptions) error { - return f.inode.SetStat(nil, opts) +func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + return f.inode.SetStat(nil, creds, opts) } // Stat implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 0d27a8867..c788d1d62 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -64,7 +64,7 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow // inode attributes to be changed. Override SetStat() making it call // f.InodeAttrs to allow it. -func (*DynamicBytesFile) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*DynamicBytesFile) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index da821d524..331c82011 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -17,6 +17,7 @@ package kernfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -206,6 +207,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { fs := fd.filesystem() + creds := auth.CredentialsFromContext(ctx) inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(fs, opts) + return inode.SetStat(fs, creds, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3288de290..37fbe2eea 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -636,7 +636,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(fs.VFSFilesystem(), opts) + return inode.SetStat(fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 4ed41326d..851c61b49 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -234,7 +234,17 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) } // SetStat implements Inode.SetStat. -func (a *InodeAttrs) SetStat(_ *vfs.Filesystem, opts vfs.SetStatOptions) error { +func (a *InodeAttrs) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + return err + } + stat := opts.Stat if stat.Mask&linux.STATX_MODE != 0 { for { @@ -556,7 +566,7 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*StaticDirectory) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*StaticDirectory) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 18a34a590..b12b216d2 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -330,8 +330,10 @@ type inodeMetadata interface { Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to - // vfs.FilesystemImpl.SetStatAt. - SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error + // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking + // if the operation can be performed (see vfs.CheckSetStat() for common + // checks). + SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error } // Precondition: All methods in this interface may only be called on directory diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 0459fb305..2875e6ffa 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -91,7 +91,7 @@ type attrs struct { kernfs.InodeAttrs } -func (a *attrs) SetStat(fs *vfs.Filesystem, opt vfs.SetStatOptions) error { +func (*attrs) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 41c5a3099..92f709d29 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -56,6 +56,6 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*StaticSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*StaticSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index ea6d60f6e..eb191aba4 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -129,6 +130,6 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux. } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*subtasksInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*subtasksInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index fae3fc5aa..ceb427ffb 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -108,7 +108,7 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*taskInode) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*taskInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 20085bb39..d3d99393f 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -64,7 +64,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*selfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*selfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -102,7 +102,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*threadSelfSymlink) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*threadSelfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -114,8 +114,8 @@ type dynamicBytesFileSetAttr struct { } // SetStat implements Inode.SetStat. -func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, opts vfs.SetStatOptions) error { - return d.DynamicBytesFile.InodeAttrs.SetStat(fs, opts) +func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return d.DynamicBytesFile.InodeAttrs.SetStat(fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 3928ff2c8..9c8e63783 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -95,7 +95,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*dir) SetStat(*vfs.Filesystem, vfs.SetStatOptions) error { +func (*dir) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 02637fca6..6e8b4cae7 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -575,7 +575,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(opts.Stat) + return d.inode.setStat(rp.Credentials(), &opts.Stat) } // StatAt implements vfs.FilesystemImpl.StatAt. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 521206305..c18f1e46e 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -299,10 +299,16 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(stat linux.Statx) error { +func (i *inode) setStat(creds *auth.Credentials, stat *linux.Statx) error { if stat.Mask == 0 { return nil } + if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { + return syserror.EPERM + } + if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } i.mu.Lock() var ( needsMtimeBump bool @@ -457,5 +463,6 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - return fd.inode().setStat(opts.Stat) + creds := auth.CredentialsFromContext(ctx) + return fd.inode().setStat(creds, &opts.Stat) } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 9a1ad630c..8ee549dc2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -286,7 +286,8 @@ type FileDescriptionImpl interface { Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) // SetStat updates metadata for the file represented by the - // FileDescription. + // FileDescription. Implementations are responsible for checking if the + // operation can be performed (see vfs.CheckSetStat() for common checks). SetStat(ctx context.Context, opts SetStatOptions) error // StatFS returns metadata for the filesystem containing the file diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index c43dcff3d..332decce6 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -366,7 +366,9 @@ type FilesystemImpl interface { // ResolvingPath.Resolve*(), then !rp.Done(). RmdirAt(ctx context.Context, rp *ResolvingPath) error - // SetStatAt updates metadata for the file at the given path. + // SetStatAt updates metadata for the file at the given path. Implementations + // are responsible for checking if the operation can be performed + // (see vfs.CheckSetStat() for common checks). // // Errors: // -- cgit v1.2.3 From 2a6c4369be8d0522a1f439aa02bce0eb21d42ea2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 16 Mar 2020 15:59:29 -0700 Subject: Enforce file size rlimits in VFS2 Updates #1035 PiperOrigin-RevId: 301255357 --- pkg/sentry/fsimpl/gofer/gofer.go | 2 +- pkg/sentry/fsimpl/gofer/regular_file.go | 5 ++++ pkg/sentry/fsimpl/gofer/special_file.go | 8 +++++++ pkg/sentry/fsimpl/host/default_file.go | 7 +++++- pkg/sentry/fsimpl/host/host.go | 6 ++--- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 6 ++--- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/kernfs/symlink.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 2 +- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/tasks_files.go | 8 +++---- pkg/sentry/fsimpl/sys/sys.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 11 +++++++-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 6 ++--- pkg/sentry/syscalls/linux/vfs2/setstat.go | 15 ++++++++++-- pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/file_description_impl_util.go | 5 ++++ pkg/sentry/vfs/permissions.go | 33 +++++++++++++++++++++++++- 23 files changed, 103 insertions(+), 30 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index c4a8f0b38..999485492 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -713,7 +713,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index e95209661..3593eb1d5 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -126,6 +126,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) d := fd.dentry() d.metadataMu.Lock() diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 08c691c47..274f7346f 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -107,6 +107,14 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EOPNOTSUPP } + if fd.dentry().fileType() == linux.S_IFREG { + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + } + // Do a buffered write. See rationale in PRead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { d.touchCMtime(ctx) diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go index 98682ba5e..459238603 100644 --- a/pkg/sentry/fsimpl/host/default_file.go +++ b/pkg/sentry/fsimpl/host/default_file.go @@ -112,7 +112,6 @@ func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offs if f.inode.isStream { return 0, syserror.ESPIPE } - return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) } @@ -146,6 +145,12 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs return 0, syserror.EOPNOTSUPP } + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) + var writer safemem.Writer if offset == -1 { writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 67c050c30..2eebcd60c 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -270,7 +270,7 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { } // SetStat implements kernfs.Inode. -func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { +func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { s := opts.Stat m := s.Mask @@ -280,7 +280,7 @@ func (i *inode) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.Se if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil { return err } @@ -382,7 +382,7 @@ type fileDescription struct { // SetStat implements vfs.FileDescriptionImpl. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return f.inode.SetStat(nil, creds, opts) + return f.inode.SetStat(ctx, nil, creds, opts) } // Stat implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c788d1d62..d8bddbafa 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -64,7 +64,7 @@ func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vf // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow // inode attributes to be changed. Override SetStat() making it call // f.InodeAttrs to allow it. -func (*DynamicBytesFile) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 331c82011..75c4bab1a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -209,5 +209,5 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio fs := fd.filesystem() creds := auth.CredentialsFromContext(ctx) inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(fs, creds, opts) + return inode.SetStat(ctx, fs, creds, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 37fbe2eea..31da8b511 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -636,7 +636,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if opts.Stat.Mask == 0 { return nil } - return inode.SetStat(fs.VFSFilesystem(), rp.Credentials(), opts) + return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) } // StatAt implements vfs.FilesystemImpl.StatAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 851c61b49..c612dcf07 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -234,14 +234,14 @@ func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) } // SetStat implements Inode.SetStat. -func (a *InodeAttrs) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { +func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -566,7 +566,7 @@ func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*StaticDirectory) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index b12b216d2..794e38908 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -333,7 +333,7 @@ type inodeMetadata interface { // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking // if the operation can be performed (see vfs.CheckSetStat() for common // checks). - SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error + SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error } // Precondition: All methods in this interface may only be called on directory diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 2875e6ffa..fb0d25ad7 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -91,7 +91,7 @@ type attrs struct { kernfs.InodeAttrs } -func (*attrs) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 92f709d29..5918d3309 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -56,6 +56,6 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*StaticSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index eb191aba4..a21313666 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -130,6 +130,6 @@ func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux. } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*subtasksInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index ceb427ffb..49d6efb0e 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -108,7 +108,7 @@ func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenO } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*taskInode) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index d3d99393f..882c1981e 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -64,7 +64,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*selfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -102,7 +102,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*threadSelfSymlink) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } @@ -114,8 +114,8 @@ type dynamicBytesFileSetAttr struct { } // SetStat implements Inode.SetStat. -func (d *dynamicBytesFileSetAttr) SetStat(fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - return d.DynamicBytesFile.InodeAttrs.SetStat(fs, creds, opts) +func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 9c8e63783..7abfd62f2 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -95,7 +95,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } // SetStat implements Inode.SetStat not allowing inode attributes to be changed. -func (*dir) SetStat(*vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { +func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 6e8b4cae7..75d01b853 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -575,7 +575,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(rp.Credentials(), &opts.Stat) + return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat) } // StatAt implements vfs.FilesystemImpl.StatAt. diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 711442424..5a2896bf6 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -308,11 +308,18 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, nil } f := fd.inode().impl.(*regularFile) - end := offset + srclen - if end < offset { + if end := offset + srclen; end < offset { // Overflow. return 0, syserror.EFBIG } + + var err error + srclen, err = vfs.CheckLimit(ctx, offset, srclen) + if err != nil { + return 0, err + } + src = src.TakeFirst64(srclen) + f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index c18f1e46e..ff69372b3 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -299,14 +299,14 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(creds *auth.Credentials, stat *linux.Statx) error { +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() @@ -464,5 +464,5 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return fd.inode().setStat(creds, &opts.Stat) + return fd.inode().setStat(ctx, creds, &opts.Stat) } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 9250659ff..136453ccc 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -173,12 +173,13 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ + err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, }) + return 0, nil, handleSetSizeError(t, err) } // Ftruncate implements Linux syscall ftruncate(2). @@ -196,12 +197,13 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef() - return 0, nil, file.SetStat(t, vfs.SetStatOptions{ + err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, }) + return 0, nil, handleSetSizeError(t, err) } // Utime implements Linux syscall utime(2). @@ -378,3 +380,12 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa FollowFinalSymlink: bool(shouldFollowFinalSymlink), }, opts) } + +func handleSetSizeError(t *kernel.Task, err error) error { + if err == syserror.ErrExceedsFileSizeLimit { + // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). + t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) + return syserror.EFBIG + } + return err +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index cb4deb068..a2a06fc8f 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -51,6 +51,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", + "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 45191d1c3..d45e602ce 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -339,6 +339,11 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, syserror.EOPNOTSUPP } + limit, err := CheckLimit(ctx, offset, src.NumBytes()) + if err != nil { + return 0, err + } + src = src.TakeFirst64(limit) writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 8e250998a..2c8f23f55 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -15,8 +15,12 @@ package vfs import ( + "math" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/syserror" ) @@ -147,7 +151,16 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + if stat.Mask&linux.STATX_SIZE != 0 { + limit, err := CheckLimit(ctx, 0, int64(stat.Size)) + if err != nil { + return err + } + if limit < int64(stat.Size) { + return syserror.ErrExceedsFileSizeLimit + } + } if stat.Mask&linux.STATX_MODE != 0 { if !CanActAsOwner(creds, kuid) { return syserror.EPERM @@ -205,3 +218,21 @@ func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool { func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool { return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok() } + +// CheckLimit enforces file size rlimits. It returns error if the write +// operation must not proceed. Otherwise it returns the max length allowed to +// without violating the limit. +func CheckLimit(ctx context.Context, offset, size int64) (int64, error) { + fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + if fileSizeLimit > math.MaxInt64 { + return size, nil + } + if offset >= int64(fileSizeLimit) { + return 0, syserror.ErrExceedsFileSizeLimit + } + remaining := int64(fileSizeLimit) - offset + if remaining < size { + return remaining, nil + } + return size, nil +} -- cgit v1.2.3 From 3a42638a0b32ceede66d8d593609b424bbdba47e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 18 Mar 2020 19:08:46 -0700 Subject: Port imported TTY fds to vfs2. Refactor fs/host.TTYFileOperations so that the relevant functionality can be shared with VFS2 (fsimpl/host.ttyFD). Incorporate host.defaultFileFD into the default host.fileDescription. This way, there is no need for a separate default_file.go. As in vfs1, the TTY file implementation can be built on top of this default and override operations as necessary (PRead/Read/PWrite/Write, Release, Ioctl). Note that these changes still need to be plumbed into runsc, which refers to imported TTYs in control/proc.go:ExecAsync. Updates #1672. PiperOrigin-RevId: 301718157 --- pkg/sentry/fs/host/ioctl_unsafe.go | 4 + pkg/sentry/fs/host/tty.go | 5 + pkg/sentry/fs/host/util.go | 8 +- pkg/sentry/fsimpl/host/BUILD | 6 +- pkg/sentry/fsimpl/host/default_file.go | 247 --------------------- pkg/sentry/fsimpl/host/host.go | 258 ++++++++++++++++++++-- pkg/sentry/fsimpl/host/ioctl_unsafe.go | 56 +++++ pkg/sentry/fsimpl/host/tty.go | 379 +++++++++++++++++++++++++++++++++ 8 files changed, 692 insertions(+), 271 deletions(-) delete mode 100644 pkg/sentry/fsimpl/host/default_file.go create mode 100644 pkg/sentry/fsimpl/host/ioctl_unsafe.go create mode 100644 pkg/sentry/fsimpl/host/tty.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go index 271582e54..150ac8e19 100644 --- a/pkg/sentry/fs/host/ioctl_unsafe.go +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" ) +// LINT.IfChange + func ioctlGetTermios(fd int) (*linux.Termios, error) { var t linux.Termios _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) @@ -54,3 +56,5 @@ func ioctlSetWinsize(fd int, w *linux.Winsize) error { } return nil } + +// LINT.ThenChange(../../fsimpl/host/ioctl_unsafe.go) diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 3f218b4a7..cb91355ab 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // TTYFileOperations implements fs.FileOperations for a host file descriptor // that wraps a TTY FD. // @@ -43,6 +45,7 @@ type TTYFileOperations struct { // connected to this TTY. fgProcessGroup *kernel.ProcessGroup + // termios contains the terminal attributes for this TTY. termios linux.KernelTermios } @@ -357,3 +360,5 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } + +// LINT.ThenChange(../../fsimpl/host/tty.go) diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go index 388108fdf..1b0356930 100644 --- a/pkg/sentry/fs/host/util.go +++ b/pkg/sentry/fs/host/util.go @@ -23,7 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/device" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/time" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) @@ -80,9 +80,9 @@ func unstableAttr(s *syscall.Stat_t) fs.UnstableAttr { Usage: s.Blocks * 512, Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), Owner: owner(s), - AccessTime: time.FromUnix(s.Atim.Sec, s.Atim.Nsec), - ModificationTime: time.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), - StatusChangeTime: time.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), Links: uint64(s.Nlink), } } diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 5d67f88e3..0bb4a5c3e 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -5,8 +5,9 @@ licenses(["notice"]) go_library( name = "host", srcs = [ - "default_file.go", "host.go", + "ioctl_unsafe.go", + "tty.go", "util.go", ], visibility = ["//pkg/sentry:internal"], @@ -17,9 +18,12 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", + "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/default_file.go b/pkg/sentry/fsimpl/host/default_file.go deleted file mode 100644 index 459238603..000000000 --- a/pkg/sentry/fsimpl/host/default_file.go +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package host - -import ( - "math" - "syscall" - - "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" - "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// defaultFileFD implements FileDescriptionImpl for non-socket, non-TTY files. -type defaultFileFD struct { - fileDescription - - // canMap specifies whether we allow the file to be memory mapped. - canMap bool - - // mu protects the fields below. - mu sync.Mutex - - // offset specifies the current file offset. - offset int64 -} - -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. - -// PRead implements FileDescriptionImpl. -func (f *defaultFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - return readFromHostFD(ctx, f.inode.hostFD, dst, offset, int(opts.Flags)) -} - -// Read implements FileDescriptionImpl. -func (f *defaultFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, -1, int(opts.Flags)) - if isBlockError(err) { - // If we got any data at all, return it as a "completed" partial read - // rather than retrying until complete. - if n != 0 { - err = nil - } else { - err = syserror.ErrWouldBlock - } - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - f.mu.Lock() - n, err := readFromHostFD(ctx, f.inode.hostFD, dst, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags int) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. - if flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - var reader safemem.Reader - if offset == -1 { - reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} - } else { - reader = safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv(hostFD, srcs, offset) - return int64(n), err - }, - } - } - n, err := dst.CopyOutFrom(ctx, reader) - return int64(n), err -} - -// PWrite implements FileDescriptionImpl. -func (f *defaultFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - return writeToHostFD(ctx, f.inode.hostFD, src, offset, int(opts.Flags)) -} - -// Write implements FileDescriptionImpl. -func (f *defaultFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if f.inode.isStream { - // These files can't be memory mapped, assert this. - if f.canMap { - panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") - } - - n, err := writeToHostFD(ctx, f.inode.hostFD, src, -1, int(opts.Flags)) - if isBlockError(err) { - err = syserror.ErrWouldBlock - } - return n, err - } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - f.mu.Lock() - n, err := writeToHostFD(ctx, f.inode.hostFD, src, f.offset, int(opts.Flags)) - f.offset += n - f.mu.Unlock() - return n, err -} - -func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags int) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. - if flags != 0 { - return 0, syserror.EOPNOTSUPP - } - - limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) - if err != nil { - return 0, err - } - src = src.TakeFirst64(limit) - - var writer safemem.Writer - if offset == -1 { - writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} - } else { - writer = safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev(hostFD, srcs, offset) - return int64(n), err - }, - } - } - n, err := src.CopyInTo(ctx, writer) - return int64(n), err -} - -// Seek implements FileDescriptionImpl. -// -// Note that we do not support seeking on directories, since we do not even -// allow directory fds to be imported at all. -func (f *defaultFileFD) Seek(_ context.Context, offset int64, whence int32) (int64, error) { - // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. - if f.inode.isStream { - return 0, syserror.ESPIPE - } - - f.mu.Lock() - defer f.mu.Unlock() - - switch whence { - case linux.SEEK_SET: - if offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = offset - - case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. - if offset > math.MaxInt64-f.offset { - return f.offset, syserror.EOVERFLOW - } - if f.offset+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset += offset - - case linux.SEEK_END: - var s syscall.Stat_t - if err := syscall.Fstat(f.inode.hostFD, &s); err != nil { - return f.offset, err - } - size := s.Size - - // Check for overflow. Note that underflow cannot occur, since size >= 0. - if offset > math.MaxInt64-size { - return f.offset, syserror.EOVERFLOW - } - if size+offset < 0 { - return f.offset, syserror.EINVAL - } - f.offset = size + offset - - case linux.SEEK_DATA, linux.SEEK_HOLE: - // Modifying the offset in the host file table should not matter, since - // this is the only place where we use it. - // - // For reading and writing, we always rely on our internal offset. - n, err := unix.Seek(f.inode.hostFD, offset, int(whence)) - if err != nil { - return f.offset, err - } - f.offset = n - - default: - // Invalid whence. - return f.offset, syserror.EINVAL - } - - return f.offset, nil -} - -// Sync implements FileDescriptionImpl. -func (f *defaultFileFD) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. - return unix.Fsync(f.inode.hostFD) -} - -// ConfigureMMap implements FileDescriptionImpl. -func (f *defaultFileFD) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { - if !f.canMap { - return syserror.ENODEV - } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV -} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2eebcd60c..3afb41395 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -19,18 +19,23 @@ package host import ( "errors" "fmt" + "math" "syscall" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // filesystem implements vfs.FilesystemImpl. @@ -70,10 +75,20 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID hostFD: hostFD, isStream: isStream, isTTY: isTTY, + canMap: canMap(uint32(fileType)), ino: fs.NextIno(), mode: fileMode, uid: ownerUID, gid: ownerGID, + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + offset: 0, + } + + // These files can't be memory mapped, assert this. + if i.isStream && i.canMap { + panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } d := &kernfs.Dentry{} @@ -110,12 +125,17 @@ type inode struct { // This field is initialized at creation time and is immutable. isTTY bool + // canMap specifies whether we allow the file to be memory mapped. + // + // This field is initialized at creation time and is immutable. + canMap bool + // ino is an inode number unique within this filesystem. + // + // This field is initialized at creation time and is immutable. ino uint64 - // mu protects the inode metadata below. - // TODO(gvisor.dev/issue/1672): actually protect fields below. - //mu sync.Mutex + // TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex. // mode is the file mode of this inode. Note that this value may become out // of date if the mode is changed on the host, e.g. with chmod. @@ -125,6 +145,12 @@ type inode struct { // file created on import, not the fd on the host. uid auth.KUID gid auth.KGID + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. + offset int64 } // Note that these flags may become out of date, since they can be modified @@ -336,36 +362,40 @@ func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that // we don't allow importing arbitrary file types without proper support. + var ( + vfsfd *vfs.FileDescription + fdImpl vfs.FileDescriptionImpl + ) if i.isTTY { - // TODO(gvisor.dev/issue/1672): support importing host fd as TTY. - return nil, errors.New("importing host fd as TTY not supported") - } - - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. - fd := &defaultFileFD{ - fileDescription: fileDescription{ - inode: i, - }, - canMap: canMap(uint32(fileType)), - mu: sync.Mutex{}, - offset: 0, + fd := &ttyFD{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultSlaveTermios, + } + vfsfd = &fd.vfsfd + fdImpl = fd + } else { + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &fileDescription{inode: i} + vfsfd = &fd.vfsfd + fdImpl = fd } - vfsfd := &fd.vfsfd flags, err := fileFlagsFromHostFD(i.hostFD) if err != nil { return nil, err } - if err := vfsfd.Init(fd, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. +// +// TODO(gvisor.dev/issue/1672): Implement Waitable interface. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -394,3 +424,193 @@ func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.S func (f *fileDescription) Release() { // noop } + +// PRead implements FileDescriptionImpl. +func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags) +} + +// Read implements FileDescriptionImpl. +func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + i.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + var reader safemem.Reader + if offset == -1 { + reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} + } else { + reader = safemem.FromVecReaderFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Preadv(hostFD, srcs, offset) + return int64(n), err + }, + } + } + n, err := dst.CopyOutFrom(ctx, reader) + return int64(n), err +} + +// PWrite implements FileDescriptionImpl. +func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags) +} + +// Write implements FileDescriptionImpl. +func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. + if i.isStream { + n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + i.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) + i.offset += n + i.offsetMu.Unlock() + return n, err +} + +func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. + if flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + var writer safemem.Writer + if offset == -1 { + writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} + } else { + writer = safemem.FromVecWriterFunc{ + func(srcs [][]byte) (int64, error) { + n, err := unix.Pwritev(hostFD, srcs, offset) + return int64(n), err + }, + } + } + n, err := src.CopyInTo(ctx, writer) + return int64(n), err +} + +// Seek implements FileDescriptionImpl. +// +// Note that we do not support seeking on directories, since we do not even +// allow directory fds to be imported at all. +func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { + i := f.inode + // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. + if i.isStream { + return 0, syserror.ESPIPE + } + + i.offsetMu.Lock() + defer i.offsetMu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = offset + + case linux.SEEK_CUR: + // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. + if offset > math.MaxInt64-i.offset { + return i.offset, syserror.EOVERFLOW + } + if i.offset+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset += offset + + case linux.SEEK_END: + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return i.offset, err + } + size := s.Size + + // Check for overflow. Note that underflow cannot occur, since size >= 0. + if offset > math.MaxInt64-size { + return i.offset, syserror.EOVERFLOW + } + if size+offset < 0 { + return i.offset, syserror.EINVAL + } + i.offset = size + offset + + case linux.SEEK_DATA, linux.SEEK_HOLE: + // Modifying the offset in the host file table should not matter, since + // this is the only place where we use it. + // + // For reading and writing, we always rely on our internal offset. + n, err := unix.Seek(i.hostFD, offset, int(whence)) + if err != nil { + return i.offset, err + } + i.offset = n + + default: + // Invalid whence. + return i.offset, syserror.EINVAL + } + + return i.offset, nil +} + +// Sync implements FileDescriptionImpl. +func (f *fileDescription) Sync(context.Context) error { + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + return unix.Fsync(f.inode.hostFD) +} + +// ConfigureMMap implements FileDescriptionImpl. +func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error { + if !f.inode.canMap { + return syserror.ENODEV + } + // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. + return syserror.ENODEV +} diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go new file mode 100644 index 000000000..0983bf7d8 --- /dev/null +++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go @@ -0,0 +1,56 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} + +func ioctlGetWinsize(fd int) (*linux.Winsize, error) { + var w linux.Winsize + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w))) + if errno != 0 { + return nil, errno + } + return &w, nil +} + +func ioctlSetWinsize(fd int, w *linux.Winsize) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go new file mode 100644 index 000000000..8936afb06 --- /dev/null +++ b/pkg/sentry/fsimpl/host/tty.go @@ -0,0 +1,379 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor +// that wraps a TTY FD. +type ttyFD struct { + fileDescription + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // session is the session attached to this ttyFD. + session *kernel.Session + + // fgProcessGroup is the foreground process group that is currently + // connected to this TTY. + fgProcessGroup *kernel.ProcessGroup + + // termios contains the terminal attributes for this TTY. + termios linux.KernelTermios +} + +// InitForegroundProcessGroup sets the foreground process group and session for +// the TTY. This should only be called once, after the foreground process group +// has been created, but before it has started running. +func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { + t.mu.Lock() + defer t.mu.Unlock() + if t.fgProcessGroup != nil { + panic("foreground process group is already set") + } + t.fgProcessGroup = pg + t.session = pg.Session() +} + +// ForegroundProcessGroup returns the foreground process for the TTY. +func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup { + t.mu.Lock() + defer t.mu.Unlock() + return t.fgProcessGroup +} + +// Release implements fs.FileOperations.Release. +func (t *ttyFD) Release() { + t.mu.Lock() + t.fgProcessGroup = nil + t.mu.Unlock() + + t.fileDescription.Release() +} + +// PRead implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl. +// +// Reading from a TTY is only allowed for foreground process groups. Background +// process groups will either get EIO or a SIGTTIN. +func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Are we allowed to do the read? + // drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change(). + if err := t.checkChange(ctx, linux.SIGTTIN); err != nil { + return 0, err + } + + // Do the read. + return t.fileDescription.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl. +func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + t.mu.Lock() + defer t.mu.Unlock() + + // Check whether TOSTOP is enabled. This corresponds to the check in + // drivers/tty/n_tty.c:n_tty_write(). + if t.termios.LEnabled(linux.TOSTOP) { + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + } + return t.fileDescription.Write(ctx, src, opts) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Ignore arg[0]. This is the real FD: + fd := t.inode.hostFD + ioctl := args[1].Uint64() + switch ioctl { + case linux.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TCSETS, linux.TCSETSW, linux.TCSETSF: + t.mu.Lock() + defer t.mu.Unlock() + + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + return 0, err + } + + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + if err == nil { + t.termios.FromTermios(termios) + } + return 0, err + + case linux.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on this + // terminal. + + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Map the ProcessGroup into a ProcessGroupID in the task's PID namespace. + pgID := pidns.IDOfProcessGroup(t.fgProcessGroup) + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + task := kernel.TaskFromContext(ctx) + if task == nil { + return 0, syserror.ENOTTY + } + + t.mu.Lock() + defer t.mu.Unlock() + + // Check that we are allowed to set the process group. + if err := t.checkChange(ctx, linux.SIGTTOU); err != nil { + // drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change() + // to -ENOTTY. + if err == syserror.EIO { + return 0, syserror.ENOTTY + } + return 0, err + } + + // Check that calling task's process group is in the TTY session. + if task.ThreadGroup().Session() != t.session { + return 0, syserror.ENOTTY + } + + var pgID kernel.ProcessGroupID + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + // pgID must be non-negative. + if pgID < 0 { + return 0, syserror.EINVAL + } + + // Process group with pgID must exist in this PID namespace. + pidns := task.PIDNamespace() + pg := pidns.ProcessGroupWithID(pgID) + if pg == nil { + return 0, syserror.ESRCH + } + + // Check that new process group is in the TTY session. + if pg.Session() != t.session { + return 0, syserror.EPERM + } + + t.fgProcessGroup = pg + return 0, nil + + case linux.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := ioctlGetWinsize(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case linux.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. + + // Unlike setting the termios, any process group (even background ones) can + // set the winsize. + + var winsize linux.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetWinsize(fd, &winsize) + return 0, err + + // Unimplemented commands. + case linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCNOTTY, + linux.TIOCSCTTY, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + fallthrough + default: + return 0, syserror.ENOTTY + } +} + +// checkChange checks that the process group is allowed to read, write, or +// change the state of the TTY. +// +// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic +// is a bit convoluted, but documented inline. +// +// Preconditions: t.mu must be held. +func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + // No task? Linux does not have an analog for this case, but + // tty_check_change is more of a blacklist of cases than a + // whitelist, and is surprisingly permissive. Allowing the + // change seems most appropriate. + return nil + } + + tg := task.ThreadGroup() + pg := tg.ProcessGroup() + + // If the session for the task is different than the session for the + // controlling TTY, then the change is allowed. Seems like a bad idea, + // but that's exactly what linux does. + if tg.Session() != t.fgProcessGroup.Session() { + return nil + } + + // If we are the foreground process group, then the change is allowed. + if pg == t.fgProcessGroup { + return nil + } + + // We are not the foreground process group. + + // Is the provided signal blocked or ignored? + if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) { + // If the signal is SIGTTIN, then we are attempting to read + // from the TTY. Don't send the signal and return EIO. + if sig == linux.SIGTTIN { + return syserror.EIO + } + + // Otherwise, we are writing or changing terminal state. This is allowed. + return nil + } + + // If the process group is an orphan, return EIO. + if pg.IsOrphan() { + return syserror.EIO + } + + // Otherwise, send the signal to the process group and return ERESTARTSYS. + // + // Note that Linux also unconditionally sets TIF_SIGPENDING on current, + // but this isn't necessary in gVisor because the rationale given in + // 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't + // apply: the sentry will handle -ERESTARTSYS in + // kernel.runApp.execute() even if the kernel.Task isn't interrupted. + // + // Linux ignores the result of kill_pgrp(). + _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) + return kernel.ERESTARTSYS +} -- cgit v1.2.3 From 248e46f320525704da917e148a8f69d9b74671a0 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 19 Mar 2020 23:29:15 -0700 Subject: Whitelist utimensat(2). utimensat is used by hostfs for setting timestamps on imported fds. Previously, this would crash the sandbox since utimensat was not allowed. Correct the VFS2 version of hostfs to match the call in VFS1. PiperOrigin-RevId: 301970121 --- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/host.go | 4 ++-- pkg/sentry/fsimpl/host/util.go | 8 ++++---- pkg/sentry/fsimpl/host/util_unsafe.go | 34 ++++++++++++++++++++++++++++++++++ runsc/boot/filter/config.go | 8 ++++++++ 5 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/util_unsafe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 0bb4a5c3e..82e1fb74b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -9,6 +9,7 @@ go_library( "ioctl_unsafe.go", "tty.go", "util.go", + "util_unsafe.go", ], visibility = ["//pkg/sentry:internal"], deps = [ diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 3afb41395..1f735628f 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -322,11 +322,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { - timestamps := []unix.Timespec{ + ts := [2]syscall.Timespec{ toTimespec(s.Atime, m&linux.STATX_ATIME == 0), toTimespec(s.Mtime, m&linux.STATX_MTIME == 0), } - if err := unix.UtimesNanoAt(i.hostFD, "", timestamps, unix.AT_EMPTY_PATH); err != nil { + if err := setTimestamps(i.hostFD, &ts); err != nil { return err } } diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index d519feef5..2bc757b1a 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -22,15 +22,15 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func toTimespec(ts linux.StatxTimestamp, omit bool) unix.Timespec { +func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec { if omit { - return unix.Timespec{ + return syscall.Timespec{ Sec: 0, Nsec: unix.UTIME_OMIT, } } - return unix.Timespec{ - Sec: int64(ts.Sec), + return syscall.Timespec{ + Sec: ts.Sec, Nsec: int64(ts.Nsec), } } diff --git a/pkg/sentry/fsimpl/host/util_unsafe.go b/pkg/sentry/fsimpl/host/util_unsafe.go new file mode 100644 index 000000000..5136ac844 --- /dev/null +++ b/pkg/sentry/fsimpl/host/util_unsafe.go @@ -0,0 +1,34 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +func setTimestamps(fd int, ts *[2]syscall.Timespec) error { + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(ts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index f459d1973..06b9f888a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -291,6 +291,14 @@ var allowedSyscalls = seccomp.SyscallRules{ seccomp.AllowValue(uint64(os.Getpid())), }, }, + syscall.SYS_UTIMENSAT: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* null pathname */ + seccomp.AllowAny{}, + seccomp.AllowValue(0), /* flags */ + }, + }, syscall.SYS_WRITE: {}, // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR -- cgit v1.2.3 From c7f5673529af758c9f7c95523535174c7929dab5 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 25 Mar 2020 14:44:18 -0700 Subject: Set file mode and type to attribute Makes less error prone to find file type. Updates #1197 PiperOrigin-RevId: 302974244 --- pkg/sentry/fsimpl/tmpfs/device_file.go | 10 ++++++++++ pkg/sentry/fsimpl/tmpfs/directory.go | 7 +------ pkg/sentry/fsimpl/tmpfs/named_pipe.go | 2 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/fsimpl/tmpfs/symlink.go | 3 ++- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 26 ++++++++++++-------------- 6 files changed, 27 insertions(+), 23 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index 84b181b90..83bf885ee 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -15,6 +15,8 @@ package tmpfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -33,6 +35,14 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode major: major, minor: minor, } + switch kind { + case vfs.BlockDevice: + mode |= linux.S_IFBLK + case vfs.CharDevice: + mode |= linux.S_IFCHR + default: + panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) + } file.inode.init(file, fs, creds, mode) file.inode.nlink = 1 // from parent directory return &file.inode diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index b4380af38..37c75ab64 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -34,16 +34,11 @@ type directory struct { func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { dir := &directory{} - dir.inode.init(dir, fs, creds, mode) + dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root return &dir.inode } -func (i *inode) isDir() bool { - _, ok := i.impl.(*directory) - return ok -} - type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 0c57fdca3..2c5c739df 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -34,7 +34,7 @@ type namedPipe struct { // * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, creds, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 5a2896bf6..26cd65605 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -89,7 +89,7 @@ func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMod file := ®ularFile{ memFile: fs.memFile, } - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index 5246aca84..47e075ed4 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -15,6 +15,7 @@ package tmpfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) @@ -27,7 +28,7 @@ func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode link := &symlink{ target: target, } - link.inode.init(link, fs, creds, 0777) + link.inode.init(link, fs, creds, linux.S_IFLNK|0777) link.inode.nlink = 1 // from parent directory return &link.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index ff69372b3..2d5070a46 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -144,7 +144,7 @@ type inode struct { // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. mu sync.Mutex - mode uint32 // excluding file type bits, which are based on impl + mode uint32 // file type and mode nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... @@ -168,6 +168,9 @@ type inode struct { const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { + if mode.FileType() == 0 { + panic("file type is required in FileMode") + } i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) @@ -269,31 +272,21 @@ func (i *inode) statTo(stat *linux.Statx) { // TODO(gvisor.dev/issues/1197): Device number. switch impl := i.impl.(type) { case *regularFile: - stat.Mode |= linux.S_IFREG stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in // a uint64 accessed using atomic memory operations to avoid taking // locks). stat.Blocks = allocatedBlocksForSize(stat.Size) - case *directory: - stat.Mode |= linux.S_IFDIR case *symlink: - stat.Mode |= linux.S_IFLNK stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) stat.Blocks = allocatedBlocksForSize(stat.Size) - case *namedPipe: - stat.Mode |= linux.S_IFIFO case *deviceFile: - switch impl.kind { - case vfs.BlockDevice: - stat.Mode |= linux.S_IFBLK - case vfs.CharDevice: - stat.Mode |= linux.S_IFCHR - } stat.RdevMajor = impl.major stat.RdevMinor = impl.minor + case *directory, *namedPipe: + // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } @@ -316,7 +309,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { - atomic.StoreUint32(&i.mode, uint32(stat.Mode)) + ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT + atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT)) needsCtimeBump = true } if mask&linux.STATX_UID != 0 { @@ -439,6 +433,10 @@ func (i *inode) direntType() uint8 { } } +func (i *inode) isDir() bool { + return linux.FileMode(i.mode).FileType() == linux.S_IFDIR +} + // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { -- cgit v1.2.3 From e541ebec2fdb5b29209cb3fc8235b77edcaebb6a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 25 Mar 2020 14:54:10 -0700 Subject: Misc fixes to make stat_test pass (almost) The only test failing now requires socket which is not available in VFS2 yet. Updates #1198 PiperOrigin-RevId: 302976572 --- pkg/bits/bits_template.go | 8 ++++++ pkg/bits/uint64_test.go | 18 ++++++++++++ pkg/sentry/fsimpl/gofer/filesystem.go | 16 +++++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 41 ++++++++++++++++++++++++---- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/filesystem.go | 2 +- pkg/sentry/syscalls/linux/vfs2/getdents.go | 4 +-- pkg/sentry/syscalls/linux/vfs2/stat.go | 7 ++++- pkg/sentry/vfs/resolving_path.go | 16 +++++++++-- test/syscalls/linux/stat.cc | 11 +++++++- 10 files changed, 109 insertions(+), 15 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/bits/bits_template.go b/pkg/bits/bits_template.go index 93a435b80..998645388 100644 --- a/pkg/bits/bits_template.go +++ b/pkg/bits/bits_template.go @@ -42,3 +42,11 @@ func Mask(is ...int) T { func MaskOf(i int) T { return T(1) << T(i) } + +// IsPowerOfTwo returns true if v is power of 2. +func IsPowerOfTwo(v T) bool { + if v == 0 { + return false + } + return v&(v-1) == 0 +} diff --git a/pkg/bits/uint64_test.go b/pkg/bits/uint64_test.go index 1b018d808..193d1ebcd 100644 --- a/pkg/bits/uint64_test.go +++ b/pkg/bits/uint64_test.go @@ -114,3 +114,21 @@ func TestIsOn(t *testing.T) { } } } + +func TestIsPowerOfTwo(t *testing.T) { + for _, tc := range []struct { + v uint64 + want bool + }{ + {v: 0, want: false}, + {v: 1, want: true}, + {v: 2, want: true}, + {v: 3, want: false}, + {v: 4, want: true}, + {v: 5, want: false}, + } { + if got := IsPowerOfTwo64(tc.v); got != tc.want { + t.Errorf("IsPowerOfTwo(%d) = %t, want: %t", tc.v, got, tc.want) + } + } +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 38e4cdbc5..26b492185 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -454,6 +454,9 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } if fs.opts.interop != InteropModeShared { parent.touchCMtime(ctx) + if dir { + parent.decLinks() + } parent.cacheNegativeChildLocked(name) parent.dirents = nil } @@ -569,8 +572,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { creds := rp.Credentials() - _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) - return err + if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { + return err + } + if fs.opts.interop != InteropModeShared { + parent.incLinks() + } + return nil }) } @@ -962,6 +970,10 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.dirents = nil delete(newParent.negativeChildren, newName) newParent.dirents = nil + if renamed.isDir() { + oldParent.decLinks() + newParent.incLinks() + } } vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) return nil diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 999485492..13928ce36 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -485,6 +485,11 @@ type dentry struct { // locked to mutate it). size uint64 + // nlink counts the number of hard links to this dentry. It's updated and + // accessed using atomic operations. It's not protected by metadataMu like the + // other metadata fields. + nlink uint32 + mapsMu sync.Mutex // If this dentry represents a regular file, mappings tracks mappings of @@ -604,6 +609,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma if mask.BTime { d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds) } + if mask.NLink { + d.nlink = uint32(attr.NLink) + } d.vfsd.Init(d) fs.syncMu.Lock() @@ -645,6 +653,9 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if mask.BTime { atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)) } + if mask.NLink { + atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) + } if mask.Size { d.dataMu.Lock() atomic.StoreUint64(&d.size, attr.Size) @@ -687,10 +698,7 @@ func (d *dentry) fileType() uint32 { func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) - stat.Nlink = 1 - if d.isDir() { - stat.Nlink = 2 - } + stat.Nlink = atomic.LoadUint32(&d.nlink) stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) @@ -703,7 +711,7 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(jamieliu): device number + // TODO(gvisor.dev/issue/1198): device number } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { @@ -1094,6 +1102,26 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool return nil } +// incLinks increments link count. +// +// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. +func (d *dentry) incLinks() { + v := atomic.AddUint32(&d.nlink, 1) + if v < 2 { + panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + } +} + +// decLinks decrements link count. +// +// Preconditions: d.nlink > 1. +func (d *dentry) decLinks() { + v := atomic.AddUint32(&d.nlink, ^uint32(0)) + if v == 0 { + panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + } +} + // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -1112,7 +1140,8 @@ func (fd *fileDescription) dentry() *dentry { // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() - if d.fs.opts.interop == InteropModeShared && opts.Mask&(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE|linux.STATX_BLOCKS|linux.STATX_BTIME) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) + if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index e7695e995..2eb210014 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -31,6 +31,7 @@ go_library( visibility = ["//:sandbox"], deps = [ "//pkg/abi/linux", + "//pkg/bits", "//pkg/fspath", "//pkg/gohacks", "//pkg/sentry/arch", diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index fc5ceea4c..a859095e2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -250,7 +250,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) + tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index ddc140b65..a61cc5059 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -97,7 +97,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { // char d_name[]; /* Filename (null-terminated) */ // }; size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) - if size < cb.remaining { + if size > cb.remaining { return syserror.EINVAL } buf = cb.t.CopyScratchBuffer(size) @@ -125,7 +125,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) } size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name) - if size < cb.remaining { + if size > cb.remaining { return syserror.EINVAL } buf = cb.t.CopyScratchBuffer(size) diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index 068243132..fdfe49243 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -16,6 +16,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/gohacks" "gvisor.dev/gvisor/pkg/sentry/arch" @@ -153,7 +154,11 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, syserror.EINVAL } - + // Make sure that only one sync type option is set. + syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) + if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { + return 0, nil, syserror.EINVAL + } if mask&linux.STATX__RESERVED != 0 { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index eb4ebb511..8f31495da 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -329,10 +329,22 @@ func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { // component in pcs represents a symbolic link, the symbolic link should be // followed. // +// If path is terminated with '/', the '/' is considered the last element and +// any symlink before that is followed: +// - For most non-creating walks, the last path component is handled by +// fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte +// after the path component is non-NULL (which is only possible if it's '/') +// and the path component is of type LAST_NORM. +// +// - For open/openat/openat2 without O_CREAT, the last path component is +// handled by fs/namei.c:do_last(), which does the same, though without the +// LAST_NORM check. +// // Preconditions: !rp.Done(). func (rp *ResolvingPath) ShouldFollowSymlink() bool { - // Non-final symlinks are always followed. - return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() + // Non-final symlinks are always followed. Paths terminated with '/' are also + // always followed. + return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir() } // HandleSymlink is called when the current path component is a symbolic link diff --git a/test/syscalls/linux/stat.cc b/test/syscalls/linux/stat.cc index 513b9cd1c..2503960f3 100644 --- a/test/syscalls/linux/stat.cc +++ b/test/syscalls/linux/stat.cc @@ -34,6 +34,13 @@ #include "test/util/temp_path.h" #include "test/util/test_util.h" +#ifndef AT_STATX_FORCE_SYNC +#define AT_STATX_FORCE_SYNC 0x2000 +#endif +#ifndef AT_STATX_DONT_SYNC +#define AT_STATX_DONT_SYNC 0x4000 +#endif + namespace gvisor { namespace testing { @@ -700,8 +707,10 @@ TEST_F(StatTest, StatxInvalidFlags) { struct kernel_statx stx; EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), 12345, 0, &stx), SyscallFailsWithErrno(EINVAL)); + + // Sync flags are mutually exclusive. EXPECT_THAT(statx(AT_FDCWD, test_file_name_.c_str(), - 0x6000 /* AT_STATX_SYNC_TYPE */, 0, &stx), + AT_STATX_FORCE_SYNC | AT_STATX_DONT_SYNC, 0, &stx), SyscallFailsWithErrno(EINVAL)); } -- cgit v1.2.3 From de694e5484502d53166d70b36141e62fcdf07803 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 25 Mar 2020 19:12:25 -0700 Subject: Combine file mode and isDir arguments Updates #1035 PiperOrigin-RevId: 303021328 --- pkg/abi/linux/file.go | 5 +++++ pkg/sentry/fsimpl/ext/inode.go | 2 +- pkg/sentry/fsimpl/gofer/filesystem.go | 22 +++++++++++----------- pkg/sentry/fsimpl/gofer/gofer.go | 7 ++++--- pkg/sentry/fsimpl/host/host.go | 6 +++--- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 3 +-- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 6 ++---- pkg/sentry/fsimpl/proc/task.go | 9 +-------- pkg/sentry/fsimpl/tmpfs/filesystem.go | 24 ++++++++++++------------ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 8 +++++--- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/permissions.go | 23 ++++++++++------------- 12 files changed, 56 insertions(+), 61 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index dbe58acbe..055ac1d7c 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -287,6 +287,11 @@ func (m FileMode) ExtraBits() FileMode { return m &^ (PermissionsMask | FileTypeMask) } +// IsDir returns true if file type represents a directory. +func (m FileMode) IsDir() bool { + return m.FileType() == S_IFDIR +} + // String returns a string representation of m. func (m FileMode) String() string { var s []string diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 6962083f5..a39a37318 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -186,7 +186,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt } func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID()) + return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID()) } // statTo writes the statx fields to the output parameter. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 26b492185..1e43df9ec 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -119,7 +119,7 @@ func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d * if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -314,7 +314,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if parent.isDeleted() { @@ -378,7 +378,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if err != nil { return err } - if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -512,7 +512,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds if err != nil { return err } - return d.checkPermissions(creds, ats, d.isDir()) + return d.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -528,7 +528,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.isDir() { return nil, syserror.ENOTDIR } - if err := d.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -624,7 +624,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Determine whether or not we need to create a file. @@ -661,7 +661,7 @@ afterTrailingSymlink: // Preconditions: fs.renameMu must be locked. func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - if err := d.checkPermissions(rp.Credentials(), ats, d.isDir()); err != nil { + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } mnt := rp.Mount() @@ -722,7 +722,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { - if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { @@ -884,7 +884,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -904,7 +904,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -915,7 +915,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 13928ce36..cf276a417 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -721,7 +721,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&d.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -843,8 +844,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return nil } -func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&d.mode))&0777, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } // IncRef implements vfs.DentryImpl.IncRef. diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1f735628f..a54985ef5 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -167,8 +167,8 @@ func fileFlagsFromHostFD(fd int) (int, error) { } // CheckPermissions implements kernfs.Inode. -func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, atx vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, atx, false /* isDir */, uint16(i.mode), i.uid, i.gid) +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid) } // Mode implements kernfs.Inode. @@ -306,7 +306,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &s, uint16(i.Mode().Permissions()), i.uid, i.gid); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 75c4bab1a..bfa786c88 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -206,8 +206,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - fs := fd.filesystem() creds := auth.CredentialsFromContext(ctx) inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode - return inode.SetStat(ctx, fs, creds, opts) + return inode.SetStat(ctx, fd.filesystem(), creds, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index c612dcf07..5c84b10c9 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -241,7 +241,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, uint16(a.Mode().Permissions()), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } @@ -273,12 +273,10 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // CheckPermissions implements Inode.CheckPermissions. func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { - mode := a.Mode() return vfs.GenericCheckPermissions( creds, ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), + a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid)), ) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 49d6efb0e..aee2a4392 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -172,14 +172,7 @@ func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.S func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { mode := i.Mode() uid, gid := i.getOwner(mode) - return vfs.GenericCheckPermissions( - creds, - ats, - mode.FileType() == linux.ModeDirectory, - uint16(mode), - uid, - gid, - ) + return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) } func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 75d01b853..12cc64385 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -41,7 +41,7 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: @@ -125,7 +125,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -163,7 +163,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds if err != nil { return err } - return d.inode.checkPermissions(creds, ats, d.inode.isDir()) + return d.inode.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. @@ -178,7 +178,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op if !d.inode.isDir() { return nil, syserror.ENOTDIR } - if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } @@ -301,7 +301,7 @@ afterTrailingSymlink: return nil, err } // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -316,7 +316,7 @@ afterTrailingSymlink: child, err := stepLocked(rp, parent) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -347,7 +347,7 @@ afterTrailingSymlink: func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if !afterCreate { - if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil { + if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } } @@ -428,7 +428,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), @@ -445,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { // Writability is needed to change renamed's "..". - if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil { + if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } } @@ -455,7 +455,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } replacedVFSD := newParent.vfsd.Child(newName) @@ -528,7 +528,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -621,7 +621,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil { + if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2d5070a46..2f9e6c876 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -245,8 +245,9 @@ func (i *inode) decRef() { } } -func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { - return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite @@ -299,7 +300,8 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + mode := linux.FileMode(atomic.LoadUint32(&i.mode)) + if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 925996517..a62e43589 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -83,7 +83,7 @@ func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds if !rp.Done() { return syserror.ENOTDIR } - return GenericCheckPermissions(creds, ats, false /* isDir */, anonFileMode, anonFileUID, anonFileGID) + return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } // GetDentryAt implements FilesystemImpl.GetDentryAt. diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 2c8f23f55..f9647f90e 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -29,9 +29,9 @@ type AccessTypes uint16 // Bits in AccessTypes. const ( + MayExec AccessTypes = 1 + MayWrite AccessTypes = 2 MayRead AccessTypes = 4 - MayWrite = 2 - MayExec = 1 ) // OnlyRead returns true if access _only_ allows read. @@ -56,16 +56,17 @@ func (a AccessTypes) MayExec() bool { // GenericCheckPermissions checks that creds has the given access rights on a // file with the given permissions, UID, and GID, subject to the rules of -// fs/namei.c:generic_permission(). isDir is true if the file is a directory. -func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { +// fs/namei.c:generic_permission(). +func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { // Check permission bits. - perms := mode + perms := uint16(mode.Permissions()) if creds.EffectiveKUID == kuid { perms >>= 6 } else if creds.InGroup(kgid) { perms >>= 3 } if uint16(ats)&perms == uint16(ats) { + // All permission bits match, access granted. return nil } @@ -77,7 +78,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. - if (isDir && !ats.MayWrite()) || ats.OnlyRead() { + if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } @@ -85,7 +86,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. - if isDir || !ats.MayExec() || (mode&0111 != 0) { + if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } @@ -151,7 +152,7 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode uint16, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -190,11 +191,7 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { return syserror.EPERM } - // isDir is irrelevant in the following call to - // GenericCheckPermissions since ats == MayWrite means that - // CAP_DAC_READ_SEARCH does not apply, and CAP_DAC_OVERRIDE - // applies, regardless of isDir. - if err := GenericCheckPermissions(creds, MayWrite, false /* isDir */, mode, kuid, kgid); err != nil { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } -- cgit v1.2.3 From 137f3614009b0ef931c1d00a083b4ae8e6a39bc9 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 26 Mar 2020 16:46:15 -0700 Subject: Use host-defined file owner and mode, when possible, for imported fds. Using the host-defined file owner matches VFS1. It is more correct to use the host-defined mode, since the cached value may become out of date. However, kernfs.Inode.Mode() does not return an error--other filesystems on kernfs are in-memory so retrieving mode should not fail. Therefore, if the host syscall fails, we rely on a cached value instead. Updates #1672. PiperOrigin-RevId: 303220864 --- pkg/sentry/control/proc.go | 6 +-- pkg/sentry/fs/host/BUILD | 1 - pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/file.go | 10 ++-- pkg/sentry/fs/host/inode_test.go | 3 +- pkg/sentry/fs/host/wait_test.go | 3 +- pkg/sentry/fsimpl/host/host.go | 110 ++++++++++++++++++++++++++------------- runsc/boot/fds.go | 5 +- 8 files changed, 87 insertions(+), 53 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 5457ba5e7..b51fb3959 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -224,8 +224,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } } - mounter := fs.FileOwnerFromContext(ctx) - // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. var ttyFile *fs.File for appFD, hostFile := range args.FilePayload.Files { @@ -235,7 +233,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */) + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), true /* isTTY */) if err != nil { return nil, 0, nil, err } @@ -254,7 +252,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */) + appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), false /* isTTY */) if err != nil { return nil, 0, nil, err } diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index 011625c80..aabce6cc9 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -71,7 +71,6 @@ go_test( "//pkg/fd", "//pkg/fdnotifier", "//pkg/sentry/contexttest", - "//pkg/sentry/fs", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index cd84e1337..52c0504b6 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -78,7 +78,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File { } // Create the file backed by hostFD. - file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + file, err := NewFile(ctx, fd) if err != nil { ctx.Warningf("Error creating file from host FD: %v", err) break diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 034862694..3e48b8b2c 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -60,8 +60,8 @@ var _ fs.FileOperations = (*fileOperations)(nil) // The returned File cannot be saved, since there is no guarantee that the same // FD will exist or represent the same file at time of restore. If such a // guarantee does exist, use ImportFile instead. -func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, false, false) +func NewFile(ctx context.Context, fd int) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, false, false) } // ImportFile creates a new File backed by the provided host file descriptor. @@ -71,13 +71,13 @@ func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error // If the returned file is saved, it will be restored by re-importing the FD // originally passed to ImportFile. It is the restorer's responsibility to // ensure that the FD represents the same file. -func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, isTTY bool) (*fs.File, error) { - return newFileFromDonatedFD(ctx, fd, mounter, true, isTTY) +func ImportFile(ctx context.Context, fd int, isTTY bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, true, isTTY) } // newFileFromDonatedFD returns an fs.File from a donated FD. If the FD is // saveable, then saveable is true. -func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, isTTY bool) (*fs.File, error) { +func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool) (*fs.File, error) { var s syscall.Stat_t if err := syscall.Fstat(donated, &s); err != nil { return nil, err diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index 4c374681c..c507f57eb 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -19,7 +19,6 @@ import ( "testing" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" ) // TestCloseFD verifies fds will be closed. @@ -33,7 +32,7 @@ func TestCloseFD(t *testing.T) { // Use the write-end because we will detect if it's closed on the read end. ctx := contexttest.Context(t) - file, err := NewFile(ctx, p[1], fs.RootOwner) + file, err := NewFile(ctx, p[1]) if err != nil { t.Fatalf("Failed to create File: %v", err) } diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index d49c3a635..ce397a5e3 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -20,7 +20,6 @@ import ( "time" "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/waiter" ) @@ -34,7 +33,7 @@ func TestWait(t *testing.T) { defer syscall.Close(fds[1]) ctx := contexttest.Context(t) - file, err := NewFile(ctx, fds[0], fs.RootOwner) + file, err := NewFile(ctx, fds[0]) if err != nil { syscall.Close(fds[0]) t.Fatalf("NewFile failed: %v", err) diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index a54985ef5..17e3d6e9d 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -54,7 +54,7 @@ func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID, isTTY bool) (*vfs.FileDescription, error) { +func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) @@ -78,8 +78,6 @@ func ImportFD(mnt *vfs.Mount, hostFD int, ownerUID auth.KUID, ownerGID auth.KGID canMap: canMap(uint32(fileType)), ino: fs.NextIno(), mode: fileMode, - uid: ownerUID, - gid: ownerGID, // For simplicity, set offset to 0. Technically, we should // only set to 0 on files that are not seekable (sockets, pipes, etc.), // and use the offset from the host fd otherwise. @@ -135,17 +133,20 @@ type inode struct { // This field is initialized at creation time and is immutable. ino uint64 - // TODO(gvisor.dev/issue/1672): protect mode, uid, and gid with mutex. + // modeMu protects mode. + modeMu sync.Mutex - // mode is the file mode of this inode. Note that this value may become out - // of date if the mode is changed on the host, e.g. with chmod. + // mode is a cached version of the file mode on the host. Note that it may + // become out of date if the mode is changed on the host, e.g. with chmod. + // + // Generally, it is better to retrieve the mode from the host through an + // fstat syscall. We only use this value in inode.Mode(), which cannot + // return an error, if the syscall to host fails. + // + // FIXME(b/152294168): Plumb error into Inode.Mode() return value so we + // can get rid of this. mode linux.FileMode - // uid and gid of the file owner. Note that these refer to the owner of the - // file created on import, not the fd on the host. - uid auth.KUID - gid auth.KGID - // offsetMu protects offset. offsetMu sync.Mutex @@ -168,12 +169,35 @@ func fileFlagsFromHostFD(fd int) (int, error) { // CheckPermissions implements kernfs.Inode. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { - return vfs.GenericCheckPermissions(creds, ats, i.mode, i.uid, i.gid) + mode, uid, gid, err := i.getPermissions() + if err != nil { + return err + } + return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) } // Mode implements kernfs.Inode. func (i *inode) Mode() linux.FileMode { - return i.mode + mode, _, _, err := i.getPermissions() + if err != nil { + return i.mode + } + + return linux.FileMode(mode) +} + +func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) { + // Retrieve metadata. + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + return 0, 0, 0, err + } + + // Update cached mode. + i.modeMu.Lock() + i.mode = linux.FileMode(s.Mode) + i.modeMu.Unlock() + return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil } // Stat implements kernfs.Inode. @@ -213,45 +237,51 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.Attributes = s.Attributes ls.AttributesMask = s.Attributes_mask - if mask|linux.STATX_TYPE != 0 { + if mask&linux.STATX_TYPE != 0 { ls.Mode |= s.Mode & linux.S_IFMT } - if mask|linux.STATX_MODE != 0 { + if mask&linux.STATX_MODE != 0 { ls.Mode |= s.Mode &^ linux.S_IFMT } - if mask|linux.STATX_NLINK != 0 { + if mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } - if mask|linux.STATX_ATIME != 0 { + if mask&linux.STATX_UID != 0 { + ls.UID = s.Uid + } + if mask&linux.STATX_GID != 0 { + ls.GID = s.Gid + } + if mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) } - if mask|linux.STATX_BTIME != 0 { + if mask&linux.STATX_BTIME != 0 { ls.Btime = unixToLinuxStatxTimestamp(s.Btime) } - if mask|linux.STATX_CTIME != 0 { + if mask&linux.STATX_CTIME != 0 { ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) } - if mask|linux.STATX_MTIME != 0 { + if mask&linux.STATX_MTIME != 0 { ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) } - if mask|linux.STATX_SIZE != 0 { + if mask&linux.STATX_SIZE != 0 { ls.Size = s.Size } - if mask|linux.STATX_BLOCKS != 0 { + if mask&linux.STATX_BLOCKS != 0 { ls.Blocks = s.Blocks } - // Use our own internal inode number and file owner. - if mask|linux.STATX_INO != 0 { + // Use our own internal inode number. + if mask&linux.STATX_INO != 0 { ls.Ino = i.ino } - if mask|linux.STATX_UID != 0 { - ls.UID = uint32(i.uid) - } - if mask|linux.STATX_GID != 0 { - ls.GID = uint32(i.gid) - } + // Update cached mode. + if (mask&linux.STATX_TYPE != 0) && (mask&linux.STATX_MODE != 0) { + i.modeMu.Lock() + i.mode = linux.FileMode(s.Mode) + i.modeMu.Unlock() + } return ls, nil } @@ -274,6 +304,8 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { Mask: linux.STATX_BASIC_STATS, Blksize: uint32(s.Blksize), Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, Mode: uint16(s.Mode), Size: uint64(s.Size), Blocks: uint64(s.Blocks), @@ -282,15 +314,13 @@ func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { Mtime: timespecToStatxTimestamp(s.Mtim), } - // Use our own internal inode number and file owner. + // Use our own internal inode number. // // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. // If we use the device number from the host, it may collide with another // sentry-internal device number. We handle device/inode numbers without // relying on the host to prevent collisions. ls.Ino = i.ino - ls.UID = uint32(i.uid) - ls.GID = uint32(i.gid) return ls, nil } @@ -306,7 +336,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &s, i.Mode(), i.uid, i.gid); err != nil { + mode, uid, gid, err := i.getPermissions() + if err != nil { + return err + } + if err := vfs.CheckSetStat(ctx, creds, &s, mode.Permissions(), uid, gid); err != nil { return err } @@ -314,7 +348,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { return err } + i.modeMu.Lock() i.mode = linux.FileMode(s.Mode) + i.modeMu.Unlock() } if m&linux.STATX_SIZE != 0 { if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { @@ -351,7 +387,11 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio } func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - fileType := i.mode.FileType() + mode, _, _, err := i.getPermissions() + if err != nil { + return nil, err + } + fileType := mode.FileType() if fileType == syscall.S_IFSOCK { if i.isTTY { return nil, errors.New("cannot use host socket as TTY") diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 417d2d5fb..5314b0f2a 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -34,7 +34,6 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F k := kernel.KernelFromContext(ctx) fdTable := k.NewFDTable() defer fdTable.DecRef() - mounter := fs.FileOwnerFromContext(ctx) var ttyFile *fs.File for appFD, hostFD := range stdioFDs { @@ -44,7 +43,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F // Import the file as a host TTY file. if ttyFile == nil { var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, true /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) if err != nil { return nil, err } @@ -63,7 +62,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F } else { // Import the file as a regular host file. var err error - appFile, err = host.ImportFile(ctx, hostFD, mounter, false /* isTTY */) + appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) if err != nil { return nil, err } -- cgit v1.2.3 From 76a7ace751bfd4b16411edbc0a2b06d0308b8832 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 26 Mar 2020 21:47:46 -0700 Subject: Add BoundEndpointAt filesystem operation. BoundEndpointAt() is needed to support Unix sockets bound at a file path, corresponding to BoundEndpoint() in VFS1. Updates #1476. PiperOrigin-RevId: 303258251 --- pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/filesystem.go | 12 ++++++++++ pkg/sentry/fsimpl/gofer/BUILD | 1 + pkg/sentry/fsimpl/gofer/filesystem.go | 8 +++++++ pkg/sentry/fsimpl/kernfs/BUILD | 1 + pkg/sentry/fsimpl/kernfs/filesystem.go | 13 ++++++++++ pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 8 +++++++ pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/anonfs.go | 9 +++++++ pkg/sentry/vfs/filesystem.go | 8 ++++++- pkg/sentry/vfs/vfs.go | 44 +++++++++++++++++++++++++++------- pkg/syserror/syserror.go | 1 + 13 files changed, 99 insertions(+), 9 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 6f78f478f..d83d75b3d 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -45,6 +45,7 @@ go_library( "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", "//pkg/sync", diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 8497be615..48eaccdbc 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -463,6 +464,17 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return syserror.EROFS } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + _, _, err := fs.walk(rp, false) + if err != nil { + return nil, err + } + + // TODO(b/134676337): Support sockets. + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { _, _, err := fs.walk(rp, false) diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4ba76a1e8..d15a36709 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 1e43df9ec..269624362 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -1059,6 +1060,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return fs.unlinkAt(ctx, rp, false /* dir */) } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// +// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { var ds *[]*dentry diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index e73f1f857..b3d6299d0 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -35,6 +35,7 @@ go_library( "//pkg/refs", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 31da8b511..a429fa23d 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -728,6 +729,18 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + fs.mu.RLock() + _, _, err := fs.walkExistingLocked(ctx, rp) + fs.mu.RUnlock() + fs.processDeferredDecRefs() + if err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 57abd5583..6ea35affb 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -46,6 +46,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 12cc64385..e678ecc37 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -656,6 +657,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return nil } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +// +// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { fs.mu.RLock() diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index a2a06fc8f..bf4d27c7d 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -53,6 +53,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index a62e43589..f58867066 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -222,6 +223,14 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error return syserror.EPERM } +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) { + if !rp.Final() { + return nil, syserror.ENOTDIR + } + return nil, syserror.ECONNREFUSED +} + // ListxattrAt implements FilesystemImpl.ListxattrAt. func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) { if !rp.Done() { diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 332decce6..7b7d233f9 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of @@ -460,6 +461,11 @@ type FilesystemImpl interface { // RemovexattrAt returns ENOTSUP. RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error + // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. + // + // - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED. + BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) + // PrependPath prepends a path from vd to vd.Mount().Root() to b. // // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered @@ -482,7 +488,7 @@ type FilesystemImpl interface { // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - // TODO: inotify_add_watch(); bind() + // TODO: inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 03d1fb943..1708c1a53 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -38,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -230,7 +231,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -271,7 +272,7 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -307,7 +308,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -340,7 +341,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -350,6 +351,33 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia } } +// BoundEndpointAt gets the bound endpoint at the given path, if one exists. +func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (transport.BoundEndpoint, error) { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return nil, syserror.ECONNREFUSED + } + return nil, syserror.ENOENT + } + rp := vfs.getResolvingPath(creds, pop) + for { + bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp) + if err == nil { + vfs.putResolvingPath(rp) + return bep, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { @@ -494,7 +522,7 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -527,7 +555,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -608,7 +636,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { @@ -640,7 +668,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti } if checkInvariants { if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err)) + panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(err) { diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index 4b5a0fca6..f86db0999 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -27,6 +27,7 @@ import ( var ( E2BIG = error(syscall.E2BIG) EACCES = error(syscall.EACCES) + EADDRINUSE = error(syscall.EADDRINUSE) EAGAIN = error(syscall.EAGAIN) EBADF = error(syscall.EBADF) EBADFD = error(syscall.EBADFD) -- cgit v1.2.3 From 10f2c8db915df14102e3f4d9efcfce372c90707a Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Fri, 27 Mar 2020 16:53:28 -0700 Subject: Add FilesystemType.Name method, and FilesystemType field to Filesystem struct. Both have analogues in Linux: * struct file_system_type has a char *name field. * struct super_block keeps a pointer to the file_system_type. These fields are necessary to support the `filesystem type` field in /proc/[pid]/mountinfo. PiperOrigin-RevId: 303434063 --- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 5 +++++ pkg/sentry/fsimpl/ext/ext.go | 12 ++++++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 7 ++++++- pkg/sentry/fsimpl/host/host.go | 15 ++++++++++++++- pkg/sentry/fsimpl/kernfs/kernfs.go | 7 ++----- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 8 ++++++-- pkg/sentry/fsimpl/proc/filesystem.go | 11 ++++++++--- pkg/sentry/fsimpl/sys/sys.go | 9 +++++++-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 7 ++++++- pkg/sentry/vfs/anonfs.go | 13 +++++++++++++ pkg/sentry/vfs/filesystem.go | 11 ++++++++++- pkg/sentry/vfs/filesystem_type.go | 3 +++ pkg/sentry/vfs/vfs.go | 2 +- 13 files changed, 91 insertions(+), 19 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index abd4f24e7..64f1b142c 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -42,6 +42,11 @@ type FilesystemType struct { root *vfs.Dentry } +// Name implements vfs.FilesystemType.Name. +func (*FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fst.initOnce.Do(func() { diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 373d23b74..7176af6d1 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -30,6 +30,9 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// Name is the name of this filesystem. +const Name = "ext" + // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -91,8 +94,13 @@ func isCompatible(sb disklayout.SuperBlock) bool { return true } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { // TODO(b/134676337): Ensure that the user is mounting readonly. If not, // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. @@ -103,7 +111,7 @@ func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile } fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} - fs.vfsfs.Init(vfsObj, &fs) + fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { return nil, nil, err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index cf276a417..8e41b6b1c 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -199,6 +199,11 @@ const ( InteropModeShared ) +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) @@ -374,7 +379,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt dentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } - fs.vfsfs.Init(vfsObj, fs) + fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 17e3d6e9d..7d9dcd4c9 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -38,6 +38,19 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("cannot instaniate a host filesystem") +} + +// Name implements FilesystemType.Name. +func (filesystemType) Name() string { + return "none" +} + // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem @@ -46,7 +59,7 @@ type filesystem struct { // NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { fs := &filesystem{} - fs.Init(vfsObj) + fs.Init(vfsObj, &filesystemType{}) vfsfs := fs.VFSFilesystem() // NewDisconnectedMount will take an additional reference on vfsfs. defer vfsfs.DecRef() diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 794e38908..2cefef020 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -63,9 +63,6 @@ import ( "gvisor.dev/gvisor/pkg/sync" ) -// FilesystemType implements vfs.FilesystemType. -type FilesystemType struct{} - // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. @@ -138,8 +135,8 @@ func (fs *Filesystem) processDeferredDecRefsLocked() { // Init initializes a kernfs filesystem. This should be called from during // vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding // kernfs. -func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem) { - fs.vfsfs.Init(vfsObj, fs) +func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) { + fs.vfsfs.Init(vfsObj, fsType, fs) } // VFSFilesystem returns the generic vfs filesystem object. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index fb0d25ad7..465451f35 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -187,9 +187,13 @@ func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, err return nil, syserror.EPERM } -func (fst *fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType) Name() string { + return "kernfs" +} + +func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Init(vfsObj) + fs.Init(vfsObj, &fst) root := fst.rootFn(creds, fs) return fs.VFSFilesystem(), root.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 5c19d5522..104fc9030 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -36,8 +36,13 @@ type FilesystemType struct{} var _ vfs.FilesystemType = (*FilesystemType)(nil) -// GetFilesystem implements vfs.FilesystemType. -func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) if k == nil { return nil, nil, fmt.Errorf("procfs requires a kernel") @@ -48,7 +53,7 @@ func (ft *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtual } procfs := &kernfs.Filesystem{} - procfs.VFSFilesystem().Init(vfsObj, procfs) + procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string if opts.InternalData != nil { diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 7abfd62f2..5c617270e 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -39,10 +39,15 @@ type filesystem struct { kernfs.Filesystem } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Filesystem.Init(vfsObj) + fs.Filesystem.Init(vfsObj, &fsType) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() defaultSysDirMode := linux.FileMode(0755) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2f9e6c876..b07b0dbae 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -63,6 +63,11 @@ type filesystem struct { nextInoMinusOne uint64 // accessed using atomic memory operations } +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) @@ -74,7 +79,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt memFile: memFileProvider.MemoryFile(), clock: clock, } - fs.vfsfs.Init(vfsObj, &fs) + fs.vfsfs.Init(vfsObj, &fstype, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil } diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index f58867066..d1f6dfb45 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -51,6 +51,19 @@ const ( anonFileGID = auth.RootKGID ) +// anonFilesystemType implements FilesystemType. +type anonFilesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) { + panic("cannot instaniate an anon filesystem") +} + +// Name implemenents FilesystemType.Name. +func (anonFilesystemType) Name() string { + return "none" +} + // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 7b7d233f9..cd34782ff 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -42,21 +42,30 @@ type Filesystem struct { // immutable. vfs *VirtualFilesystem + // fsType is the FilesystemType of this Filesystem. + fsType FilesystemType + // impl is the FilesystemImpl associated with this Filesystem. impl is // immutable. This should be the last field in Dentry. impl FilesystemImpl } // Init must be called before first use of fs. -func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) { +func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { fs.refs = 1 fs.vfs = vfsObj + fs.fsType = fsType fs.impl = impl vfsObj.filesystemsMu.Lock() vfsObj.filesystems[fs] = struct{}{} vfsObj.filesystemsMu.Unlock() } +// FilesystemType returns the FilesystemType for this Filesystem. +func (fs *Filesystem) FilesystemType() FilesystemType { + return fs.fsType +} + // VirtualFilesystem returns the containing VirtualFilesystem. func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem { return fs.vfs diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index bb9cada81..f2298f7f6 100644 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go @@ -30,6 +30,9 @@ type FilesystemType interface { // along with its mount root. A reference is taken on the returned // Filesystem and Dentry. GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) + + // Name returns the name of this FilesystemType. + Name() string } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 1708c1a53..720b90d8f 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -134,7 +134,7 @@ func (vfs *VirtualFilesystem) Init() error { anonfs := anonFilesystem{ devMinor: anonfsDevMinor, } - anonfs.vfsfs.Init(vfs, &anonfs) + anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) defer anonfs.vfsfs.DecRef() anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { -- cgit v1.2.3 From 9de982ea790ffe56eca07b6535e9420b669b7c0c Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 31 Mar 2020 15:00:25 -0700 Subject: Allow passing root file type to tmpfs. PiperOrigin-RevId: 304053357 --- pkg/sentry/fsimpl/testutil/testutil.go | 3 +++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index e16808c63..0556af877 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -162,6 +162,9 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { // exactly the specified set of expected entries. AssertAllDirentTypes respects // collector.skipDots, and implicitly checks for "." and ".." accordingly. func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) { + if expected == nil { + expected = make(map[string]DirentType) + } // Also implicitly check for "." and "..", if enabled. if !collector.skipDots { expected["."] = linux.DT_DIR diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index b07b0dbae..afd9f8533 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -68,6 +68,17 @@ func (FilesystemType) Name() string { return Name } +// FilesystemOpts is used to pass configuration data to tmpfs. +type FilesystemOpts struct { + // RootFileType is the FileType of the filesystem root. Valid values + // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. + RootFileType uint16 + + // RootSymlinkTarget is the target of the root symlink. Only valid if + // RootFileType == S_IFLNK. + RootSymlinkTarget string +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) @@ -79,9 +90,26 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt memFile: memFileProvider.MemoryFile(), clock: clock, } + fs.vfsfs.Init(vfsObj, &fstype, &fs) - root := fs.newDentry(fs.newDirectory(creds, 01777)) - return &fs.vfsfs, &root.vfsd, nil + + typ := uint16(linux.S_IFDIR) + tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) + if ok && tmpfsOpts.RootFileType != 0 { + typ = tmpfsOpts.RootFileType + } + var root *inode + switch typ { + case linux.S_IFREG: + root = fs.newRegularFile(creds, 0777) + case linux.S_IFLNK: + root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget) + case linux.S_IFDIR: + root = fs.newDirectory(creds, 01777) + default: + return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", typ) + } + return &fs.vfsfs, &fs.newDentry(root).vfsd, nil } // Release implements vfs.FilesystemImpl.Release. -- cgit v1.2.3 From e1c8eaca8f8413b17dab8f01b2e123e9d4b9ddbc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 31 Mar 2020 15:00:26 -0700 Subject: Fix /proc/self/mounts and /proc/self/mountinfo in VFS2. Some extra fields were added to the Mount type to expose necessary data to the proc filesystem. PiperOrigin-RevId: 304053361 --- pkg/sentry/fsimpl/proc/task_files.go | 183 +++++---------------------------- pkg/sentry/vfs/mount.go | 192 ++++++++++++++++++++++++++++++++++- 2 files changed, 219 insertions(+), 156 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 8c743df8d..df0d1bcc5 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -18,13 +18,10 @@ import ( "bytes" "fmt" "io" - "sort" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -634,51 +631,6 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { return } -// forEachMountSource runs f for the process root mount and each mount that is -// a descendant of the root. -func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { - var fsctx *kernel.FSContext - t.WithMuLocked(func(t *kernel.Task) { - fsctx = t.FSContext() - }) - if fsctx == nil { - // The task has been destroyed. Nothing to show here. - return - } - - // All mount points must be relative to the rootDir, and mounts outside - // will be excluded. - rootDir := fsctx.RootDirectory() - if rootDir == nil { - // The task has been destroyed. Nothing to show here. - return - } - defer rootDir.DecRef() - - mnt := t.MountNamespace().FindMount(rootDir) - if mnt == nil { - // Has it just been unmounted? - return - } - ms := t.MountNamespace().AllMountsUnder(mnt) - sort.Slice(ms, func(i, j int) bool { - return ms[i].ID < ms[j].ID - }) - for _, m := range ms { - mroot := m.Root() - if mroot == nil { - continue // No longer valid. - } - mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() - if !desc { - // MountSources that are not descendants of the chroot jail are ignored. - continue - } - fn(mountPath, m) - } -} - // mountInfoData is used to implement /proc/[pid]/mountinfo. // // +stateify savable @@ -692,92 +644,22 @@ var _ dynamicInode = (*mountInfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - forEachMount(i.task, func(mountPath string, m *fs.Mount) { - mroot := m.Root() - if mroot == nil { - return // No longer valid. - } - defer mroot.DecRef() - - // Format: - // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue - // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) - - // (1) MountSource ID. - fmt.Fprintf(buf, "%d ", m.ID) - - // (2) Parent ID (or this ID if there is no parent). - pID := m.ID - if !m.IsRoot() && !m.IsUndo() { - pID = m.ParentID - } - fmt.Fprintf(buf, "%d ", pID) - - // (3) Major:Minor device ID. We don't have a superblock, so we - // just use the root inode device number. - sa := mroot.Inode.StableAttr - fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) - - // (4) Root: the pathname of the directory in the filesystem - // which forms the root of this mount. - // - // NOTE(b/78135857): This will always be "/" until we implement - // bind mounts. - fmt.Fprintf(buf, "/ ") - - // (5) Mount point (relative to process root). - fmt.Fprintf(buf, "%s ", mountPath) - - // (6) Mount options. - flags := mroot.Inode.MountSource.Flags - opts := "rw" - if flags.ReadOnly { - opts = "ro" - } - if flags.NoAtime { - opts += ",noatime" - } - if flags.NoExec { - opts += ",noexec" - } - fmt.Fprintf(buf, "%s ", opts) - - // (7) Optional fields: zero or more fields of the form "tag[:value]". - // (8) Separator: the end of the optional fields is marked by a single hyphen. - fmt.Fprintf(buf, "- ") - - // (9) Filesystem type. - fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType) - - // (10) Mount source: filesystem-specific information or "none". - fmt.Fprintf(buf, "none ") - - // (11) Superblock options, and final newline. - fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource)) + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() }) - return nil -} - -func superBlockOpts(mountPath string, msrc *fs.MountSource) string { - // gVisor doesn't (yet) have a concept of super block options, so we - // use the ro/rw bit from the mount flag. - opts := "rw" - if msrc.Flags.ReadOnly { - opts = "ro" + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil } - - // NOTE(b/147673608): If the mount is a cgroup, we also need to include - // the cgroup name in the options. For now we just read that from the - // path. - // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we - // should get this value from the cgroup itself, and not rely on the - // path. - if msrc.FilesystemType == "cgroup" { - splitPath := strings.Split(mountPath, "/") - cgroupType := splitPath[len(splitPath)-1] - opts += "," + cgroupType + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil } - return opts + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) + return nil } // mountsData is used to implement /proc/[pid]/mounts. @@ -789,33 +671,24 @@ type mountsData struct { task *kernel.Task } -var _ dynamicInode = (*mountInfoData)(nil) +var _ dynamicInode = (*mountsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - forEachMount(i.task, func(mountPath string, m *fs.Mount) { - // Format: - // - // - // We use the filesystem name as the first field, since there - // is no real block device we can point to, and we also should - // not expose anything about the remote filesystem. - // - // Only ro/rw option is supported for now. - // - // The "needs dump"and fsck flags are always 0, which is allowed. - root := m.Root() - if root == nil { - return // No longer valid. - } - defer root.DecRef() - - flags := root.Inode.MountSource.Flags - opts := "rw" - if flags.ReadOnly { - opts = "ro" - } - fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 4b68cabda..7792eb1a0 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -15,7 +15,11 @@ package vfs import ( + "bytes" + "fmt" "math" + "sort" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -44,7 +48,7 @@ var lastMountID uint64 // // +stateify savable type Mount struct { - // vfs, fs, and root are immutable. References are held on fs and root. + // vfs, fs, root are immutable. References are held on fs and root. // // Invariant: root belongs to fs. vfs *VirtualFilesystem @@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error { return nil } +func (mnt *Mount) readOnly() bool { + return atomic.LoadInt64(&mnt.writers) < 0 +} + // Filesystem returns the mounted Filesystem. It does not take a reference on // the returned Filesystem. func (mnt *Mount) Filesystem() *Filesystem { return mnt.fs } +// submountsLocked returns this Mount and all Mounts that are descendents of +// it. +// +// Precondition: mnt.vfs.mountMu must be held. +func (mnt *Mount) submountsLocked() []*Mount { + mounts := []*Mount{mnt} + for m := range mnt.children { + mounts = append(mounts, m.submountsLocked()...) + } + return mounts +} + // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { @@ -655,3 +675,173 @@ func (mntns *MountNamespace) Root() VirtualDentry { vd.IncRef() return vd } + +// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. +// +// Preconditions: taskRootDir.Ok(). +func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. + mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.flags.NoExec { + opts += ",noexec" + } + + // Format: + // + // + // The "needs dump" and "fsck order" flags are always 0, which + // is allowed. + fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) + } +} + +// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to +// buf. +// +// Preconditions: taskRootDir.Ok(). +func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. + mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + // Stat the mount root to get the major/minor device numbers. + pop := &PathOperation{ + Root: mntRootVD, + Start: mntRootVD, + } + statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + if err != nil { + // Well that's not good. Ignore this mount. + break + } + + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) Mount ID. + fmt.Fprintf(buf, "%d ", mnt.ID) + + // (2) Parent ID (or this ID if there is no parent). + pID := mnt.ID + if p := mnt.parent(); p != nil { + pID = p.ID + } + fmt.Fprintf(buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE(b/78135857): This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(buf, "%s ", manglePath(path)) + + // (6) Mount options. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.flags.NoExec { + opts += ",noexec" + } + // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is + // set. + fmt.Fprintf(buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(buf, "- ") + + // (9) Filesystem type. + fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) + + // (10) Mount source: filesystem-specific information or "none". + fmt.Fprintf(buf, "none ") + + // (11) Superblock options, and final newline. + fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt)) + } +} + +// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. +// See Linux fs/seq_file.c:mangle_path. +func manglePath(p string) string { + r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") + return r.Replace(p) +} + +// superBlockOpts returns the super block options string for the the mount at +// the given path. +func superBlockOpts(mountPath string, mnt *Mount) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if mnt.fs.FilesystemType().Name() == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} -- cgit v1.2.3 From 639d94f9f71b43e86320a6e9157c932f5d7936a7 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 31 Mar 2020 19:15:55 -0700 Subject: Add socket filesystem and global disconnected socket mount for VFS2. A socket mount where anonymous sockets will reside is added to the VirtualFilesystem. Socketfs is built on top of kernfs. Updates #1476, #1478, #1484, #1485. PiperOrigin-RevId: 304095251 --- pkg/sentry/fsimpl/sockfs/BUILD | 16 +++++++++ pkg/sentry/fsimpl/sockfs/sockfs.go | 73 ++++++++++++++++++++++++++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 24 +++++++++++++ test/syscalls/linux/socket_unix.cc | 2 ++ 5 files changed, 116 insertions(+) create mode 100644 pkg/sentry/fsimpl/sockfs/BUILD create mode 100644 pkg/sentry/fsimpl/sockfs/sockfs.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD new file mode 100644 index 000000000..790d50e65 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -0,0 +1,16 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "sockfs", + srcs = ["sockfs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go new file mode 100644 index 000000000..c13511de2 --- /dev/null +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -0,0 +1,73 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sockfs provides a filesystem implementation for anonymous sockets. +package sockfs + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// NewFilesystem creates a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{}) + if err != nil { + panic("failed to create sockfs filesystem") + } + return fs +} + +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + fs := &filesystem{} + fs.Init(vfsObj, fsType) + return fs.VFSFilesystem(), nil, nil +} + +// Name implements FilesystemType.Name. +// +// Note that registering sockfs is unnecessary, except for the fact that it +// will not show up under /proc/filesystems as a result. This is a very minor +// discrepancy from Linux. +func (filesystemType) Name() string { + return "sockfs" +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeAttrs + kernfs.InodeNoopRefCount +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return nil, syserror.ENXIO +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index beba29a09..bb7e3cbc3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -169,6 +169,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 6feda8fa1..0a448b57c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -50,6 +50,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -225,6 +226,11 @@ type Kernel struct { // by extMu. nextSocketEntry uint64 + // socketMount is a disconnected vfs.Mount, not included in k.vfs, + // representing a sockfs.filesystem. socketMount is used to back + // VirtualDentries representing anonymous sockets. + socketMount *vfs.Mount + // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -348,6 +354,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + if VFS2Enabled { + if err := k.vfs.Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %v", err) + } + fs := sockfs.NewFilesystem(&k.vfs) + // NewDisconnectedMount will take an additional reference on fs. + defer fs.DecRef() + sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to initialize socket mount: %v", err) + } + k.socketMount = sm + } return nil } @@ -1452,6 +1471,11 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } +// SocketMount returns the global socket mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} + // supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper diff --git a/test/syscalls/linux/socket_unix.cc b/test/syscalls/linux/socket_unix.cc index 4cf1f76f1..8bf663e8b 100644 --- a/test/syscalls/linux/socket_unix.cc +++ b/test/syscalls/linux/socket_unix.cc @@ -257,6 +257,8 @@ TEST_P(UnixSocketPairTest, ShutdownWrite) { TEST_P(UnixSocketPairTest, SocketReopenFromProcfs) { // TODO(b/122310852): We should be returning ENXIO and NOT EIO. + // TODO(github.dev/issue/1624): This should be resolved in VFS2. Verify + // that this is the case and delete the SKIP_IF once we delete VFS1. SKIP_IF(IsRunningOnGvisor()); auto sockets = ASSERT_NO_ERRNO_AND_VALUE(NewSocketPair()); -- cgit v1.2.3 From 0d1e299079392043fae24c2be524a0eefe7d8085 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 1 Apr 2020 12:05:17 -0700 Subject: Pass configurable FilesystemType to tmpfs. PiperOrigin-RevId: 304234086 --- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index afd9f8533..8bc8818c0 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -77,6 +77,11 @@ type FilesystemOpts struct { // RootSymlinkTarget is the target of the root symlink. Only valid if // RootFileType == S_IFLNK. RootSymlinkTarget string + + // FilesystemType allows setting a different FilesystemType for this + // tmpfs filesystem. This allows tmpfs to "impersonate" other + // filesystems, like ramdiskfs and cgroupfs. + FilesystemType vfs.FilesystemType } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. @@ -91,15 +96,22 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt clock: clock, } - fs.vfsfs.Init(vfsObj, &fstype, &fs) - - typ := uint16(linux.S_IFDIR) + rootFileType := uint16(linux.S_IFDIR) + newFSType := vfs.FilesystemType(&fstype) tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) - if ok && tmpfsOpts.RootFileType != 0 { - typ = tmpfsOpts.RootFileType + if ok { + if tmpfsOpts.RootFileType != 0 { + rootFileType = tmpfsOpts.RootFileType + } + if tmpfsOpts.FilesystemType != nil { + newFSType = tmpfsOpts.FilesystemType + } } + + fs.vfsfs.Init(vfsObj, newFSType, &fs) + var root *inode - switch typ { + switch rootFileType { case linux.S_IFREG: root = fs.newRegularFile(creds, 0777) case linux.S_IFLNK: @@ -107,7 +119,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = fs.newDirectory(creds, 01777) default: - return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", typ) + return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &fs.newDentry(root).vfsd, nil } -- cgit v1.2.3 From 5b2396d244ed6283d928a72bdd4cc58d78ef3175 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 2 Apr 2020 17:06:19 -0700 Subject: Fix typo in TODO comments. PiperOrigin-RevId: 304508083 --- pkg/sentry/fs/proc/mounts.go | 3 ++- pkg/sentry/fsimpl/tmpfs/filesystem.go | 6 +++--- pkg/sentry/fsimpl/tmpfs/stat_test.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/socket/netstack/netstack.go | 2 +- pkg/sentry/vfs/mount.go | 3 ++- test/syscalls/linux/socket_inet_loopback.cc | 2 +- 7 files changed, 12 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 94deb553b..1fc9c703c 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -170,7 +170,8 @@ func superBlockOpts(mountPath string, msrc *fs.MountSource) string { // NOTE(b/147673608): If the mount is a cgroup, we also need to include // the cgroup name in the options. For now we just read that from the // path. - // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // + // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we // should get this value from the cgroup itself, and not rely on the // path. if msrc.FilesystemType == "cgroup" { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e678ecc37..4cf27bf13 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -57,7 +57,7 @@ afterSymlink: } next := nextVFSD.Impl().(*dentry) if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issues/1197): Symlink traversals updates + // TODO(gvisor.dev/issue/1197): Symlink traversals updates // access time. if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err @@ -515,7 +515,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.inode.decLinksLocked() newParent.inode.incLinksLocked() } - // TODO(gvisor.dev/issues/1197): Update timestamps and parent directory + // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory // sizes. vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil @@ -600,7 +600,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issues/1197): Actually implement statfs. + // TODO(gvisor.dev/issue/1197): Actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index ebe035dee..3e02e7190 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -29,7 +29,7 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -169,7 +169,7 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issues/1197): Also test symlinks and sockets. + // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 8bc8818c0..54da15849 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -315,7 +315,7 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issues/1197): Device number. + // TODO(gvisor.dev/issue/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index f14c336b9..06a5b53bc 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -663,7 +663,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error // This is a hack to work around the fact that both IPv4 and IPv6 ANY are // represented by the empty string. // -// TODO(gvisor.dev/issues/1556): remove this function. +// TODO(gvisor.dev/issue/1556): remove this function. func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 7792eb1a0..1b8ecc415 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -835,7 +835,8 @@ func superBlockOpts(mountPath string, mnt *Mount) string { // NOTE(b/147673608): If the mount is a cgroup, we also need to include // the cgroup name in the options. For now we just read that from the // path. - // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // + // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we // should get this value from the cgroup itself, and not rely on the // path. if mnt.fs.FilesystemType().Name() == "cgroup" { diff --git a/test/syscalls/linux/socket_inet_loopback.cc b/test/syscalls/linux/socket_inet_loopback.cc index 16888de2a..2ffc86382 100644 --- a/test/syscalls/linux/socket_inet_loopback.cc +++ b/test/syscalls/linux/socket_inet_loopback.cc @@ -234,7 +234,7 @@ TEST_P(DualStackSocketTest, AddressOperations) { } } -// TODO(gvisor.dev/issues/1556): uncomment V4MappedAny. +// TODO(gvisor.dev/issue/1556): uncomment V4MappedAny. INSTANTIATE_TEST_SUITE_P( All, DualStackSocketTest, ::testing::Combine( -- cgit v1.2.3 From dd3bc499970c22ebbd270030b4564e6b8e4e929e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 2 Apr 2020 19:37:41 -0700 Subject: Add NAME_MAX checks and update file times NAME_MAX should be enforced per filesystem implementation because other file systems may not have the same restriction. Gofer filesystem now keeps a reference to the kernel clock to avoid lookup in the Context on file access to update atime. Update access, modification, and status change times in tmpfs. Updates #1197, #1198. PiperOrigin-RevId: 304527148 --- pkg/sentry/fsimpl/gofer/directory.go | 7 +++- pkg/sentry/fsimpl/gofer/filesystem.go | 13 ++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 22 ++++++------- pkg/sentry/fsimpl/gofer/regular_file.go | 7 ++-- pkg/sentry/fsimpl/gofer/special_file.go | 4 +-- pkg/sentry/fsimpl/gofer/symlink.go | 2 +- pkg/sentry/fsimpl/gofer/time.go | 39 +++++++++++----------- pkg/sentry/fsimpl/kernfs/filesystem.go | 9 ++++++ pkg/sentry/fsimpl/tmpfs/directory.go | 2 ++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 25 +++++++++++++-- pkg/sentry/fsimpl/tmpfs/regular_file.go | 4 ++- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 57 ++++++++++++++++++++++++++++++--- 12 files changed, 139 insertions(+), 52 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5dbfc6250..49d9f859b 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -56,14 +56,19 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.mu.Lock() defer fd.mu.Unlock() + d := fd.dentry() if fd.dirents == nil { - ds, err := fd.dentry().getDirents(ctx) + ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } + if d.fs.opts.interop != InteropModeShared { + d.touchAtime(fd.vfsfd.Mount()) + } + for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 269624362..305228bda 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -356,7 +356,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name); err != nil { return err } - parent.touchCMtime(ctx) + if fs.opts.interop != InteropModeShared { + parent.touchCMtime() + } delete(parent.negativeChildren, name) parent.dirents = nil return nil @@ -454,7 +456,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return err } if fs.opts.interop != InteropModeShared { - parent.touchCMtime(ctx) + parent.touchCMtime() if dir { parent.decLinks() } @@ -802,7 +804,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving d.IncRef() // reference held by child on its parent d d.vfsd.InsertChild(&child.vfsd, name) if d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) delete(d.negativeChildren, name) d.dirents = nil } @@ -834,6 +835,9 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } + if d.fs.opts.interop != InteropModeShared { + d.touchCMtime() + } return childVFSFD, nil } @@ -975,6 +979,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.decLinks() newParent.incLinks() } + oldParent.touchCMtime() + newParent.touchCMtime() + renamed.touchCtime() } vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) return nil diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8e41b6b1c..adee8bb60 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -44,6 +44,7 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -72,6 +73,9 @@ type filesystem struct { // client is the client used by this filesystem. client is immutable. client *p9.Client + // clock is a realtime clock used to set timestamps in file operations. + clock ktime.Clock + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -376,6 +380,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, client: client, + clock: ktime.RealtimeClockFromContext(ctx), dentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -779,10 +784,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // data, so there's no cache to truncate either.) return nil } - now, haveNow := nowFromContext(ctx) - if !haveNow { - ctx.Warningf("gofer.dentry.setStat: current time not available") - } + now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -794,25 +796,19 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if setLocalAtime { if stat.Atime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.atime, now) - } + atomic.StoreInt64(&d.atime, now) } else { atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) } } if setLocalMtime { if stat.Mtime.Nsec == linux.UTIME_NOW { - if haveNow { - atomic.StoreInt64(&d.mtime, now) - } + atomic.StoreInt64(&d.mtime, now) } else { atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) } } - if haveNow { - atomic.StoreInt64(&d.ctime, now) - } + atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { d.dataMu.Lock() oldSize := d.size diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 3593eb1d5..857f7c74e 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -104,7 +104,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } return n, err } @@ -139,10 +139,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). - if now, ok := nowFromContext(ctx); ok { - atomic.StoreInt64(&d.mtime, now) - atomic.StoreInt64(&d.ctime, now) - } + d.touchCMtimeLocked() } if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Write dirty cached pages that will be touched by the write back to diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 274f7346f..507e0e276 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -76,7 +76,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, fd.vfsfd.Mount()) + d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) @@ -117,7 +117,7 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Do a buffered write. See rationale in PRead. if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { - d.touchCMtime(ctx) + d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go index adf43be60..2ec819f86 100644 --- a/pkg/sentry/fsimpl/gofer/symlink.go +++ b/pkg/sentry/fsimpl/gofer/symlink.go @@ -27,7 +27,7 @@ func (d *dentry) isSymlink() bool { // Precondition: d.isSymlink(). func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if d.fs.opts.interop != InteropModeShared { - d.touchAtime(ctx, mnt) + d.touchAtime(mnt) d.dataMu.Lock() if d.haveTarget { target := d.target diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 7598ec6a8..2608e7e1d 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -18,8 +18,6 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -38,23 +36,12 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -func nowFromContext(ctx context.Context) (int64, bool) { - if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { - return clock.Now().Nanoseconds(), true - } - return 0, false -} - // Preconditions: fs.interop != InteropModeShared. -func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { +func (d *dentry) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now, ok := nowFromContext(ctx) - if !ok { - mnt.EndWrite() - return - } + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) d.metadataMu.Unlock() @@ -63,13 +50,25 @@ func (d *dentry) touchAtime(ctx context.Context, mnt *vfs.Mount) { // Preconditions: fs.interop != InteropModeShared. The caller has successfully // called vfs.Mount.CheckBeginWrite(). -func (d *dentry) touchCMtime(ctx context.Context) { - now, ok := nowFromContext(ctx) - if !ok { - return - } +func (d *dentry) touchCtime() { + now := d.fs.clock.Now().Nanoseconds() + d.metadataMu.Lock() + atomic.StoreInt64(&d.ctime, now) + d.metadataMu.Unlock() +} + +// Preconditions: fs.interop != InteropModeShared. The caller has successfully +// called vfs.Mount.CheckBeginWrite(). +func (d *dentry) touchCMtime() { + now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) d.metadataMu.Unlock() } + +func (d *dentry) touchCMtimeLocked() { + now := d.fs.clock.Now().Nanoseconds() + atomic.StoreInt64(&d.mtime, now) + atomic.StoreInt64(&d.ctime, now) +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index a429fa23d..89f5da3d4 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -63,6 +63,9 @@ afterSymlink: rp.Advance() return nextVFSD, nil } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } d.dirMu.Lock() nextVFSD, err := rp.ResolveChild(vfsd, name) if err != nil { @@ -191,6 +194,9 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if pc == "." || pc == ".." { return "", syserror.EEXIST } + if len(pc) > linux.NAME_MAX { + return "", syserror.ENAMETOOLONG + } childVFSD, err := rp.ResolveChild(parentVFSD, pc) if err != nil { return "", err @@ -433,6 +439,9 @@ afterTrailingSymlink: if pc == "." || pc == ".." { return nil, syserror.EISDIR } + if len(pc) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } // Determine whether or not we need to create a file. childVFSD, err := rp.ResolveChild(parentVFSD, pc) if err != nil { diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 37c75ab64..45712c9b9 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -68,6 +68,8 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs.mu.Lock() defer fs.mu.Unlock() + fd.inode().touchAtime(fd.vfsfd.Mount()) + if fd.off == 0 { if err := cb.Handle(vfs.Dirent{ Name: ".", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 4cf27bf13..1978af69c 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -46,6 +46,9 @@ func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { return nil, err } afterSymlink: + if len(rp.Component()) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } nextVFSD, err := rp.ResolveComponent(&d.vfsd) if err != nil { return nil, err @@ -133,6 +136,9 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if name == "." || name == ".." { return syserror.EEXIST } + if len(name) > linux.NAME_MAX { + return syserror.ENAMETOOLONG + } // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), // because if the child exists we want to return EEXIST immediately instead // of attempting symlink/mount traversal. @@ -153,7 +159,11 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return err } defer mnt.EndWrite() - return create(parent, name) + if err := create(parent, name); err != nil { + return err + } + parent.inode.touchCMtime() + return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. @@ -328,7 +338,12 @@ afterTrailingSymlink: child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) parent.vfsd.InsertChild(&child.vfsd, name) parent.inode.impl.(*directory).childList.PushBack(child) - return child.open(ctx, rp, &opts, true) + fd, err := child.open(ctx, rp, &opts, true) + if err != nil { + return nil, err + } + parent.inode.touchCMtime() + return fd, nil } if err != nil { return nil, err @@ -398,6 +413,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st if !ok { return "", syserror.EINVAL } + symlink.inode.touchAtime(rp.Mount()) return symlink.target, nil } @@ -515,6 +531,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.inode.decLinksLocked() newParent.inode.incLinksLocked() } + oldParent.inode.touchCMtime() + newParent.inode.touchCMtime() + renamed.inode.touchCtime() // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory // sizes. vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) @@ -565,6 +584,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error parent.inode.decLinksLocked() // from child's ".." child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(childVFSD) + parent.inode.touchCMtime() return nil } @@ -654,6 +674,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parent.inode.impl.(*directory).childList.Remove(child) child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(childVFSD) + parent.inode.touchCMtime() return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 26cd65605..57e5e28ec 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -286,7 +286,8 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs rw := getRegularFileReadWriter(f, offset) n, err := dst.CopyOutFrom(ctx, rw) putRegularFileReadWriter(rw) - return int64(n), err + fd.inode().touchAtime(fd.vfsfd.Mount()) + return n, err } // Read implements vfs.FileDescriptionImpl.Read. @@ -323,6 +324,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) + fd.inode().touchCMtimeLocked() f.inode.mu.Unlock() putRegularFileReadWriter(rw) return n, err diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 54da15849..ad47288f8 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -385,28 +385,41 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EINVAL } } + now := i.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { - atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + if stat.Atime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.atime, now) + } else { + atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) + } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { - atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + if stat.Mtime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.mtime, now) + } else { + atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) + } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { - atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + if stat.Ctime.Nsec == linux.UTIME_NOW { + atomic.StoreInt64(&i.ctime, now) + } else { + atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) + } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } - now := i.clock.Now().Nanoseconds() if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } + i.mu.Unlock() return nil } @@ -484,6 +497,42 @@ func (i *inode) isDir() bool { return linux.FileMode(i.mode).FileType() == linux.S_IFDIR } +func (i *inode) touchAtime(mnt *vfs.Mount) { + if err := mnt.CheckBeginWrite(); err != nil { + return + } + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.atime, now) + i.mu.Unlock() + mnt.EndWrite() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). +func (i *inode) touchCMtime() { + now := i.clock.Now().Nanoseconds() + i.mu.Lock() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) + i.mu.Unlock() +} + +// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds +// inode.mu. +func (i *inode) touchCMtimeLocked() { + now := i.clock.Now().Nanoseconds() + atomic.StoreInt64(&i.mtime, now) + atomic.StoreInt64(&i.ctime, now) +} + // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { -- cgit v1.2.3 From 5818663ebe26857845685702d99db41c7aa2cf3d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 3 Apr 2020 14:07:42 -0700 Subject: Add FileDescriptionImpl for Unix sockets. This change involves several steps: - Refactor the VFS1 unix socket implementation to share methods between VFS1 and VFS2 where possible. Re-implement the rest. - Override the default PRead, Read, PWrite, Write, Ioctl, Release methods in FileDescriptionDefaultImpl. - Add functions to create and initialize a new Dentry/Inode and FileDescription for a Unix socket file. Updates #1476 PiperOrigin-RevId: 304689796 --- pkg/sentry/fsimpl/sockfs/BUILD | 1 + pkg/sentry/fsimpl/sockfs/sockfs.go | 29 +++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/socket/netstack/netstack.go | 8 +- pkg/sentry/socket/unix/BUILD | 4 + pkg/sentry/socket/unix/unix.go | 89 ++++++--- pkg/sentry/socket/unix/unix_vfs2.go | 348 +++++++++++++++++++++++++++++++++ pkg/sentry/vfs/options.go | 5 + 8 files changed, 456 insertions(+), 29 deletions(-) create mode 100644 pkg/sentry/socket/unix/unix_vfs2.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 790d50e65..52084ddb5 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -7,6 +7,7 @@ go_library( srcs = ["sockfs.go"], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index c13511de2..3f7ad1d65 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -16,6 +16,7 @@ package sockfs import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -60,6 +61,10 @@ type filesystem struct { } // inode implements kernfs.Inode. +// +// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are +// not included in InodeAttrs) to store the numbers of the appropriate +// socket device. Override InodeAttrs.Stat() accordingly. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -71,3 +76,27 @@ type inode struct { func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } + +// InitSocket initializes a socket FileDescription, with a corresponding +// Dentry in mnt. +// +// fd should be the FileDescription associated with socketImpl, i.e. its first +// field. mnt should be the global socket mount, Kernel.socketMount. +func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error { + fsimpl := mnt.Filesystem().Impl() + fs := fsimpl.(*kernfs.Filesystem) + + // File mode matches net/socket.c:sock_alloc. + filemode := linux.FileMode(linux.S_IFSOCK | 0600) + i := &inode{} + i.Init(creds, fs.NextIno(), filemode) + + d := &kernfs.Dentry{} + d.Init(i) + + opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true} + if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil { + return err + } + return nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index bb7e3cbc3..e0ff58d8c 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -169,6 +169,7 @@ go_library( "//pkg/sentry/fs/lock", "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 06a5b53bc..5d0085462 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -940,7 +940,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. -func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { +func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) @@ -966,7 +966,7 @@ func GetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. -func getSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { +func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (interface{}, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: @@ -1541,7 +1541,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // SetSockOpt can be used to implement the linux syscall setsockopt(2) for // sockets backed by a commonEndpoint. -func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { +func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: return setSockOptSocket(t, s, ep, name, optVal) @@ -1568,7 +1568,7 @@ func SetSockOpt(t *kernel.Task, s socket.Socket, ep commonEndpoint, level int, n } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET. -func setSockOptSocket(t *kernel.Task, s socket.Socket, ep commonEndpoint, name int, optVal []byte) *syserr.Error { +func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 08743deba..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -8,23 +8,27 @@ go_library( "device.go", "io.go", "unix.go", + "unix_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 4d30aa714..7c64f30fa 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" @@ -33,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -52,11 +54,8 @@ type SocketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` - refs.AtomicRefCount - socket.SendReceiveTimeout - ep transport.Endpoint - stype linux.SockType + socketOpsCommon } // New creates a new unix socket. @@ -75,16 +74,29 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty } s := SocketOperations{ - ep: ep, - stype: stype, + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, } s.EnableLeakCheck("unix.SocketOperations") return fs.NewFile(ctx, d, flags, &s) } +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { + refs.AtomicRefCount + socket.SendReceiveTimeout + + ep transport.Endpoint + stype linux.SockType +} + // DecRef implements RefCounter.DecRef. -func (s *SocketOperations) DecRef() { +func (s *socketOpsCommon) DecRef() { s.DecRefWithDestructor(func() { s.ep.Close() }) @@ -97,7 +109,7 @@ func (s *SocketOperations) Release() { s.DecRef() } -func (s *SocketOperations) isPacket() bool { +func (s *socketOpsCommon) isPacket() bool { switch s.stype { case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET: return true @@ -110,7 +122,7 @@ func (s *SocketOperations) isPacket() bool { } // Endpoint extracts the transport.Endpoint. -func (s *SocketOperations) Endpoint() transport.Endpoint { +func (s *socketOpsCommon) Endpoint() transport.Endpoint { return s.ep } @@ -143,7 +155,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) { // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -155,7 +167,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // GetSockName implements the linux syscall getsockname(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.ep.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -178,7 +190,7 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr us // Listen implements the linux syscall listen(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return s.ep.Listen(backlog) } @@ -310,6 +322,8 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Create the socket. + // + // TODO(gvisor.dev/issue/2324): Correctly set file permissions. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse @@ -345,6 +359,31 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, return ep, nil } + if kernel.VFS2Enabled { + p := fspath.Parse(path) + root := t.FSContext().RootDirectoryVFS2() + start := root + relPath := !p.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: p, + FollowFinalSymlink: true, + } + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop) + root.DecRef() + if relPath { + start.DecRef() + } + if e != nil { + return nil, syserr.FromError(e) + } + return ep, nil + } + // Find the node in the filesystem. root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() @@ -363,12 +402,11 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, // No socket! return nil, syserr.ErrConnectionRefused } - return ep, nil } // Connect implements the linux syscall connect(2) for unix sockets. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { ep, err := extractEndpoint(t, sockaddr) if err != nil { return err @@ -379,7 +417,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo return s.ep.Connect(t, ep) } -// Writev implements fs.FileOperations.Write. +// Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep, nil) @@ -399,7 +437,7 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Ctx: t, Endpoint: s.ep, @@ -453,27 +491,27 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] } // Passcred implements transport.Credentialer.Passcred. -func (s *SocketOperations) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { return s.ep.Passcred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *SocketOperations) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } // Readiness implements waiter.Waitable.Readiness. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.ep.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *SocketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *SocketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } @@ -485,7 +523,7 @@ func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVa // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := netstack.ConvertShutdown(how) if err != nil { return err @@ -511,7 +549,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -648,12 +686,12 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // State implements socket.Socket.State. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. -func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { // Unix domain sockets always have a protocol of 0. return linux.AF_UNIX, s.stype, 0 } @@ -706,4 +744,5 @@ func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.F func init() { socket.RegisterProvider(linux.AF_UNIX, &provider{}) + socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{}) } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go new file mode 100644 index 000000000..ca1388e2c --- /dev/null +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -0,0 +1,348 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unix + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 implements socket.SocketVFS2 (and by extension, +// vfs.FileDescriptionImpl) for Unix sockets. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +// NewVFS2File creates and returns a new vfs.FileDescription for a unix socket. +func NewVFS2File(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { + sock := NewFDImpl(ep, stype) + vfsfd := &sock.vfsfd + if err := sockfs.InitSocket(sock, vfsfd, t.Kernel().SocketMount(), t.Credentials()); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// NewFDImpl creates and returns a new SocketVFS2. +func NewFDImpl(ep transport.Endpoint, stype linux.SockType) *SocketVFS2 { + // You can create AF_UNIX, SOCK_RAW sockets. They're the same as + // SOCK_DGRAM and don't require CAP_NET_RAW. + if stype == linux.SOCK_RAW { + stype = linux.SOCK_DGRAM + } + + return &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ep: ep, + stype: stype, + }, + } +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen) +} + +// blockingAccept implements a blocking version of accept(2), that is, if no +// connections are ready to be accept, it will block until one becomes ready. +func (s *SocketVFS2) blockingAccept(t *kernel.Task) (transport.Endpoint, *syserr.Error) { + // Register for notifications. + e, ch := waiter.NewChannelEntry(nil) + s.socketOpsCommon.EventRegister(&e, waiter.EventIn) + defer s.socketOpsCommon.EventUnregister(&e) + + // Try to accept the connection; if it fails, then wait until we get a + // notification. + for { + if ep, err := s.ep.Accept(); err != syserr.ErrWouldBlock { + return ep, err + } + + if err := t.Block(ch); err != nil { + return nil, syserr.FromError(err) + } + } +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, err := s.ep.Accept() + if err != nil { + if err != syserr.ErrWouldBlock || !blocking { + return 0, nil, 0, err + } + + var err *syserr.Error + ep, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + // We expect this to be a FileDescription here. + ns, err := NewVFS2File(t, ep, s.stype) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if flags&linux.SOCK_NONBLOCK != 0 { + ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + if e != nil { + return 0, nil, 0, syserr.FromError(e) + } + + // TODO: add vfs2 sockets to global table. + return fd, addr, addrLen, nil +} + +// Bind implements the linux syscall bind(2) for unix sockets. +func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { + p, e := extractPath(sockaddr) + if e != nil { + return e + } + + bep, ok := s.ep.(transport.BoundEndpoint) + if !ok { + // This socket can't be bound. + return syserr.ErrInvalidArgument + } + + return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { + // Is it abstract? + if p[0] == 0 { + if t.IsNetworkNamespaced() { + return syserr.ErrInvalidEndpointState + } + if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + // syserr.ErrPortInUse corresponds to EADDRINUSE. + return syserr.ErrPortInUse + } + } else { + path := fspath.Parse(p) + root := t.FSContext().RootDirectoryVFS2() + defer root.DecRef() + start := root + relPath := !path.Absolute + if relPath { + start = t.FSContext().WorkingDirectoryVFS2() + defer start.DecRef() + } + pop := vfs.PathOperation{ + Root: root, + Start: start, + Path: path, + } + err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // TODO(gvisor.dev/issue/2324): The file permissions should be taken + // from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind), + // but VFS1 just always uses 0400. Resolve this inconsistency. + Mode: linux.S_IFSOCK | 0400, + Endpoint: bep, + }) + if err == syserror.EEXIST { + return syserr.ErrAddressInUse + } + return syserr.FromError(err) + } + + return nil + }) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return netstack.Ioctl(ctx, s.ep, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &EndpointReader{ + Ctx: ctx, + Endpoint: s.ep, + NumRights: 0, + Peek: false, + From: nil, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + t := kernel.TaskFromContext(ctx) + ctrl := control.New(t, s.ep, nil) + + if src.NumBytes() == 0 { + nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) + return int64(nInt), err.ToError() + } + + return src.CopyInTo(ctx, &EndpointWriter{ + Ctx: ctx, + Endpoint: s.ep, + Control: ctrl, + To: nil, + }) +} + +// Release implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Release() { + // Release only decrements a reference on s because s may be referenced in + // the abstract socket namespace. + s.DecRef() +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// a transport.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) +} + +// providerVFS2 is a unix domain socket provider for VFS2. +type providerVFS2 struct{} + +func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, syserr.ErrProtocolNotSupported + } + + // Create the endpoint and socket. + var ep transport.Endpoint + switch stype { + case linux.SOCK_DGRAM, linux.SOCK_RAW: + ep = transport.NewConnectionless(t) + case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: + ep = transport.NewConnectioned(t, stype, t.Kernel()) + default: + return nil, syserr.ErrInvalidArgument + } + + f, err := NewVFS2File(t, ep, stype) + if err != nil { + ep.Close() + return nil, err + } + return f, nil +} + +// Pair creates a new pair of AF_UNIX connected sockets. +func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Check arguments. + if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { + return nil, nil, syserr.ErrProtocolNotSupported + } + + switch stype { + case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: + // Ok + default: + return nil, nil, syserr.ErrInvalidArgument + } + + // Create the endpoints and sockets. + ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) + s1, err := NewVFS2File(t, ep1, stype) + if err != nil { + ep1.Close() + ep2.Close() + return nil, nil, err + } + s2, err := NewVFS2File(t, ep2, stype) + if err != nil { + s1.DecRef() + ep2.Close() + return nil, nil, err + } + + return s1, s2, nil +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 3e90dc4ed..2f04bf882 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -16,6 +16,7 @@ package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and @@ -44,6 +45,10 @@ type MknodOptions struct { // DevMinor are the major and minor device numbers for the created device. DevMajor uint32 DevMinor uint32 + + // Endpoint is the endpoint to bind to the created file, if a socket file is + // being created for bind(2) on a Unix domain socket. + Endpoint transport.BoundEndpoint } // MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC. -- cgit v1.2.3 From 24bee1c1813a691072cff5bad7a528690a99eb5e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 4 Apr 2020 21:01:42 -0700 Subject: Record VFS2 sockets in global socket map. Updates #1476, #1478, #1484, #1485. PiperOrigin-RevId: 304845354 --- pkg/sentry/fsimpl/proc/BUILD | 1 - pkg/sentry/fsimpl/proc/task_net.go | 88 ++++++++++++++++++++++--------------- pkg/sentry/kernel/kernel.go | 30 +++++++++++-- pkg/sentry/socket/socket.go | 6 ++- pkg/sentry/socket/unix/unix_vfs2.go | 2 +- pkg/sentry/vfs/file_description.go | 6 +++ 6 files changed, 91 insertions(+), 42 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 8156984eb..17c1342b5 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -22,7 +22,6 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", - "//pkg/sentry/fs", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 373a7b17d..6b2a77328 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -32,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/usermem" @@ -206,22 +206,21 @@ var _ dynamicInode = (*netUnixData)(nil) func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { + if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { s.DecRef() // Not a unix socket. continue } - sops := sfile.FileOperations.(*unix.SocketOperations) + sops := s.Impl().(*unix.SocketVFS2) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { - log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err) + log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "" } @@ -234,6 +233,15 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } } + // Get inode number. + var ino uint64 + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve ino for socket file: %v", statErr) + } else { + ino = stat.Ino + } + // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address to the struct // unix_sock representing a socket in the kernel, but may redact the @@ -252,14 +260,14 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { // the definition of this struct changes over time. // // For now, we always redact this pointer. - fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d", + fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. - sfile.ReadRefs()-1, // RefCount, don't count our own ref. + s.Refs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. - sfile.InodeID(), // Inode. + ino, // Inode. ) // Path @@ -341,15 +349,14 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef() @@ -398,14 +405,15 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. @@ -413,11 +421,16 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: refcount. Don't count the ref we obtain while deferencing // the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. @@ -499,15 +512,14 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) for _, se := range d.kernel.ListSockets() { - s := se.Sock.Get() - if s == nil { - log.Debugf("Couldn't resolve weakref with ID %v in socket table, racing with destruction?", se.ID) + s := se.SockVFS2 + if !s.TryIncRef() { + log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s) continue } - sfile := s.(*fs.File) - sops, ok := sfile.FileOperations.(socket.Socket) + sops, ok := s.Impl().(socket.SocketVFS2) if !ok { - panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) + panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef() @@ -551,25 +563,31 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) + stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) + // Field: uid. - uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx) - if err != nil { - log.Warningf("Failed to retrieve unstable attr for socket file: %v", err) + if statErr != nil || stat.Mask&linux.STATX_UID == 0 { + log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) - fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(creds.UserNamespace).OrOverflow())) + fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. - fmt.Fprintf(buf, "%8d ", sfile.InodeID()) + if statErr != nil || stat.Mask&linux.STATX_INO == 0 { + log.Warningf("Failed to retrieve inode for socket file: %v", statErr) + fmt.Fprintf(buf, "%8d ", 0) + } else { + fmt.Fprintf(buf, "%8d ", stat.Ino) + } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while deferencing the weakref to this socket. - fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1) + fmt.Fprintf(buf, "%d ", s.Refs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2e6f42b92..ba8935a82 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1445,9 +1445,10 @@ func (k *Kernel) SupervisorContext() context.Context { // +stateify savable type SocketEntry struct { socketEntry - k *Kernel - Sock *refs.WeakRef - ID uint64 // Socket table entry number. + k *Kernel + Sock *refs.WeakRef + SockVFS2 *vfs.FileDescription + ID uint64 // Socket table entry number. } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. @@ -1470,7 +1471,30 @@ func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Unlock() } +// RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for +// tracking. +// +// Precondition: Caller must hold a reference to sock. +// +// Note that the socket table will not hold a reference on the +// vfs.FileDescription, because we do not support weak refs on VFS2 files. +func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { + k.extMu.Lock() + id := k.nextSocketEntry + k.nextSocketEntry++ + s := &SocketEntry{ + k: k, + ID: id, + SockVFS2: sock, + } + k.sockets.PushBack(s) + k.extMu.Unlock() +} + // ListSockets returns a snapshot of all sockets. +// +// Callers of ListSockets() in VFS2 should use SocketEntry.SockVFS2.TryIncRef() +// to get a reference on a socket in the table. func (k *Kernel) ListSockets() []*SocketEntry { k.extMu.Lock() var socks []*SocketEntry diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index b5ba4a56b..6580bd6e9 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -269,7 +269,7 @@ func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*v return nil, err } if s != nil { - // TODO: Add vfs2 sockets to global socket table. + t.Kernel().RecordSocketVFS2(s) return s, nil } } @@ -291,7 +291,9 @@ func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (* return nil, nil, err } if s1 != nil && s2 != nil { - // TODO: Add vfs2 sockets to global socket table. + k := t.Kernel() + k.RecordSocketVFS2(s1) + k.RecordSocketVFS2(s2) return s1, s2, nil } } diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index ca1388e2c..3e54d49c4 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -141,7 +141,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block return 0, nil, 0, syserr.FromError(e) } - // TODO: add vfs2 sockets to global table. + t.Kernel().RecordSocketVFS2(ns) return fd, addr, addrLen, nil } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5df4bbf45..28e93a441 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -182,6 +182,12 @@ func (fd *FileDescription) DecRef() { } } +// Refs returns the current number of references. The returned count +// is inherently racy and is unsafe to use without external synchronization. +func (fd *FileDescription) Refs() int64 { + return atomic.LoadInt64(&fd.refs) +} + // Mount returns the mount on which fd was opened. It does not take a reference // on the returned Mount. func (fd *FileDescription) Mount() *Mount { -- cgit v1.2.3 From 00d9776a4bb1cc1d7125af7d3e54a939a4f3847a Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 6 Apr 2020 07:30:20 -0700 Subject: Add socket files to tmpfs VFS2. Updates #1476. PiperOrigin-RevId: 305024274 --- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 20 +++++++++++++++----- pkg/sentry/fsimpl/tmpfs/socket_file.go | 34 ++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 4 +++- 4 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 pkg/sentry/fsimpl/tmpfs/socket_file.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 6ea35affb..f2ac23c88 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -24,6 +24,7 @@ go_library( "filesystem.go", "named_pipe.go", "regular_file.go", + "socket_file.go", "symlink.go", "tmpfs.go", ], diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 1978af69c..5339d7072 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -261,8 +261,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v case linux.S_IFCHR: childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFSOCK: - // Not yet supported. - return syserror.EPERM + childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint) default: return syserror.EINVAL } @@ -396,6 +395,8 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) + case *socketFile: + return nil, syserror.ENXIO default: panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) } @@ -679,10 +680,19 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -// -// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { - return nil, syserror.ECONNREFUSED + fs.mu.RLock() + defer fs.mu.RUnlock() + d, err := resolveLocked(rp) + if err != nil { + return nil, err + } + switch impl := d.inode.impl.(type) { + case *socketFile: + return impl.ep, nil + default: + return nil, syserror.ECONNREFUSED + } } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go new file mode 100644 index 000000000..25c2321af --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -0,0 +1,34 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" +) + +// socketFile is a socket (=S_IFSOCK) tmpfs file. +type socketFile struct { + inode inode + ep transport.BoundEndpoint +} + +func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode { + file := &socketFile{ep: ep} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index ad47288f8..654e788e3 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -331,7 +331,7 @@ func (i *inode) statTo(stat *linux.Statx) { case *deviceFile: stat.RdevMajor = impl.major stat.RdevMinor = impl.minor - case *directory, *namedPipe: + case *socketFile, *directory, *namedPipe: // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) @@ -479,6 +479,8 @@ func (i *inode) direntType() uint8 { return linux.DT_DIR case *symlink: return linux.DT_LNK + case *socketFile: + return linux.DT_SOCK case *deviceFile: switch impl.kind { case vfs.BlockDevice: -- cgit v1.2.3 From dd98fdd5beb7f02e7c7b3aeb4f07f5d00ffc41e7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 6 Apr 2020 16:31:27 -0700 Subject: Correctly implement magic symlinks in VFS2 procfs. Updates #1195 PiperOrigin-RevId: 305143567 --- pkg/sentry/fsbridge/vfs.go | 28 +++++++++++++--------- pkg/sentry/fsimpl/kernfs/filesystem.go | 36 +++++++++++++++++++---------- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 5 ++++ pkg/sentry/fsimpl/kernfs/kernfs.go | 16 ++++++++++++- pkg/sentry/fsimpl/kernfs/symlink.go | 5 ++++ pkg/sentry/fsimpl/proc/task_fds.go | 6 +++++ pkg/sentry/fsimpl/proc/task_files.go | 17 ++++++++++++++ pkg/sentry/fsimpl/proc/tasks_files.go | 10 ++++++++ 8 files changed, 99 insertions(+), 24 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 79b808359..89168220a 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -26,22 +26,22 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// fsFile implements File interface over vfs.FileDescription. +// VFSFile implements File interface over vfs.FileDescription. // // +stateify savable -type vfsFile struct { +type VFSFile struct { file *vfs.FileDescription } -var _ File = (*vfsFile)(nil) +var _ File = (*VFSFile)(nil) // NewVFSFile creates a new File over fs.File. func NewVFSFile(file *vfs.FileDescription) File { - return &vfsFile{file: file} + return &VFSFile{file: file} } // PathnameWithDeleted implements File. -func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string { +func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) defer root.DecRef() @@ -51,7 +51,7 @@ func (f *vfsFile) PathnameWithDeleted(ctx context.Context) string { } // ReadFull implements File. -func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { +func (f *VFSFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { var total int64 for dst.NumBytes() > 0 { n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{}) @@ -67,12 +67,12 @@ func (f *vfsFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset i } // ConfigureMMap implements File. -func (f *vfsFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { +func (f *VFSFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return f.file.ConfigureMMap(ctx, opts) } // Type implements File. -func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) { +func (f *VFSFile) Type(ctx context.Context) (linux.FileMode, error) { stat, err := f.file.Stat(ctx, vfs.StatOptions{}) if err != nil { return 0, err @@ -81,15 +81,21 @@ func (f *vfsFile) Type(ctx context.Context) (linux.FileMode, error) { } // IncRef implements File. -func (f *vfsFile) IncRef() { +func (f *VFSFile) IncRef() { f.file.IncRef() } // DecRef implements File. -func (f *vfsFile) DecRef() { +func (f *VFSFile) DecRef() { f.file.DecRef() } +// FileDescription returns the FileDescription represented by f. It does not +// take an additional reference on the returned FileDescription. +func (f *VFSFile) FileDescription() *vfs.FileDescription { + return f.file +} + // fsLookup implements Lookup interface using fs.File. // // +stateify savable @@ -132,5 +138,5 @@ func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.Open if err != nil { return nil, err } - return &vfsFile{file: fd}, nil + return &VFSFile{file: fd}, nil } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 89f5da3d4..16a3c18ae 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -79,16 +79,22 @@ afterSymlink: } // Resolve any symlink at current path component. if rp.ShouldFollowSymlink() && next.isSymlink() { - // TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks". - target, err := next.inode.Readlink(ctx) + targetVD, targetPathname, err := next.inode.Getlink(ctx) if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { - return nil, err + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() + if err != nil { + return nil, err + } + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { + return nil, err + } } goto afterSymlink - } rp.Advance() return &next.vfsd, nil @@ -470,19 +476,25 @@ afterTrailingSymlink: } childDentry := childVFSD.Impl().(*Dentry) childInode := childDentry.inode - if rp.ShouldFollowSymlink() { - if childDentry.isSymlink() { - target, err := childInode.Readlink(ctx) + if rp.ShouldFollowSymlink() && childDentry.isSymlink() { + targetVD, targetPathname, err := childInode.Getlink(ctx) + if err != nil { + return nil, err + } + if targetVD.Ok() { + err := rp.HandleJump(targetVD) + targetVD.DecRef() if err != nil { return nil, err } - if err := rp.HandleSymlink(target); err != nil { + } else { + if err := rp.HandleSymlink(targetPathname); err != nil { return nil, err } - // rp.Final() may no longer be true since we now need to resolve the - // symlink target. - goto afterTrailingSymlink } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink } if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 5c84b10c9..65f09af5d 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -181,6 +181,11 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) { return "", syserror.EINVAL } +// Getlink implements Inode.Getlink. +func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, "", syserror.EINVAL +} + // InodeAttrs partially implements the Inode interface, specifically the // inodeMetadata sub interface. InodeAttrs provides functionality related to // inode attributes. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 2cefef020..ad76b9f64 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -414,7 +414,21 @@ type inodeDynamicLookup interface { } type inodeSymlink interface { - // Readlink resolves the target of a symbolic link. If an inode is not a + // Readlink returns the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. Readlink(ctx context.Context) (string, error) + + // Getlink returns the target of a symbolic link, as used by path + // resolution: + // + // - If the inode is a "magic link" (a link whose target is most accurately + // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "", + // nil). A reference is taken on the returned VirtualDentry. + // + // - If the inode is an ordinary symlink, Getlink returns (zero-value + // VirtualDentry, symlink target, nil). + // + // - If the inode is not a symlink, Getlink returns (zero-value + // VirtualDentry, "", EINVAL). + Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 5918d3309..018aa503c 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -55,6 +55,11 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { return s.target, nil } +// Getlink implements Inode.Getlink. +func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) { + return vfs.VirtualDentry{}, s.target, nil +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 76bfc5307..9c8656b28 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -196,6 +196,12 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry()) } +func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + vd := s.file.VirtualDentry() + vd.IncRef() + return vd, "", nil +} + func (s *fdSymlink) DecRef() { s.AtomicRefCount.DecRefWithDestructor(func() { s.Destroy() diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index df0d1bcc5..88ea6a6d8 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -610,6 +610,23 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { return exec.PathnameWithDeleted(ctx), nil } +// Getlink implements kernfs.Inode.Getlink. +func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + if !kernel.ContextCanTrace(ctx, s.task, false) { + return vfs.VirtualDentry{}, "", syserror.EACCES + } + + exec, err := s.executable() + if err != nil { + return vfs.VirtualDentry{}, "", err + } + defer exec.DecRef() + + vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() + vd.IncRef() + return vd, "", nil +} + func (s *exeSymlink) executable() (file fsbridge.File, err error) { s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 882c1981e..4621e2de0 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -63,6 +63,11 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } +func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM @@ -101,6 +106,11 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } +func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + target, err := s.Readlink(ctx) + return vfs.VirtualDentry{}, target, err +} + // SetStat implements Inode.SetStat not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return syserror.EPERM -- cgit v1.2.3 From 94319a8241cb299edc812024d6132b7a3819a4dc Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 7 Apr 2020 09:40:38 -0700 Subject: Make gofer.dentry.destroyLocked idempotent gofer operations accumulate dentries touched in a slice to call checkCachingLocked on them when the operation is over. In case the same dentry is touched multiple times during the operation, checkCachingLocked, and consequently destroyLocked, may be called more than once for the same dentry. Updates #1198 PiperOrigin-RevId: 305276819 --- pkg/sentry/fsimpl/gofer/BUILD | 12 ++++++- pkg/sentry/fsimpl/gofer/gofer.go | 36 +++++++++++++++++--- pkg/sentry/fsimpl/gofer/gofer_test.go | 64 +++++++++++++++++++++++++++++++++++ test/syscalls/linux/open.cc | 22 ++++++++++++ 4 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 pkg/sentry/fsimpl/gofer/gofer_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index d15a36709..99d1e3f8f 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) @@ -54,3 +54,13 @@ go_library( "//pkg/usermem", ], ) + +go_test( + name = "gofer_test", + srcs = ["gofer_test.go"], + library = ":gofer", + deps = [ + "//pkg/p9", + "//pkg/sentry/contexttest", + ], +) diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index adee8bb60..20edaf643 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -444,7 +444,8 @@ type dentry struct { // refs is the reference count. Each dentry holds a reference on its // parent, even if disowned. refs is accessed using atomic memory - // operations. + // operations. When refs reaches 0, the dentry may be added to the cache or + // destroyed. If refs==-1 the dentry has already been destroyed. refs int64 // fs is the owning filesystem. fs is immutable. @@ -860,7 +861,7 @@ func (d *dentry) IncRef() { func (d *dentry) TryIncRef() bool { for { refs := atomic.LoadInt64(&d.refs) - if refs == 0 { + if refs <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { @@ -883,13 +884,20 @@ func (d *dentry) DecRef() { // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // +// It may be called on a destroyed dentry. For example, +// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times +// for the same dentry when the dentry is visited more than once in the same +// operation. One of the calls may destroy the dentry, so subsequent calls will +// do nothing. +// // Preconditions: d.fs.renameMu must be locked for writing. func (d *dentry) checkCachingLocked() { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) - if atomic.LoadInt64(&d.refs) != 0 { + refs := atomic.LoadInt64(&d.refs) + if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -897,6 +905,10 @@ func (d *dentry) checkCachingLocked() { } return } + if refs == -1 { + // Dentry has already been destroyed. + return + } // Non-child dentries with zero references are no longer reachable by path // resolution and should be dropped immediately. if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { @@ -949,9 +961,22 @@ func (d *dentry) checkCachingLocked() { } } +// destroyLocked destroys the dentry. It may flushes dirty pages from cache, +// close p9 file and remove reference on parent dentry. +// // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("dentry.destroyLocked() called on already destroyed dentry") + default: + panic("dentry.destroyLocked() called with references on the dentry") + } + ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { @@ -971,7 +996,10 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() - d.file.close(ctx) + if !d.file.isNil() { + d.file.close(ctx) + d.file = p9file{} + } // Remove d from the set of all dentries. d.fs.syncMu.Lock() delete(d.fs.dentries, d) diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go new file mode 100644 index 000000000..82bc239db --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync/atomic" + "testing" + + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/contexttest" +) + +func TestDestroyIdempotent(t *testing.T) { + fs := filesystem{ + dentries: make(map[*dentry]struct{}), + opts: filesystemOptions{ + // Test relies on no dentry being held in the cache. + maxCachedDentries: 0, + }, + } + + ctx := contexttest.Context(t) + attr := &p9.Attr{ + Mode: p9.ModeRegular, + } + mask := p9.AttrMask{ + Mode: true, + Size: true, + } + parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + + child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr) + if err != nil { + t.Fatalf("fs.newDentry(): %v", err) + } + parent.IncRef() // reference held by child on its parent. + parent.vfsd.InsertChild(&child.vfsd, "child") + + child.checkCachingLocked() + if got := atomic.LoadInt64(&child.refs); got != -1 { + t.Fatalf("child.refs=%d, want: -1", got) + } + // Parent will also be destroyed when child reference is removed. + if got := atomic.LoadInt64(&parent.refs); got != -1 { + t.Fatalf("parent.refs=%d, want: -1", got) + } + child.checkCachingLocked() + child.checkCachingLocked() +} diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 267ae19f6..640fe6bfc 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -186,6 +186,28 @@ TEST_F(OpenTest, OpenNoFollowStillFollowsLinksInPath) { ASSERT_NO_ERRNO_AND_VALUE(Open(path_via_symlink, O_RDONLY | O_NOFOLLOW)); } +// Test that open(2) can follow symlinks that point back to the same tree. +// Test sets up files as follows: +// root/child/symlink => redirects to ../.. +// root/child/target => regular file +// +// open("root/child/symlink/root/child/file") +TEST_F(OpenTest, SymlinkRecurse) { + auto root = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(GetAbsoluteTestTmpdir())); + auto child = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(root.path())); + auto symlink = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(child.path(), "../..")); + auto target = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(child.path(), "abc", 0644)); + auto path_via_symlink = + JoinPath(symlink.path(), Basename(root.path()), Basename(child.path()), + Basename(target.path())); + const auto contents = + ASSERT_NO_ERRNO_AND_VALUE(GetContents(path_via_symlink)); + ASSERT_EQ(contents, "abc"); +} + TEST_F(OpenTest, Fault) { char* totally_not_null = nullptr; ASSERT_THAT(open(totally_not_null, O_RDONLY), SyscallFailsWithErrno(EFAULT)); -- cgit v1.2.3 From 693b6bdda9206a5910c552a3997b2df5480d6947 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 7 Apr 2020 16:16:31 -0700 Subject: Correctly distinguish between seekable and non-seekable host fds. Check whether an fd is seekable by calling the seek syscall and examining the return value, instead of checking the file type, which is inaccurate. PiperOrigin-RevId: 305361593 --- pkg/sentry/fsimpl/host/host.go | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7d9dcd4c9..97fa7f7ab 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -74,31 +74,34 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err } // Retrieve metadata. - var s syscall.Stat_t - if err := syscall.Fstat(hostFD, &s); err != nil { + var s unix.Stat_t + if err := unix.Fstat(hostFD, &s); err != nil { return nil, err } fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() - // Pipes, character devices, and sockets. - isStream := fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK + + // Determine if hostFD is seekable. If not, this syscall will return ESPIPE + // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character + // devices. + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + seekable := err != syserror.ESPIPE i := &inode{ hostFD: hostFD, - isStream: isStream, + seekable: seekable, isTTY: isTTY, canMap: canMap(uint32(fileType)), ino: fs.NextIno(), mode: fileMode, - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. + // For simplicity, set offset to 0. Technically, we should use the existing + // offset on the host if the file is seekable. offset: 0, } - // These files can't be memory mapped, assert this. - if i.isStream && i.canMap { + // Non-seekable files can't be memory mapped, assert this. + if !i.seekable && i.canMap { panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } @@ -124,12 +127,12 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // isStream is true if the host fd points to a file representing a stream, + // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return // EWOULDBLOCK for I/O operations. // // This field is initialized at creation time and is immutable. - isStream bool + seekable bool // isTTY is true if this file represents a TTY. // @@ -481,8 +484,7 @@ func (f *fileDescription) Release() { // PRead implements FileDescriptionImpl. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } @@ -492,8 +494,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off // Read implements FileDescriptionImpl. func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags) if isBlockError(err) { // If we got any data at all, return it as a "completed" partial read @@ -538,8 +539,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off // PWrite implements FileDescriptionImpl. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } @@ -549,8 +549,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of // Write implements FileDescriptionImpl. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support offsets, e.g. /dev/null. - if i.isStream { + if !i.seekable { n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) if isBlockError(err) { err = syserror.ErrWouldBlock @@ -593,8 +592,7 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs // allow directory fds to be imported at all. func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) { i := f.inode - // TODO(b/34716638): Some char devices do support seeking, e.g. /dev/null. - if i.isStream { + if !i.seekable { return 0, syserror.ESPIPE } -- cgit v1.2.3 From 5a1324625f1d0d9ea2d4874f9d6d1008ec12f45e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 7 Apr 2020 18:26:34 -0700 Subject: Make unlink tests pass with goferfs Required directory checks were being skipped when there was no child cached. Now the code always loads the child file before unlinking it. Updates #1198 PiperOrigin-RevId: 305382323 --- pkg/sentry/fsimpl/gofer/filesystem.go | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 305228bda..137260898 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -437,14 +437,19 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b flags := uint32(0) if dir { if child != nil && !child.isDir() { + vfsObj.AbortDeleteDentry(childVFSD) return syserror.ENOTDIR } flags = linux.AT_REMOVEDIR } else { if child != nil && child.isDir() { + vfsObj.AbortDeleteDentry(childVFSD) return syserror.EISDIR } if rp.MustBeDir() { + if childVFSD != nil { + vfsObj.AbortDeleteDentry(childVFSD) + } return syserror.ENOTDIR } } -- cgit v1.2.3 From 928a7c60b8f02811e9c0fcbed0077efd55471cc4 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 25 Mar 2020 17:15:05 -0700 Subject: Fix all printf formatting errors. Updates #2243 --- pkg/eventchannel/event_test.go | 4 ++-- pkg/segment/test/segment_test.go | 2 +- pkg/sentry/fs/fdpipe/pipe_test.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/stat_test.go | 8 ++++---- pkg/tcpip/header/eth_test.go | 2 +- pkg/tcpip/header/ndp_test.go | 2 +- pkg/tcpip/link/fdbased/endpoint.go | 2 +- pkg/tcpip/stack/ndp_test.go | 12 ++++++------ pkg/tcpip/stack/stack_test.go | 32 ++++++++++++++++---------------- pkg/tcpip/tcpip_test.go | 2 +- pkg/tcpip/transport/tcp/tcp_test.go | 4 ++-- pkg/tcpip/transport/udp/udp_test.go | 2 +- runsc/container/test_app/test_app.go | 2 +- test/root/cgroup_test.go | 4 ++-- test/runtimes/blacklist_test.go | 2 +- test/runtimes/runner.go | 2 +- tools/nogo.json | 31 ------------------------------- 17 files changed, 43 insertions(+), 74 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/eventchannel/event_test.go b/pkg/eventchannel/event_test.go index 7f41b4a27..43750360b 100644 --- a/pkg/eventchannel/event_test.go +++ b/pkg/eventchannel/event_test.go @@ -78,7 +78,7 @@ func TestMultiEmitter(t *testing.T) { for _, name := range names { m := testMessage{name: name} if _, err := me.Emit(m); err != nil { - t.Fatal("me.Emit(%v) failed: %v", m, err) + t.Fatalf("me.Emit(%v) failed: %v", m, err) } } @@ -96,7 +96,7 @@ func TestMultiEmitter(t *testing.T) { // Close multiEmitter. if err := me.Close(); err != nil { - t.Fatal("me.Close() failed: %v", err) + t.Fatalf("me.Close() failed: %v", err) } // All testEmitters should be closed. diff --git a/pkg/segment/test/segment_test.go b/pkg/segment/test/segment_test.go index f19a005f3..97b16c158 100644 --- a/pkg/segment/test/segment_test.go +++ b/pkg/segment/test/segment_test.go @@ -63,7 +63,7 @@ func checkSet(s *Set, expectedSegments int) error { return fmt.Errorf("incorrect order: key %d (segment %d) >= key %d (segment %d)", prev, nrSegments-1, next, nrSegments) } if got, want := seg.Value(), seg.Start()+valueOffset; got != want { - return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start, got, want) + return fmt.Errorf("segment %d has key %d, value %d (expected %d)", nrSegments, seg.Start(), got, want) } prev = next havePrev = true diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index 5aff0cc95..a0082ecca 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -119,7 +119,7 @@ func TestNewPipe(t *testing.T) { continue } if flags := p.flags; test.flags != flags { - t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags) + t.Errorf("%s: got file flags %v, want %v", test.desc, flags, test.flags) continue } if len(test.readAheadBuffer) != len(p.readAheadBuffer) { @@ -136,7 +136,7 @@ func TestNewPipe(t *testing.T) { continue } if !fdnotifier.HasFD(int32(f.FD())) { - t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD) + t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD()) } } } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index 3e02e7190..d4f59ee5b 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -140,7 +140,7 @@ func TestSetStatAtime(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -155,7 +155,7 @@ func TestSetStatAtime(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) @@ -205,7 +205,7 @@ func TestSetStat(t *testing.T) { Mask: 0, Atime: linux.NsecToStatxTimestamp(100), }}); err != nil { - t.Errorf("SetStat atime without mask failed: %v") + t.Errorf("SetStat atime without mask failed: %v", err) } // Atime should be unchanged. if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { @@ -220,7 +220,7 @@ func TestSetStat(t *testing.T) { Atime: linux.NsecToStatxTimestamp(100), } if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil { - t.Errorf("SetStat atime with mask failed: %v") + t.Errorf("SetStat atime with mask failed: %v", err) } if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil { t.Errorf("Stat got error: %v", err) diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go index 7a0014ad9..14413f2ce 100644 --- a/pkg/tcpip/header/eth_test.go +++ b/pkg/tcpip/header/eth_test.go @@ -88,7 +88,7 @@ func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr { - t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", got, test.expectedLinkAddr) + t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", test.addr, got, test.expectedLinkAddr) } }) } diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go index 1cb9f5dc8..969f8f1e8 100644 --- a/pkg/tcpip/header/ndp_test.go +++ b/pkg/tcpip/header/ndp_test.go @@ -115,7 +115,7 @@ func TestNDPNeighborAdvert(t *testing.T) { // Make sure flags got updated in the backing buffer. if got := b[ndpNAFlagsOffset]; got != 64 { - t.Errorf("got flags byte = %d, want = 64") + t.Errorf("got flags byte = %d, want = 64", got) } } diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 7198742b7..b857ce9d0 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -91,7 +91,7 @@ func (p PacketDispatchMode) String() string { case PacketMMap: return "PacketMMap" default: - return fmt.Sprintf("unknown packet dispatch mode %v", p) + return fmt.Sprintf("unknown packet dispatch mode '%d'", p) } } diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index 27dc8baf9..bc822e3b1 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -1959,7 +1959,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { // addr2 is deprecated but if explicitly requested, it should be used. fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID} if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address) + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) } // Another PI w/ 0 preferred lifetime should not result in a deprecation @@ -1972,7 +1972,7 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { } expectPrimaryAddr(addr1) if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr2.Address) + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address) } // Refresh lifetimes of addr generated from prefix2. @@ -2084,7 +2084,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // addr1 is deprecated but if explicitly requested, it should be used. fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID} if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) } // Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make @@ -2097,7 +2097,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { } expectPrimaryAddr(addr2) if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) } // Refresh lifetimes for addr of prefix1. @@ -2121,7 +2121,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { // addr2 should be the primary endpoint now since it is not deprecated. expectPrimaryAddr(addr2) if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address { - t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", got, addr1.Address) + t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address) } // Wait for addr of prefix1 to be invalidated. @@ -2564,7 +2564,7 @@ func TestAutoGenAddrAfterRemoval(t *testing.T) { AddressWithPrefix: addr2, } if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil { - t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d, %s) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err) + t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err) } // addr2 should be more preferred now since it is at the front of the primary // list. diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 3f8a2a095..c7634ceb1 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -1445,19 +1445,19 @@ func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) { protoAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Any, 0}} if err := s.AddProtocolAddress(1, protoAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %s) failed: %s", protoAddr, err) + t.Fatalf("AddProtocolAddress(1, %v) failed: %v", protoAddr, err) } r, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */) if err != nil { - t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) + t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } if err := verifyRoute(r, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil { - t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) + t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err) } // If the NIC doesn't exist, it won't work. if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable { - t.Fatalf("got FindRoute(2, %s, %s, %d) = %s want = %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) + t.Fatalf("got FindRoute(2, %v, %v, %d) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable) } } @@ -1483,12 +1483,12 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { } nic1ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic1Addr} if err := s.AddProtocolAddress(1, nic1ProtoAddr); err != nil { - t.Fatalf("AddProtocolAddress(1, %s) failed: %s", nic1ProtoAddr, err) + t.Fatalf("AddProtocolAddress(1, %v) failed: %v", nic1ProtoAddr, err) } nic2ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic2Addr} if err := s.AddProtocolAddress(2, nic2ProtoAddr); err != nil { - t.Fatalf("AddAddress(2, %s) failed: %s", nic2ProtoAddr, err) + t.Fatalf("AddAddress(2, %v) failed: %v", nic2ProtoAddr, err) } // Set the initial route table. @@ -1503,10 +1503,10 @@ func TestOutgoingBroadcastWithRouteTable(t *testing.T) { // When an interface is given, the route for a broadcast goes through it. r, err := s.FindRoute(1, nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */) if err != nil { - t.Fatalf("FindRoute(1, %s, %s, %d) failed: %s", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) + t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) } if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil { - t.Errorf("FindRoute(1, %s, %s, %d) returned unexpected Route: %s)", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) + t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err) } // When an interface is not given, it consults the route table. @@ -2399,7 +2399,7 @@ func TestNICContextPreservation(t *testing.T) { t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, ok, nicinfos) } if got, want := nicinfo.Context == test.want, true; got != want { - t.Fatal("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want) + t.Fatalf("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", got, want, nicinfo.Context, test.want) } }) } @@ -2768,7 +2768,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { { subnet, err := tcpip.NewSubnet("\x00", "\x00") if err != nil { - t.Fatalf("NewSubnet failed:", err) + t.Fatalf("NewSubnet failed: %v", err) } s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}}) } @@ -2782,11 +2782,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // permanentExpired kind. r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false) if err != nil { - t.Fatal("FindRoute failed:", err) + t.Fatalf("FindRoute failed: %v", err) } defer r.Release() if err := s.RemoveAddress(1, "\x01"); err != nil { - t.Fatalf("RemoveAddress failed:", err) + t.Fatalf("RemoveAddress failed: %v", err) } // @@ -2798,7 +2798,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // Add some other address with peb set to // FirstPrimaryEndpoint. if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil { - t.Fatal("AddAddressWithOptions failed:", err) + t.Fatalf("AddAddressWithOptions failed: %v", err) } @@ -2806,7 +2806,7 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // make sure the new peb was respected. // (The address should just be promoted now). if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil { - t.Fatal("AddAddressWithOptions failed:", err) + t.Fatalf("AddAddressWithOptions failed: %v", err) } var primaryAddrs []tcpip.Address for _, pa := range s.NICInfo()[1].ProtocolAddresses { @@ -2839,11 +2839,11 @@ func TestNewPEBOnPromotionToPermanent(t *testing.T) { // GetMainNICAddress; else, our original address // should be returned. if err := s.RemoveAddress(1, "\x03"); err != nil { - t.Fatalf("RemoveAddress failed:", err) + t.Fatalf("RemoveAddress failed: %v", err) } addr, err = s.GetMainNICAddress(1, fakeNetNumber) if err != nil { - t.Fatal("s.GetMainNICAddress failed:", err) + t.Fatalf("s.GetMainNICAddress failed: %v", err) } if ps == stack.NeverPrimaryEndpoint { if want := (tcpip.AddressWithPrefix{}); addr != want { diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go index 8c0aacffa..1c8e2bc34 100644 --- a/pkg/tcpip/tcpip_test.go +++ b/pkg/tcpip/tcpip_test.go @@ -218,7 +218,7 @@ func TestAddressWithPrefixSubnet(t *testing.T) { gotSubnet := ap.Subnet() wantSubnet, err := NewSubnet(tt.subnetAddr, tt.subnetMask) if err != nil { - t.Error("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err) + t.Errorf("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err) continue } if gotSubnet != wantSubnet { diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index ce3df7478..8a87fa0a8 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -284,7 +284,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { // are released instantly on Close. tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil { - t.Fatalf("e.stack.SetTransportProtocolOption(%d, %s) = %s", tcp.ProtocolNumber, tcpTW, err) + t.Fatalf("e.stack.SetTransportProtocolOption(%d, %v) = %v", tcp.ProtocolNumber, tcpTW, err) } c.EP.Close() @@ -5609,7 +5609,7 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { return } if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) { - t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w, wantRcvWnd) + t.Errorf("expected a non-zero window: got %d, want <= wantRcvWnd", w) } }, )) diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 0905726c1..b3fe557ca 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -607,7 +607,7 @@ func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expe // Check the peer address. h := flow.header4Tuple(incoming) if addr.Addr != h.srcAddr.Addr { - c.t.Fatalf("unexpected remote address: got %s, want %s", addr.Addr, h.srcAddr) + c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr) } // Check the payload. diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go index 01c47c79f..5f1c4b7d6 100644 --- a/runsc/container/test_app/test_app.go +++ b/runsc/container/test_app/test_app.go @@ -96,7 +96,7 @@ func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) listener, err := net.Listen("unix", c.socketPath) if err != nil { - log.Fatal("error listening on socket %q:", c.socketPath, err) + log.Fatalf("error listening on socket %q: %v", c.socketPath, err) } go server(listener, outputFile) diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go index 4038661cb..679342def 100644 --- a/test/root/cgroup_test.go +++ b/test/root/cgroup_test.go @@ -53,7 +53,7 @@ func verifyPid(pid int, path string) error { if scanner.Err() != nil { return scanner.Err() } - return fmt.Errorf("got: %s, want: %d", gots, pid) + return fmt.Errorf("got: %v, want: %d", gots, pid) } // TestCgroup sets cgroup options and checks that cgroup was properly configured. @@ -106,7 +106,7 @@ func TestMemCGroup(t *testing.T) { time.Sleep(100 * time.Millisecond) } - t.Fatalf("%vMB is less than %vMB: %v", memUsage>>20, allocMemSize>>20) + t.Fatalf("%vMB is less than %vMB", memUsage>>20, allocMemSize>>20) } // TestCgroup sets cgroup options and checks that cgroup was properly configured. diff --git a/test/runtimes/blacklist_test.go b/test/runtimes/blacklist_test.go index 52f49b984..0ff69ab18 100644 --- a/test/runtimes/blacklist_test.go +++ b/test/runtimes/blacklist_test.go @@ -32,6 +32,6 @@ func TestBlacklists(t *testing.T) { t.Fatalf("error parsing blacklist: %v", err) } if *blacklistFile != "" && len(bl) == 0 { - t.Errorf("got empty blacklist for file %q", blacklistFile) + t.Errorf("got empty blacklist for file %q", *blacklistFile) } } diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go index ddb890dbc..3c98f4570 100644 --- a/test/runtimes/runner.go +++ b/test/runtimes/runner.go @@ -114,7 +114,7 @@ func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.Int F: func(t *testing.T) { // Is the test blacklisted? if _, ok := blacklist[tc]; ok { - t.Skip("SKIP: blacklisted test %q", tc) + t.Skipf("SKIP: blacklisted test %q", tc) } var ( diff --git a/tools/nogo.json b/tools/nogo.json index cc05ba027..f69999e50 100644 --- a/tools/nogo.json +++ b/tools/nogo.json @@ -27,37 +27,6 @@ "/external/io_opencensus_go/tag/map_codec.go": "allowed: false positive" } }, - "printf": { - "exclude_files": { - ".*_abi_autogen_test.go": "fix: Sprintf format has insufficient args", - "/pkg/segment/test/segment_test.go": "fix: Errorf format %d arg seg.Start is a func value, not called", - "/pkg/tcpip/tcpip_test.go": "fix: Error call has possible formatting directive %q", - "/pkg/tcpip/header/eth_test.go": "fix: Fatalf format %s reads arg #3, but call has 2 args", - "/pkg/tcpip/header/ndp_test.go": "fix: Errorf format %d reads arg #1, but call has 0 args", - "/pkg/eventchannel/event_test.go": "fix: Fatal call has possible formatting directive %v", - "/pkg/tcpip/stack/ndp.go": "fix: Fatalf format %s has arg protocolAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress", - "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %s has arg flags of wrong type gvisor.dev/gvisor/pkg/sentry/fs.FileFlags", - "/pkg/sentry/fs/fdpipe/pipe_test.go": "fix: Errorf format %d arg f.FD is a func value, not called", - "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call", - "/pkg/tcpip/transport/udp/udp_test.go": "fix: Fatalf format %s has arg h.srcAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.FullAddress", - "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Fatalf format %s has arg tcpTW of wrong type gvisor.dev/gvisor/pkg/tcpip.TCPTimeWaitTimeoutOption", - "/pkg/tcpip/transport/tcp/tcp_test.go": "fix: Errorf call needs 1 arg but has 2 args", - "/pkg/tcpip/stack/ndp_test.go": "fix: Errorf format %s reads arg #3, but call has 2 args", - "/pkg/tcpip/stack/ndp_test.go": "fix: Fatalf format %s reads arg #5, but call has 4 args", - "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg protoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress", - "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic1ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress", - "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf format %s has arg nic2ProtoAddr of wrong type gvisor.dev/gvisor/pkg/tcpip.ProtocolAddress", - "/pkg/tcpip/stack/stack_test.go": "fix: Fatal call has possible formatting directive %t", - "/pkg/tcpip/stack/stack_test.go": "fix: Fatalf call has arguments but no formatting directives", - "/pkg/tcpip/link/fdbased/endpoint.go": "fix: Sprintf format %v with arg p causes recursive String method call", - "/pkg/sentry/fsimpl/tmpfs/stat_test.go": "fix: Errorf format %v reads arg #1, but call has 0 args", - "/runsc/container/test_app/test_app.go": "fix: Fatal call has possible formatting directive %q", - "/test/root/cgroup_test.go": "fix: Errorf format %s has arg gots of wrong type []int", - "/test/root/cgroup_test.go": "fix: Fatalf format %v reads arg #3, but call has 2 args", - "/test/runtimes/runner.go": "fix: Skip call has possible formatting directive %q", - "/test/runtimes/blacklist_test.go": "fix: Errorf format %q has arg blacklistFile of wrong type *string" - } - }, "structtag": { "exclude_files": { "/external/": "allowed: may use arbitrary tags" -- cgit v1.2.3 From f888b9ce83c202bb77c1e29c3ee60cc485906536 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Wed, 25 Mar 2020 16:57:37 -0700 Subject: Fix unused result errors. This fixes a bug in the proc net directory. Updates #2243 --- pkg/sentry/fsimpl/proc/task_net.go | 4 ++-- tools/nogo.json | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6b2a77328..6595fcee6 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -688,9 +688,9 @@ func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { if line.prefix == "Tcp" { tcp := stat.(*inet.StatSNMPTCP) // "Tcp" needs special processing because MaxConn is signed. RFC 2012. - fmt.Sprintf("%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) + fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) } else { - fmt.Sprintf("%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) + fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) } } return nil diff --git a/tools/nogo.json b/tools/nogo.json index f69999e50..09bda9212 100644 --- a/tools/nogo.json +++ b/tools/nogo.json @@ -44,11 +44,5 @@ "/pkg/sentry/platform/safecopy/safecopy_unsafe.go": "allowed: special case", "/pkg/sentry/vfs/mount_unsafe.go": "allowed: special case" } - }, - "unusedresult": { - "exclude_files": { - "/pkg/sentry/fsimpl/proc/task_net.go": "fix: result of fmt.Sprintf call not used", - "/pkg/sentry/fsimpl/proc/tasks_net.go": "fix: result of fmt.Sprintf call not used" - } } } -- cgit v1.2.3 From b30130567d81157e39b692e0116f86015f0bcc71 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Apr 2020 13:33:44 -0700 Subject: Enable SubprocessExited and SubprocessZombie for gVisor Updates #164 PiperOrigin-RevId: 305544029 --- pkg/sentry/fs/proc/task.go | 28 ++++++++++++++++-- pkg/sentry/fsimpl/proc/task.go | 16 ----------- pkg/sentry/fsimpl/proc/task_files.go | 56 ++++++++++++++++++++++++++++++++++-- test/syscalls/linux/proc.cc | 31 ++++++++------------ 4 files changed, 90 insertions(+), 41 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index d6c5dd2c1..4d42eac83 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -57,6 +57,16 @@ func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { return m, nil } +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + // taskDir represents a task-level directory. // // +stateify savable @@ -254,11 +264,12 @@ func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { } func (e *exe) executable() (file fsbridge.File, err error) { + if err := checkTaskState(e.t); err != nil { + return nil, err + } e.t.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO(b/34851096): Check shouldn't allow Readlink once the - // Task is zombied. err = syserror.EACCES return } @@ -268,7 +279,7 @@ func (e *exe) executable() (file fsbridge.File, err error) { // (with locks held). file = mm.Executable() if file == nil { - err = syserror.ENOENT + err = syserror.ESRCH } }) return @@ -313,11 +324,22 @@ func newNamespaceSymlink(t *kernel.Task, msrc *fs.MountSource, name string) *fs. return newProcInode(t, n, msrc, fs.Symlink, t) } +// Readlink reads the symlink value. +func (n *namespaceSymlink) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if err := checkTaskState(n.t); err != nil { + return "", err + } + return n.Symlink.Readlink(ctx, inode) +} + // Getlink implements fs.InodeOperations.Getlink. func (n *namespaceSymlink) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { if !kernel.ContextCanTrace(ctx, n.t, false) { return nil, syserror.EACCES } + if err := checkTaskState(n.t); err != nil { + return nil, err + } // Create a new regular file to fake the namespace file. iops := fsutil.NewNoReadWriteFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0777), linux.PROC_SUPER_MAGIC) diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index aee2a4392..888afc0fd 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -214,22 +214,6 @@ func newIO(t *kernel.Task, isThreadGroup bool) *ioData { return &ioData{ioUsage: t} } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { - // Namespace symlinks should contain the namespace name and the inode number - // for the namespace instance, so for example user:[123456]. We currently fake - // the inode number by sticking the symlink inode in its place. - target := fmt.Sprintf("%s:[%d]", ns, ino) - - inode := &kernfs.StaticSymlink{} - // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) - - taskInode := &taskOwnedInode{Inode: inode, owner: task} - d := &kernfs.Dentry{} - d.Init(taskInode) - return d -} - // newCgroupData creates inode that shows cgroup information. // From man 7 cgroups: "For each cgroup hierarchy of which the process is a // member, there is one entry containing three colon-separated fields: diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 88ea6a6d8..2c6f8bdfc 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -64,6 +64,16 @@ func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) { return m, nil } +func checkTaskState(t *kernel.Task) error { + switch t.ExitState() { + case kernel.TaskExitZombie: + return syserror.EACCES + case kernel.TaskExitDead: + return syserror.ESRCH + } + return nil +} + type bufferWriter struct { buf *bytes.Buffer } @@ -628,11 +638,13 @@ func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, er } func (s *exeSymlink) executable() (file fsbridge.File, err error) { + if err := checkTaskState(s.task); err != nil { + return nil, err + } + s.task.WithMuLocked(func(t *kernel.Task) { mm := t.MemoryManager() if mm == nil { - // TODO(b/34851096): Check shouldn't allow Readlink once the - // Task is zombied. err = syserror.EACCES return } @@ -642,7 +654,7 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { // (with locks held). file = mm.Executable() if file == nil { - err = syserror.ENOENT + err = syserror.ESRCH } }) return @@ -709,3 +721,41 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } + +type namespaceSymlink struct { + kernfs.StaticSymlink + + task *kernel.Task +} + +func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { + // Namespace symlinks should contain the namespace name and the inode number + // for the namespace instance, so for example user:[123456]. We currently fake + // the inode number by sticking the symlink inode in its place. + target := fmt.Sprintf("%s:[%d]", ns, ino) + + inode := &namespaceSymlink{task: task} + // Note: credentials are overridden by taskOwnedInode. + inode.Init(task.Credentials(), ino, target) + + taskInode := &taskOwnedInode{Inode: inode, owner: task} + d := &kernfs.Dentry{} + d.Init(taskInode) + return d +} + +// Readlink implements Inode. +func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { + if err := checkTaskState(s.task); err != nil { + return "", err + } + return s.StaticSymlink.Readlink(ctx) +} + +// Getlink implements Inode.Getlink. +func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { + if err := checkTaskState(s.task); err != nil { + return vfs.VirtualDentry{}, "", err + } + return s.StaticSymlink.Getlink(ctx) +} diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index 5a70f6c3b..da98e1f66 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -1326,8 +1326,6 @@ TEST(ProcPidSymlink, SubprocessRunning) { SyscallSucceedsWithValue(sizeof(buf))); } -// FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux -// on proc files. TEST(ProcPidSymlink, SubprocessZombied) { ASSERT_NO_ERRNO(SetCapability(CAP_DAC_OVERRIDE, false)); ASSERT_NO_ERRNO(SetCapability(CAP_DAC_READ_SEARCH, false)); @@ -1337,7 +1335,7 @@ TEST(ProcPidSymlink, SubprocessZombied) { int want = EACCES; if (!IsRunningOnGvisor()) { auto version = ASSERT_NO_ERRNO_AND_VALUE(GetKernelVersion()); - if (version.major == 4 && version.minor > 3) { + if (version.major > 4 || (version.major == 4 && version.minor > 3)) { want = ENOENT; } } @@ -1350,30 +1348,25 @@ TEST(ProcPidSymlink, SubprocessZombied) { SyscallFailsWithErrno(want)); } - // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux - // on proc files. + // FIXME(gvisor.dev/issue/164): Inconsistent behavior between linux on proc + // files. // // ~4.3: Syscall fails with EACCES. - // 4.17 & gVisor: Syscall succeeds and returns 1. + // 4.17: Syscall succeeds and returns 1. // - // EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), - // SyscallFailsWithErrno(EACCES)); + if (!IsRunningOnGvisor()) { + return; + } - // FIXME(gvisor.dev/issue/164): Inconsistent behavior between gVisor and linux - // on proc files. - // - // ~4.3: Syscall fails with EACCES. - // 4.17 & gVisor: Syscall succeeds and returns 1. - // - // EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), - // SyscallFailsWithErrno(EACCES)); + EXPECT_THAT(ReadlinkWhileZombied("ns/pid", buf, sizeof(buf)), + SyscallFailsWithErrno(want)); + + EXPECT_THAT(ReadlinkWhileZombied("ns/user", buf, sizeof(buf)), + SyscallFailsWithErrno(want)); } // Test whether /proc/PID/ symlinks can be read for an exited process. TEST(ProcPidSymlink, SubprocessExited) { - // FIXME(gvisor.dev/issue/164): These all succeed on gVisor. - SKIP_IF(IsRunningOnGvisor()); - char buf[1]; EXPECT_THAT(ReadlinkWhileExited("exe", buf, sizeof(buf)), -- cgit v1.2.3 From d8c0c1d9d5f30f4cfe8e0adedd264b10aff793f7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 8 Apr 2020 13:39:02 -0700 Subject: Do not hold FileDescription references in VFS2 procfs inodes. FileDescription references are side-effectual; for example, holding a reference on the write end of a pipe prevents reads from the read end from returning EOF. This change is consistent with Linux, but not VFS1; while VFS1 also has this bug, it's less visible there since VFS1 procfs disables caching. Updates #1195 PiperOrigin-RevId: 305545099 --- pkg/sentry/fsimpl/proc/task_fds.go | 125 ++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 58 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 9c8656b28..046265eca 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -30,34 +30,35 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -type fdDir struct { - inoGen InoGenerator - task *kernel.Task - - // When produceSymlinks is set, dirents produces for the FDs are reported - // as symlink. Otherwise, they are reported as regular files. - produceSymlink bool -} - -func (i *fdDir) lookup(name string) (*vfs.FileDescription, kernel.FDFlags, error) { - fd, err := strconv.ParseUint(name, 10, 64) - if err != nil { - return nil, kernel.FDFlags{}, syserror.ENOENT - } - +func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) { var ( file *vfs.FileDescription flags kernel.FDFlags ) - i.task.WithMuLocked(func(t *kernel.Task) { - if fdTable := t.FDTable(); fdTable != nil { - file, flags = fdTable.GetVFS2(int32(fd)) + t.WithMuLocked(func(t *kernel.Task) { + if fdt := t.FDTable(); fdt != nil { + file, flags = fdt.GetVFS2(fd) } }) + return file, flags +} + +func taskFDExists(t *kernel.Task, fd int32) bool { + file, _ := getTaskFD(t, fd) if file == nil { - return nil, kernel.FDFlags{}, syserror.ENOENT + return false } - return file, flags, nil + file.DecRef() + return true +} + +type fdDir struct { + inoGen InoGenerator + task *kernel.Task + + // When produceSymlinks is set, dirents produces for the FDs are reported + // as symlink. Otherwise, they are reported as regular files. + produceSymlink bool } // IterDirents implements kernfs.inodeDynamicLookup. @@ -128,11 +129,15 @@ func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // Lookup implements kernfs.inodeDynamicLookup. func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - file, _, err := i.lookup(name) + fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, err + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT } - taskDentry := newFDSymlink(i.task.Credentials(), file, i.inoGen.NextIno()) + taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) return taskDentry.VFSDentry(), nil } @@ -169,19 +174,22 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia // // +stateify savable type fdSymlink struct { - refs.AtomicRefCount kernfs.InodeAttrs + kernfs.InodeNoopRefCount kernfs.InodeSymlink - file *vfs.FileDescription + task *kernel.Task + fd int32 } var _ kernfs.Inode = (*fdSymlink)(nil) -func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64) *kernfs.Dentry { - file.IncRef() - inode := &fdSymlink{file: file} - inode.Init(creds, ino, linux.ModeSymlink|0777) +func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { + inode := &fdSymlink{ + task: task, + fd: fd, + } + inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -189,29 +197,27 @@ func newFDSymlink(creds *auth.Credentials, file *vfs.FileDescription, ino uint64 } func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return "", syserror.ENOENT + } + defer file.DecRef() root := vfs.RootFromContext(ctx) defer root.DecRef() - - vfsObj := s.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() - return vfsObj.PathnameWithDeleted(ctx, root, s.file.VirtualDentry()) + return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { - vd := s.file.VirtualDentry() + file, _ := getTaskFD(s.task, s.fd) + if file == nil { + return vfs.VirtualDentry{}, "", syserror.ENOENT + } + defer file.DecRef() + vd := file.VirtualDentry() vd.IncRef() return vd, "", nil } -func (s *fdSymlink) DecRef() { - s.AtomicRefCount.DecRefWithDestructor(func() { - s.Destroy() - }) -} - -func (s *fdSymlink) Destroy() { - s.file.DecRef() -} - // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. // // +stateify savable @@ -244,12 +250,18 @@ func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // Lookup implements kernfs.inodeDynamicLookup. func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { - file, flags, err := i.lookup(name) + fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { - return nil, err + return nil, syserror.ENOENT + } + fd := int32(fdInt) + if !taskFDExists(i.task, fd) { + return nil, syserror.ENOENT + } + data := &fdInfoData{ + task: i.task, + fd: fd, } - - data := &fdInfoData{file: file, flags: flags} dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) return dentry.VFSDentry(), nil } @@ -268,26 +280,23 @@ type fdInfoData struct { kernfs.DynamicBytesFile refs.AtomicRefCount - file *vfs.FileDescription - flags kernel.FDFlags + task *kernel.Task + fd int32 } var _ dynamicInode = (*fdInfoData)(nil) -func (d *fdInfoData) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) -} - -func (d *fdInfoData) destroy() { - d.file.DecRef() -} - // Generate implements vfs.DynamicBytesSource.Generate. func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { + file, descriptorFlags := getTaskFD(d.task, d.fd) + if file == nil { + return syserror.ENOENT + } + defer file.DecRef() // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt - flags := uint(d.file.StatusFlags()) | d.flags.ToLinuxFileFlags() + flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } -- cgit v1.2.3 From 6dd5a1f3fe55daa8510b1ee5e3a59219aad92af6 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Apr 2020 17:56:55 -0700 Subject: Clean up TODOs PiperOrigin-RevId: 305592245 --- pkg/sentry/fs/tmpfs/fs.go | 3 --- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/kernel/ptrace.go | 1 - pkg/sentry/vfs/filesystem.go | 2 +- pkg/sentry/vfs/mount.go | 12 ++++++------ pkg/sentry/vfs/mount_test.go | 2 +- runsc/cmd/gofer.go | 5 ++--- test/syscalls/linux/epoll.cc | 4 ---- test/syscalls/linux/file_base.h | 1 + test/syscalls/linux/pwrite64.cc | 9 +-------- test/syscalls/linux/tuntap.cc | 7 ++++--- test/syscalls/linux/write.cc | 10 ++-------- tools/go_generics/defs.bzl | 1 - 13 files changed, 19 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go index d5be56c3f..bc117ca6a 100644 --- a/pkg/sentry/fs/tmpfs/fs.go +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -44,9 +44,6 @@ const ( // lookup. cacheRevalidate = "revalidate" - // TODO(edahlgren/mpratt): support a tmpfs size limit. - // size = "size" - // Permissions that exceed modeMask will be rejected. modeMask = 01777 diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 16a3c18ae..4433071aa 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -682,7 +682,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } - // TODO: actually implement statfs + // TODO(gvisor.dev/issue/1193): actually implement statfs. return linux.Statfs{}, syserror.ENOSYS } diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go index 35ad97d5d..e23e796ef 100644 --- a/pkg/sentry/kernel/ptrace.go +++ b/pkg/sentry/kernel/ptrace.go @@ -184,7 +184,6 @@ func (t *Task) CanTrace(target *Task, attach bool) bool { if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { return false } - // TODO: Yama LSM return true } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index cd34782ff..bef1bd312 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -497,7 +497,7 @@ type FilesystemImpl interface { // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - // TODO: inotify_add_watch() + // TODO(gvisor.dev/issue/1479): inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 1b8ecc415..f06946103 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -233,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia } vd.dentry.mu.Lock() } - // TODO: Linux requires that either both the mount point and the mount root - // are directories, or neither are, and returns ENOTDIR if this is not the - // case. + // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount + // point and the mount root are directories, or neither are, and returns + // ENOTDIR if this is not the case. mntns := vd.mount.ns mnt := newMount(vfs, fs, root, mntns, opts) vfs.mounts.seq.BeginWrite() @@ -274,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } } - // TODO(jamieliu): Linux special-cases umount of the caller's root, which - // we don't implement yet (we'll just fail it since the caller holds a - // reference on it). + // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's + // root, which we don't implement yet (we'll just fail it since the caller + // holds a reference on it). vfs.mounts.seq.BeginWrite() if opts.Flags&linux.MNT_DETACH == 0 { diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go index 3b933468d..3335e4057 100644 --- a/pkg/sentry/vfs/mount_test.go +++ b/pkg/sentry/vfs/mount_test.go @@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) { } } -// TODO: concurrent lookup/insertion/removal +// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal. // must be powers of 2 var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 02e5af3d3..28f0d54b9 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -272,9 +272,8 @@ func setupRootFS(spec *specs.Spec, conf *boot.Config) error { root := spec.Root.Path if !conf.TestOnlyAllowRunAsCurrentUserWithoutChroot { - // FIXME: runsc can't be re-executed without - // /proc, so we create a tmpfs mount, mount ./proc and ./root - // there, then move this mount to the root and after + // runsc can't be re-executed without /proc, so we create a tmpfs mount, + // mount ./proc and ./root there, then move this mount to the root and after // setCapsAndCallSelf, runsc will chroot into /root. // // We need a directory to construct a new root and we know that diff --git a/test/syscalls/linux/epoll.cc b/test/syscalls/linux/epoll.cc index a4f8f3cec..f57d38dc7 100644 --- a/test/syscalls/linux/epoll.cc +++ b/test/syscalls/linux/epoll.cc @@ -56,10 +56,6 @@ TEST(EpollTest, AllWritable) { struct epoll_event result[kFDsPerEpoll]; ASSERT_THAT(RetryEINTR(epoll_wait)(epollfd.get(), result, kFDsPerEpoll, -1), SyscallSucceedsWithValue(kFDsPerEpoll)); - // TODO(edahlgren): Why do some tests check epoll_event::data, and others - // don't? Does Linux actually guarantee that, in any of these test cases, - // epoll_wait will necessarily write out the epoll_events in the order that - // they were registered? for (int i = 0; i < kFDsPerEpoll; i++) { ASSERT_EQ(result[i].events, EPOLLOUT); } diff --git a/test/syscalls/linux/file_base.h b/test/syscalls/linux/file_base.h index 25fdd7106..fb418e052 100644 --- a/test/syscalls/linux/file_base.h +++ b/test/syscalls/linux/file_base.h @@ -87,6 +87,7 @@ class FileTest : public ::testing::Test { ClosePipes(); } + protected: std::string test_file_name_; FileDescriptor test_file_fd_; diff --git a/test/syscalls/linux/pwrite64.cc b/test/syscalls/linux/pwrite64.cc index b48fe540d..c2f72e010 100644 --- a/test/syscalls/linux/pwrite64.cc +++ b/test/syscalls/linux/pwrite64.cc @@ -27,14 +27,7 @@ namespace testing { namespace { -// This test is currently very rudimentary. -// -// TODO(edahlgren): -// * bad buffer states (EFAULT). -// * bad fds (wrong permission, wrong type of file, EBADF). -// * check offset is not incremented. -// * check for EOF. -// * writing to pipes, symlinks, special files. +// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary. class Pwrite64 : public ::testing::Test { void SetUp() override { name_ = NewTempAbsPath(); diff --git a/test/syscalls/linux/tuntap.cc b/test/syscalls/linux/tuntap.cc index 53ad2dda3..3a8ba37eb 100644 --- a/test/syscalls/linux/tuntap.cc +++ b/test/syscalls/linux/tuntap.cc @@ -242,7 +242,7 @@ TEST_F(TuntapTest, InvalidReadWrite) { TEST_F(TuntapTest, WriteToDownDevice) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_NET_ADMIN))); - // FIXME: gVisor always creates enabled/up'd interfaces. + // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. SKIP_IF(IsRunningOnGvisor()); FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(kDevNetTun, O_RDWR)); @@ -280,10 +280,11 @@ PosixErrorOr OpenAndAttachTap( &addr, sizeof(addr))); if (!IsRunningOnGvisor()) { - // FIXME: gVisor doesn't support setting MAC address on interfaces yet. + // FIXME(b/110961832): gVisor doesn't support setting MAC address on + // interfaces yet. RETURN_IF_ERRNO(LinkSetMacAddr(link->index, kMacA, sizeof(kMacA))); - // FIXME: gVisor always creates enabled/up'd interfaces. + // FIXME(b/110961832): gVisor always creates enabled/up'd interfaces. RETURN_IF_ERRNO(LinkChangeFlags(link->index, IFF_UP, IFF_UP)); } diff --git a/test/syscalls/linux/write.cc b/test/syscalls/linux/write.cc index 9b219cfd6..39b5b2f56 100644 --- a/test/syscalls/linux/write.cc +++ b/test/syscalls/linux/write.cc @@ -31,14 +31,8 @@ namespace gvisor { namespace testing { namespace { -// This test is currently very rudimentary. -// -// TODO(edahlgren): -// * bad buffer states (EFAULT). -// * bad fds (wrong permission, wrong type of file, EBADF). -// * check offset is incremented. -// * check for EOF. -// * writing to pipes, symlinks, special files. + +// TODO(gvisor.dev/issue/2370): This test is currently very rudimentary. class WriteTest : public ::testing::Test { public: ssize_t WriteBytes(int fd, int bytes) { diff --git a/tools/go_generics/defs.bzl b/tools/go_generics/defs.bzl index c5be52ecd..8c9995fd4 100644 --- a/tools/go_generics/defs.bzl +++ b/tools/go_generics/defs.bzl @@ -105,7 +105,6 @@ def _go_template_instance_impl(ctx): executable = ctx.executable._tool, ) - # TODO: How can we get the dependencies out? return struct( files = depset([output]), ) -- cgit v1.2.3 From 96f914295920404e7c5c97553771e09b31f6900a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 10 Apr 2020 15:46:16 -0700 Subject: Use O_CLOEXEC when dup'ing FDs The sentry doesn't allow execve, but it's a good defense in-depth measure. PiperOrigin-RevId: 305958737 --- pkg/sentry/fs/gofer/inode.go | 2 +- pkg/sentry/fsimpl/gofer/gofer.go | 2 +- runsc/boot/filter/config.go | 2 +- runsc/main.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 811e8ea30..a016c896e 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -273,7 +273,7 @@ func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handle // operations on the old will see the new data. Then, make the new handle take // ownereship of the old FD and mark the old readHandle to not close the FD // when done. - if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), 0); err != nil { + if err := syscall.Dup3(h.Host.FD(), i.readHandles.Host.FD(), syscall.O_CLOEXEC); err != nil { return err } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 20edaf643..bdf11fa65 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1089,7 +1089,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // description, but this doesn't matter since they refer to the // same file (unless d.fs.opts.overlayfsStaleRead is true, // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), 0); err != nil { + if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) h.close(ctx) diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 06b9f888a..1828d116a 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -44,7 +44,7 @@ var allowedSyscalls = seccomp.SyscallRules{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(0), + seccomp.AllowValue(syscall.O_CLOEXEC), }, }, syscall.SYS_EPOLL_CREATE1: {}, diff --git a/runsc/main.go b/runsc/main.go index c1c78529c..59f624842 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -291,7 +291,7 @@ func main() { // want with them. Since Docker and Containerd both eat boot's stderr, we // dup our stderr to the provided log FD so that panics will appear in the // logs, rather than just disappear. - if err := syscall.Dup3(fd, int(os.Stderr.Fd()), 0); err != nil { + if err := syscall.Dup3(fd, int(os.Stderr.Fd()), syscall.O_CLOEXEC); err != nil { cmd.Fatalf("error dup'ing fd %d to stderr: %v", fd, err) } } -- cgit v1.2.3 From 09ddb5a4262c39744643b612109dd12dcce176a8 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 10 Apr 2020 19:01:39 -0700 Subject: Port extended attributes to VFS2. As in VFS1, we only support the user.* namespace. Plumbing is added to tmpfs and goferfs. Note that because of the slightly different order of checks between VFS2 and Linux, one of the xattr tests needs to be relaxed slightly. Fixes #2363. PiperOrigin-RevId: 305985121 --- pkg/sentry/fsimpl/ext/filesystem.go | 4 +- pkg/sentry/fsimpl/gofer/filesystem.go | 12 ++-- pkg/sentry/fsimpl/gofer/gofer.go | 58 +++++++++++---- pkg/sentry/fsimpl/gofer/p9file.go | 14 ++++ pkg/sentry/fsimpl/kernfs/filesystem.go | 4 +- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 24 +++---- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 77 ++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/xattr.go | 13 ++-- pkg/sentry/vfs/anonfs.go | 4 +- pkg/sentry/vfs/file_description.go | 32 ++++++--- pkg/sentry/vfs/file_description_impl_util.go | 4 +- pkg/sentry/vfs/filesystem.go | 24 ++++++- pkg/sentry/vfs/memxattr/BUILD | 15 ++++ pkg/sentry/vfs/memxattr/xattr.go | 102 +++++++++++++++++++++++++++ pkg/sentry/vfs/options.go | 14 ++++ pkg/sentry/vfs/vfs.go | 8 +-- test/syscalls/linux/xattr.cc | 8 +-- 18 files changed, 350 insertions(+), 68 deletions(-) create mode 100644 pkg/sentry/vfs/memxattr/BUILD create mode 100644 pkg/sentry/vfs/memxattr/xattr.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 48eaccdbc..afea58f65 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -476,7 +476,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err @@ -485,7 +485,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { _, _, err := fs.walk(rp, false) if err != nil { return "", err diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 137260898..cd744bf5e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1080,7 +1080,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1088,11 +1088,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ if err != nil { return nil, err } - return d.listxattr(ctx) + return d.listxattr(ctx, rp.Credentials(), size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) @@ -1100,7 +1100,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam if err != nil { return "", err } - return d.getxattr(ctx, name) + return d.getxattr(ctx, rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. @@ -1112,7 +1112,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.setxattr(ctx, &opts) + return d.setxattr(ctx, rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -1124,7 +1124,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.removexattr(ctx, name) + return d.removexattr(ctx, rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index bdf11fa65..2485cdb53 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -34,6 +34,7 @@ package gofer import ( "fmt" "strconv" + "strings" "sync" "sync/atomic" "syscall" @@ -1024,21 +1025,50 @@ func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } -func (d *dentry) listxattr(ctx context.Context) ([]string, error) { - return nil, syserror.ENOTSUP +// We only support xattrs prefixed with "user." (see b/148380782). Currently, +// there is no need to expose any other xattrs through a gofer. +func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + xattrMap, err := d.file.listXattr(ctx, size) + if err != nil { + return nil, err + } + xattrs := make([]string, 0, len(xattrMap)) + for x := range xattrMap { + if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) { + xattrs = append(xattrs, x) + } + } + return xattrs, nil } -func (d *dentry) getxattr(ctx context.Context, name string) (string, error) { - // TODO(jamieliu): add vfs.GetxattrOptions.Size - return d.file.getXattr(ctx, name, linux.XATTR_SIZE_MAX) +func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := d.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + return d.file.getXattr(ctx, opts.Name, opts.Size) } -func (d *dentry) setxattr(ctx context.Context, opts *vfs.SetxattrOptions) error { +func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } -func (d *dentry) removexattr(ctx context.Context, name string) error { - return syserror.ENOTSUP +func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + return d.file.removeXattr(ctx, name) } // Preconditions: d.isRegularFile() || d.isDirectory(). @@ -1189,21 +1219,21 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. -func (fd *fileDescription) Listxattr(ctx context.Context) ([]string, error) { - return fd.dentry().listxattr(ctx) +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size) } // Getxattr implements vfs.FileDescriptionImpl.Getxattr. -func (fd *fileDescription) Getxattr(ctx context.Context, name string) (string, error) { - return fd.dentry().getxattr(ctx, name) +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, &opts) + return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, name) + return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go index 755ac2985..87f0b877f 100644 --- a/pkg/sentry/fsimpl/gofer/p9file.go +++ b/pkg/sentry/fsimpl/gofer/p9file.go @@ -85,6 +85,13 @@ func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAt return err } +func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { + ctx.UninterruptibleSleepStart(false) + xattrs, err := f.file.ListXattr(size) + ctx.UninterruptibleSleepFinish(false) + return xattrs, err +} + func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { ctx.UninterruptibleSleepStart(false) val, err := f.file.GetXattr(name, size) @@ -99,6 +106,13 @@ func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) return err } +func (f p9file) removeXattr(ctx context.Context, name string) error { + ctx.UninterruptibleSleepStart(false) + err := f.file.RemoveXattr(name) + ctx.UninterruptibleSleepFinish(false) + return err +} + func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := f.file.Allocate(mode, offset, length) diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 4433071aa..baf81b4db 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -763,7 +763,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() @@ -776,7 +776,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([ } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index f2ac23c88..4e6cd3491 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -51,6 +51,7 @@ go_library( "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", + "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5339d7072..f4d50d64f 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -696,51 +696,47 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath } // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. -func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) { +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return nil, err } - // TODO(b/127675828): support extended attributes - return nil, syserror.ENOTSUP + return d.inode.listxattr(size) } // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. -func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) { +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return "", err } - // TODO(b/127675828): support extended attributes - return "", syserror.ENOTSUP + return d.inode.getxattr(rp.Credentials(), &opts) } // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.setxattr(rp.Credentials(), &opts) } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) + d, err := resolveLocked(rp) if err != nil { return err } - // TODO(b/127675828): support extended attributes - return syserror.ENOTSUP + return d.inode.removexattr(rp.Credentials(), name) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 654e788e3..9fa8637d5 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -27,6 +27,7 @@ package tmpfs import ( "fmt" "math" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -37,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/lock" + "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -186,6 +188,11 @@ type inode struct { // filesystem.RmdirAt() drops the reference. refs int64 + // xattrs implements extended attributes. + // + // TODO(b/148380782): Support xattrs other than user.* + xattrs memxattr.SimpleExtendedAttributes + // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. mu sync.Mutex @@ -535,6 +542,56 @@ func (i *inode) touchCMtimeLocked() { atomic.StoreInt64(&i.ctime, now) } +func (i *inode) listxattr(size uint64) ([]string, error) { + return i.xattrs.Listxattr(size) +} + +func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if err := i.checkPermissions(creds, vfs.MayRead); err != nil { + return "", err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return "", syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return "", syserror.ENODATA + } + return i.xattrs.Getxattr(opts) +} + +func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Setxattr(opts) +} + +func (i *inode) removexattr(creds *auth.Credentials, name string) error { + if err := i.checkPermissions(creds, vfs.MayWrite); err != nil { + return err + } + if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { + return syserror.EOPNOTSUPP + } + if !i.userXattrSupported() { + return syserror.EPERM + } + return i.xattrs.Removexattr(name) +} + +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. +func (i *inode) userXattrSupported() bool { + filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode) + return filetype == linux.S_IFREG || filetype == linux.S_IFDIR +} + // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { @@ -562,3 +619,23 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) creds := auth.CredentialsFromContext(ctx) return fd.inode().setStat(ctx, creds, &opts.Stat) } + +// Listxattr implements vfs.FileDescriptionImpl.Listxattr. +func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { + return fd.inode().listxattr(size) +} + +// Getxattr implements vfs.FileDescriptionImpl.Getxattr. +func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) { + return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Setxattr implements vfs.FileDescriptionImpl.Setxattr. +func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { + return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) +} + +// Removexattr implements vfs.FileDescriptionImpl.Removexattr. +func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { + return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index 89e9ff4d7..af455d5c1 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -51,7 +51,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml } defer tpop.Release() - names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop) + names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { return 0, nil, err } @@ -74,7 +74,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } defer file.DecRef() - names, err := file.Listxattr(t) + names, err := file.Listxattr(t, uint64(size)) if err != nil { return 0, nil, err } @@ -116,7 +116,10 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli return 0, nil, err } - value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, name) + value, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetxattrOptions{ + Name: name, + Size: uint64(size), + }) if err != nil { return 0, nil, err } @@ -145,7 +148,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - value, err := file.Getxattr(t, name) + value, err := file.Getxattr(t, &vfs.GetxattrOptions{Name: name, Size: uint64(size)}) if err != nil { return 0, nil, err } @@ -230,7 +233,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - return 0, nil, file.Setxattr(t, vfs.SetxattrOptions{ + return 0, nil, file.Setxattr(t, &vfs.SetxattrOptions{ Name: name, Value: value, Flags: uint32(flags), diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index d1f6dfb45..a64d86122 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -245,7 +245,7 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath } // ListxattrAt implements FilesystemImpl.ListxattrAt. -func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) { +func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { return nil, syserror.ENOTDIR } @@ -253,7 +253,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([ } // GetxattrAt implements FilesystemImpl.GetxattrAt. -func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) { +func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) { if !rp.Done() { return "", syserror.ENOTDIR } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 20c545fca..4fb9aea87 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -401,11 +401,11 @@ type FileDescriptionImpl interface { Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) // Listxattr returns all extended attribute names for the file. - Listxattr(ctx context.Context) ([]string, error) + Listxattr(ctx context.Context, size uint64) ([]string, error) // Getxattr returns the value associated with the given extended attribute // for the file. - Getxattr(ctx context.Context, name string) (string, error) + Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) // Setxattr changes the value associated with the given extended attribute // for the file. @@ -605,18 +605,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. // Listxattr returns all extended attribute names for the file represented by // fd. -func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { +// +// If the size of the list (including a NUL terminating byte after every entry) +// would exceed size, ERANGE may be returned. Note that implementations +// are free to ignore size entirely and return without error). In all cases, +// if size is 0, the list should be returned without error, regardless of size. +func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp) + names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) vfsObj.putResolvingPath(rp) return names, err } - names, err := fd.impl.Listxattr(ctx) + names, err := fd.impl.Listxattr(ctx, size) if err == syserror.ENOTSUP { // Linux doesn't actually return ENOTSUP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security @@ -629,34 +634,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) { // Getxattr returns the value associated with the given extended attribute for // the file represented by fd. -func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) { +// +// If the size of the return value exceeds opts.Size, ERANGE may be returned +// (note that implementations are free to ignore opts.Size entirely and return +// without error). In all cases, if opts.Size is 0, the value should be +// returned without error, regardless of size. +func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name) + val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(rp) return val, err } - return fd.impl.Getxattr(ctx, name) + return fd.impl.Getxattr(ctx, *opts) } // Setxattr changes the value associated with the given extended attribute for // the file represented by fd. -func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error { +func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error { if fd.opts.UseDentryMetadata { vfsObj := fd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vd, Start: fd.vd, }) - err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts) + err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) vfsObj.putResolvingPath(rp) return err } - return fd.impl.Setxattr(ctx, opts) + return fd.impl.Setxattr(ctx, *opts) } // Removexattr removes the given extended attribute from the file represented diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index d45e602ce..f4c111926 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg // Listxattr implements FileDescriptionImpl.Listxattr analogously to // inode_operations::listxattr == NULL in Linux. -func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) { +func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) { // This isn't exactly accurate; see FileDescription.Listxattr. return nil, syserror.ENOTSUP } // Getxattr implements FileDescriptionImpl.Getxattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. -func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) { +func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) { return "", syserror.ENOTSUP } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index bef1bd312..a537a29d1 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -442,7 +442,13 @@ type FilesystemImpl interface { // - If extended attributes are not supported by the filesystem, // ListxattrAt returns nil. (See FileDescription.Listxattr for an // explanation.) - ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) + // + // - If the size of the list (including a NUL terminating byte after every + // entry) would exceed size, ERANGE may be returned. Note that + // implementations are free to ignore size entirely and return without + // error). In all cases, if size is 0, the list should be returned without + // error, regardless of size. + ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) // GetxattrAt returns the value associated with the given extended // attribute for the file at rp. @@ -451,7 +457,15 @@ type FilesystemImpl interface { // // - If extended attributes are not supported by the filesystem, GetxattrAt // returns ENOTSUP. - GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) + // + // - If an extended attribute named opts.Name does not exist, ENODATA is + // returned. + // + // - If the size of the return value exceeds opts.Size, ERANGE may be + // returned (note that implementations are free to ignore opts.Size entirely + // and return without error). In all cases, if opts.Size is 0, the value + // should be returned without error, regardless of size. + GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) // SetxattrAt changes the value associated with the given extended // attribute for the file at rp. @@ -460,6 +474,10 @@ type FilesystemImpl interface { // // - If extended attributes are not supported by the filesystem, SetxattrAt // returns ENOTSUP. + // + // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists, + // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, + // ENODATA is returned. SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error // RemovexattrAt removes the given extended attribute from the file at rp. @@ -468,6 +486,8 @@ type FilesystemImpl interface { // // - If extended attributes are not supported by the filesystem, // RemovexattrAt returns ENOTSUP. + // + // - If name does not exist, ENODATA is returned. RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD new file mode 100644 index 000000000..d8c4d27b9 --- /dev/null +++ b/pkg/sentry/vfs/memxattr/BUILD @@ -0,0 +1,15 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "memxattr", + srcs = ["xattr.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go new file mode 100644 index 000000000..cc1e7d764 --- /dev/null +++ b/pkg/sentry/vfs/memxattr/xattr.go @@ -0,0 +1,102 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memxattr provides a default, in-memory extended attribute +// implementation. +package memxattr + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SimpleExtendedAttributes implements extended attributes using a map of +// names to values. +// +// +stateify savable +type SimpleExtendedAttributes struct { + // mu protects the below fields. + mu sync.RWMutex `state:"nosave"` + xattrs map[string]string +} + +// Getxattr returns the value at 'name'. +func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) { + x.mu.RLock() + value, ok := x.xattrs[opts.Name] + x.mu.RUnlock() + if !ok { + return "", syserror.ENODATA + } + // Check that the size of the buffer provided in getxattr(2) is large enough + // to contain the value. + if opts.Size != 0 && uint64(len(value)) > opts.Size { + return "", syserror.ERANGE + } + return value, nil +} + +// Setxattr sets 'value' at 'name'. +func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error { + x.mu.Lock() + defer x.mu.Unlock() + if x.xattrs == nil { + if opts.Flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } + x.xattrs = make(map[string]string) + } + + _, ok := x.xattrs[opts.Name] + if ok && opts.Flags&linux.XATTR_CREATE != 0 { + return syserror.EEXIST + } + if !ok && opts.Flags&linux.XATTR_REPLACE != 0 { + return syserror.ENODATA + } + + x.xattrs[opts.Name] = opts.Value + return nil +} + +// Listxattr returns all names in xattrs. +func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) { + // Keep track of the size of the buffer needed in listxattr(2) for the list. + listSize := 0 + x.mu.RLock() + names := make([]string, 0, len(x.xattrs)) + for n := range x.xattrs { + names = append(names, n) + // Add one byte per null terminator. + listSize += len(n) + 1 + } + x.mu.RUnlock() + if size != 0 && uint64(listSize) > size { + return nil, syserror.ERANGE + } + return names, nil +} + +// Removexattr removes the xattr at 'name'. +func (x *SimpleExtendedAttributes) Removexattr(name string) error { + x.mu.Lock() + defer x.mu.Unlock() + if _, ok := x.xattrs[name]; !ok { + return syserror.ENODATA + } + delete(x.xattrs, name) + return nil +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 2f04bf882..534528ce6 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -132,6 +132,20 @@ type SetStatOptions struct { Stat linux.Statx } +// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), +// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and +// FileDescriptionImpl.Getxattr(). +type GetxattrOptions struct { + // Name is the name of the extended attribute to retrieve. + Name string + + // Size is the maximum value size that the caller will tolerate. If the value + // is larger than size, getxattr methods may return ERANGE, but they are also + // free to ignore the hint entirely (i.e. the value returned may be larger + // than size). All size checking is done independently at the syscall layer. + Size uint64 +} + // SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(), // FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and // FileDescriptionImpl.Setxattr(). diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 720b90d8f..f592913d5 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -680,10 +680,10 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti // ListxattrAt returns all extended attribute names for the file at the given // path. -func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) { +func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { - names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp) + names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) if err == nil { vfs.putResolvingPath(rp) return names, nil @@ -705,10 +705,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // GetxattrAt returns the value associated with the given extended attribute // for the file at the given path. -func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) { +func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { - val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name) + val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) if err == nil { vfs.putResolvingPath(rp) return val, nil diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc index 8b00ef44c..3231732ec 100644 --- a/test/syscalls/linux/xattr.cc +++ b/test/syscalls/linux/xattr.cc @@ -41,12 +41,12 @@ class XattrTest : public FileTest {}; TEST_F(XattrTest, XattrNonexistentFile) { const char* path = "/does/not/exist"; - EXPECT_THAT(setxattr(path, nullptr, nullptr, 0, /*flags=*/0), - SyscallFailsWithErrno(ENOENT)); - EXPECT_THAT(getxattr(path, nullptr, nullptr, 0), + const char* name = "user.test"; + EXPECT_THAT(setxattr(path, name, nullptr, 0, /*flags=*/0), SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(getxattr(path, name, nullptr, 0), SyscallFailsWithErrno(ENOENT)); EXPECT_THAT(listxattr(path, nullptr, 0), SyscallFailsWithErrno(ENOENT)); - EXPECT_THAT(removexattr(path, nullptr), SyscallFailsWithErrno(ENOENT)); + EXPECT_THAT(removexattr(path, name), SyscallFailsWithErrno(ENOENT)); } TEST_F(XattrTest, XattrNullName) { -- cgit v1.2.3 From f03996c5e9803934226e4b3a10827501cb936ab9 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 16 Apr 2020 19:26:02 -0700 Subject: Implement pipe(2) and pipe2(2) for VFS2. Updates #1035 PiperOrigin-RevId: 306968644 --- pkg/sentry/fsimpl/pipefs/BUILD | 20 +++ pkg/sentry/fsimpl/pipefs/pipefs.go | 148 +++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 23 +-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 30 +++- pkg/sentry/kernel/pipe/vfs.go | 162 ++++++++++++--------- pkg/sentry/syscalls/linux/sys_pipe.go | 14 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 3 + pkg/sentry/syscalls/linux/vfs2/fd.go | 17 +++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 4 +- pkg/sentry/syscalls/linux/vfs2/pipe.go | 63 ++++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 8 +- pkg/sentry/vfs/vfs.go | 2 +- test/syscalls/linux/pipe.cc | 2 + 16 files changed, 389 insertions(+), 112 deletions(-) create mode 100644 pkg/sentry/fsimpl/pipefs/BUILD create mode 100644 pkg/sentry/fsimpl/pipefs/pipefs.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/pipe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD new file mode 100644 index 000000000..0d411606f --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "pipefs", + srcs = ["pipefs.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go new file mode 100644 index 000000000..faf3179bc --- /dev/null +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -0,0 +1,148 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package pipefs provides the filesystem implementation backing +// Kernel.PipeMount. +package pipefs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +type filesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (filesystemType) Name() string { + return "pipefs" +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("pipefs.filesystemType.GetFilesystem should never be called") +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + + // TODO(gvisor.dev/issue/1193): + // + // - kernfs does not provide a way to implement statfs, from which we + // should indicate PIPEFS_MAGIC. + // + // - kernfs does not provide a way to override names for + // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic + // name fmt.Sprintf("pipe:[%d]", inode.ino). +} + +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by +// pipefs. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + +// inode implements kernfs.Inode. +type inode struct { + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + kernfs.InodeNoopRefCount + + pipe *pipe.VFSPipe + + ino uint64 + uid auth.KUID + gid auth.KGID + // We use the creation timestamp for all of atime, mtime, and ctime. + ctime ktime.Time +} + +func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { + creds := auth.CredentialsFromContext(ctx) + return &inode{ + pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + ino: fs.NextIno(), + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + ctime: ktime.NowFromContext(ctx), + } +} + +const pipeMode = 0600 | linux.S_IFIFO + +// CheckPermissions implements kernfs.Inode.CheckPermissions. +func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) +} + +// Mode implements kernfs.Inode.Mode. +func (i *inode) Mode() linux.FileMode { + return pipeMode +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) + return linux.Statx{ + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + // TODO(gvisor.dev/issue/1197): Device number. + }, nil +} + +// SetStat implements kernfs.Inode.SetStat. +func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + return syserror.EPERM +} + +// Open implements kernfs.Inode.Open. +func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // FIXME(b/38173783): kernfs does not plumb Context here. + return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags) +} + +// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read +// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + inode := newInode(ctx, fs) + var d kernfs.Dentry + d.Init(inode) + defer d.DecRef() + return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) +} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index f4d50d64f..660f5a29b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -392,7 +392,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP case *namedPipe: - return newNamedPipeFD(ctx, impl, rp, &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 2c5c739df..8d77b3fa8 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -16,10 +16,8 @@ package tmpfs import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) @@ -33,27 +31,8 @@ type namedPipe struct { // * fs.mu must be locked. // * rp.Mount().CheckBeginWrite() has been called successfully. func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { - file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)} + file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} file.inode.init(file, fs, creds, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } - -// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented -// entirely via struct embedding. -type namedPipeFD struct { - fileDescription - - *pipe.VFSPipeFD -} - -func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) { - var err error - var fd namedPipeFD - fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, vfsd, &fd.vfsfd, flags) - if err != nil { - return nil, err - } - fd.vfsfd.Init(&fd, flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}) - return &fd.vfsfd, nil -} diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 9fa8637d5..a59b24d45 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -357,6 +357,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return err } i.mu.Lock() + defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool @@ -427,7 +428,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu atomic.StoreInt64(&i.ctime, now) } - i.mu.Unlock() return nil } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e0ff58d8c..e47af66d6 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -170,6 +170,7 @@ go_library( "//pkg/sentry/fs/timerfd", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index de8a95854..fef60e636 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -50,6 +50,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" @@ -254,6 +255,10 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() + // syscalls (as opposed to named pipes created by mknod()). + pipeMount *vfs.Mount + // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool @@ -354,19 +359,29 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic} k.futexes = futex.NewManager() k.netlinkPorts = port.New() + if VFS2Enabled { if err := k.vfs.Init(); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } - fs := sockfs.NewFilesystem(&k.vfs) - // NewDisconnectedMount will take an additional reference on fs. - defer fs.DecRef() - sm, err := k.vfs.NewDisconnectedMount(fs, nil, &vfs.MountOptions{}) + + pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + defer pipeFilesystem.DecRef() + pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create pipefs mount: %v", err) + } + k.pipeMount = pipeMount + + socketFilesystem := sockfs.NewFilesystem(&k.vfs) + defer socketFilesystem.DecRef() + socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to initialize socket mount: %v", err) } - k.socketMount = sm + k.socketMount = socketMount } + return nil } @@ -1613,3 +1628,8 @@ func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } + +// PipeMount returns the pipefs mount. +func (k *Kernel) PipeMount() *vfs.Mount { + return k.pipeMount +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a5675bd70..b54f08a30 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -49,38 +49,42 @@ type VFSPipe struct { } // NewVFSPipe returns an initialized VFSPipe. -func NewVFSPipe(sizeBytes, atomicIOBytes int64) *VFSPipe { +func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { var vp VFSPipe - initPipe(&vp.pipe, true /* isNamed */, sizeBytes, atomicIOBytes) + initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes) return &vp } -// NewVFSPipeFD opens a named pipe. Named pipes have special blocking semantics -// during open: +// ReaderWriterPair returns read-only and write-only FDs for vp. // -// "Normally, opening the FIFO blocks until the other end is opened also. A -// process can open a FIFO in nonblocking mode. In this case, opening for -// read-only will succeed even if no-one has opened on the write side yet, -// opening for write-only will fail with ENXIO (no such device or address) -// unless the other end has already been opened. Under Linux, opening a FIFO -// for read and write will succeed both in blocking and nonblocking mode. POSIX -// leaves this behavior undefined. This can be used to open a FIFO for writing -// while there are no readers available." - fifo(7) -func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { +// Preconditions: statusFlags should not contain an open access mode. +func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) +} + +// Open opens the pipe represented by vp. +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() - readable := vfs.MayReadFileWithOpenFlags(flags) - writable := vfs.MayWriteFileWithOpenFlags(flags) + readable := vfs.MayReadFileWithOpenFlags(statusFlags) + writable := vfs.MayWriteFileWithOpenFlags(statusFlags) if !readable && !writable { return nil, syserror.EINVAL } - vfd, err := vp.open(vfsd, vfsfd, flags) - if err != nil { - return nil, err - } + fd := vp.newFD(mnt, vfsd, statusFlags) + // Named pipes have special blocking semantics during open: + // + // "Normally, opening the FIFO blocks until the other end is opened also. A + // process can open a FIFO in nonblocking mode. In this case, opening for + // read-only will succeed even if no-one has opened on the write side yet, + // opening for write-only will fail with ENXIO (no such device or address) + // unless the other end has already been opened. Under Linux, opening a + // FIFO for read and write will succeed both in blocking and nonblocking + // mode. POSIX leaves this behavior undefined. This can be used to open a + // FIFO for writing while there are no readers available." - fifo(7) switch { case readable && writable: // Pipes opened for read-write always succeed without blocking. @@ -89,23 +93,26 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf case readable: newHandleLocked(&vp.rWakeup) - // If this pipe is being opened as nonblocking and there's no + // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. - if flags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } case writable: newHandleLocked(&vp.wWakeup) - if !vp.pipe.HasReaders() { - // Nonblocking, write-only opens fail with ENXIO when - // the read side isn't open yet. - if flags&linux.O_NONBLOCK != 0 { + if vp.pipe.isNamed && !vp.pipe.HasReaders() { + // Non-blocking, write-only opens fail with ENXIO when the read + // side isn't open yet. + if statusFlags&linux.O_NONBLOCK != 0 { + fd.DecRef() return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { + fd.DecRef() return nil, syserror.EINTR } } @@ -114,96 +121,93 @@ func (vp *VFSPipe) NewVFSPipeFD(ctx context.Context, vfsd *vfs.Dentry, vfsfd *vf panic("invalid pipe flags: must be readable, writable, or both") } - return vfd, nil + return fd, nil } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) open(vfsd *vfs.Dentry, vfsfd *vfs.FileDescription, flags uint32) (*VFSPipeFD, error) { - var fd VFSPipeFD - fd.flags = flags - fd.readable = vfs.MayReadFileWithOpenFlags(flags) - fd.writable = vfs.MayWriteFileWithOpenFlags(flags) - fd.vfsfd = vfsfd - fd.pipe = &vp.pipe +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { + fd := &VFSPipeFD{ + pipe: &vp.pipe, + } + fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }) switch { - case fd.readable && fd.writable: + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): vp.pipe.rOpen() vp.pipe.wOpen() - case fd.readable: + case fd.vfsfd.IsReadable(): vp.pipe.rOpen() - case fd.writable: + case fd.vfsfd.IsWritable(): vp.pipe.wOpen() default: panic("invalid pipe flags: must be readable, writable, or both") } - return &fd, nil + return &fd.vfsfd } -// VFSPipeFD implements a subset of vfs.FileDescriptionImpl for pipes. It is -// expected that filesystesm will use this in a struct implementing -// vfs.FileDescriptionImpl. +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. type VFSPipeFD struct { - pipe *Pipe - flags uint32 - readable bool - writable bool - vfsfd *vfs.FileDescription + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + pipe *Pipe } // Release implements vfs.FileDescriptionImpl.Release. func (fd *VFSPipeFD) Release() { var event waiter.EventMask - if fd.readable { + if fd.vfsfd.IsReadable() { fd.pipe.rClose() - event |= waiter.EventIn + event |= waiter.EventOut } - if fd.writable { + if fd.vfsfd.IsWritable() { fd.pipe.wClose() - event |= waiter.EventOut + event |= waiter.EventIn | waiter.EventHUp } if event == 0 { panic("invalid pipe flags: must be readable, writable, or both") } - if fd.writable { - fd.vfsfd.VirtualDentry().Mount().EndWrite() - } - fd.pipe.Notify(event) } -// OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *VFSPipeFD) OnClose(_ context.Context) error { - return nil +// Readiness implements waiter.Waitable.Readiness. +func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { + switch { + case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable(): + return fd.pipe.rwReadiness() + case fd.vfsfd.IsReadable(): + return fd.pipe.rReadiness() + case fd.vfsfd.IsWritable(): + return fd.pipe.wReadiness() + default: + panic("pipe FD is neither readable nor writable") + } } -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *VFSPipeFD) PRead(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.ReadOptions) (int64, error) { - return 0, syserror.ESPIPE +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.pipe.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) { + fd.pipe.EventUnregister(e) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { - if !fd.readable { - return 0, syserror.EINVAL - } - return fd.pipe.Read(ctx, dst) } -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *VFSPipeFD) PWrite(_ context.Context, _ usermem.IOSequence, _ int64, _ vfs.WriteOptions) (int64, error) { - return 0, syserror.ESPIPE -} - // Write implements vfs.FileDescriptionImpl.Write. func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { - if !fd.writable { - return 0, syserror.EINVAL - } - return fd.pipe.Write(ctx, src) } @@ -211,3 +215,17 @@ func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.Wr func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return fd.pipe.Ioctl(ctx, uio, args) } + +// PipeSize implements fcntl(F_GETPIPE_SZ). +func (fd *VFSPipeFD) PipeSize() int64 { + // Inline Pipe.FifoSize() rather than calling it with nil Context and + // fs.File and ignoring the returned error (which is always nil). + fd.pipe.mu.Lock() + defer fd.pipe.mu.Unlock() + return fd.pipe.max +} + +// SetPipeSize implements fcntl(F_SETPIPE_SZ). +func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { + return fd.pipe.SetFifoSize(size) +} diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 798344042..43c510930 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -24,6 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // pipe2 implements the actual system call with flags. func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { @@ -45,10 +47,12 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { } if _, err := t.CopyOut(addr, fds); err != nil { - // The files are not closed in this case, the exact semantics - // of this error case are not well defined, but they could have - // already been observed by user space. - return 0, syserror.EFAULT + for _, fd := range fds { + if file, _ := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return 0, err } return 0, nil } @@ -69,3 +73,5 @@ func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall n, err := pipe2(t, addr, flags) return n, nil, err } + +// LINT.ThenChange(vfs2/pipe.go) diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index b32abfe59..6ff2d84d2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -18,6 +18,7 @@ go_library( "linux64_override_arm64.go", "mmap.go", "path.go", + "pipe.go", "poll.go", "read_write.go", "setstat.go", @@ -39,8 +40,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/loader", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 3afcea665..8181d80f4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/syserror" ) @@ -140,6 +141,22 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) + case linux.F_SETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + n, err := pipefile.SetPipeSize(int64(args[2].Int())) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil + case linux.F_GETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + return uintptr(pipefile.PipeSize()), nil, nil default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 645e0bcb8..21eb98444 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -39,7 +39,7 @@ func Override(table map[uintptr]kernel.Syscall) { table[19] = syscalls.Supported("readv", Readv) table[20] = syscalls.Supported("writev", Writev) table[21] = syscalls.Supported("access", Access) - delete(table, 22) // pipe + table[22] = syscalls.Supported("pipe", Pipe) table[23] = syscalls.Supported("select", Select) table[32] = syscalls.Supported("dup", Dup) table[33] = syscalls.Supported("dup2", Dup2) @@ -151,7 +151,7 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 290) // eventfd2 table[291] = syscalls.Supported("epoll_create1", EpollCreate1) table[292] = syscalls.Supported("dup3", Dup3) - delete(table, 293) // pipe2 + table[293] = syscalls.Supported("pipe2", Pipe2) delete(table, 294) // inotify_init1 table[295] = syscalls.Supported("preadv", Preadv) table[296] = syscalls.Supported("pwritev", Pwritev) diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go new file mode 100644 index 000000000..4a01e4209 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Pipe implements Linux syscall pipe(2). +func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + return 0, nil, pipe2(t, addr, 0) +} + +// Pipe2 implements Linux syscall pipe2(2). +func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + return 0, nil, pipe2(t, addr, flags) +} + +func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { + if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 { + return syserror.EINVAL + } + r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) + defer r.DecRef() + defer w.DecRef() + + fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ + CloseOnExec: flags&linux.O_CLOEXEC != 0, + }) + if err != nil { + return err + } + if _, err := t.CopyOut(addr, fds); err != nil { + for _, fd := range fds { + if _, file := t.FDTable().Remove(fd); file != nil { + file.DecRef() + } + } + return err + } + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 898b190fd..6c6998f45 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -103,7 +103,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.Read(t, dst, opts) + n, err = file.Read(t, dst, opts) total += n if err != syserror.ErrWouldBlock { break @@ -248,7 +248,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.PRead(t, dst, offset+total, opts) + n, err = file.PRead(t, dst, offset+total, opts) total += n if err != syserror.ErrWouldBlock { break @@ -335,7 +335,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.Write(t, src, opts) + n, err = file.Write(t, src, opts) total += n if err != syserror.ErrWouldBlock { break @@ -480,7 +480,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o // Issue the request and break out if it completes with anything other than // "would block". - n, err := file.PWrite(t, src, offset+total, opts) + n, err = file.PWrite(t, src, offset+total, opts) total += n if err != syserror.ErrWouldBlock { break diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 053c6e1d1..cb5bbd781 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -335,7 +335,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) - if err != nil { + if err == nil { vfs.putResolvingPath(rp) return nil } diff --git a/test/syscalls/linux/pipe.cc b/test/syscalls/linux/pipe.cc index d8e19e910..67228b66b 100644 --- a/test/syscalls/linux/pipe.cc +++ b/test/syscalls/linux/pipe.cc @@ -265,6 +265,8 @@ TEST_P(PipeTest, OffsetCalls) { SyscallFailsWithErrno(ESPIPE)); struct iovec iov; + iov.iov_base = &buf; + iov.iov_len = sizeof(buf); EXPECT_THAT(preadv(wfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); EXPECT_THAT(pwritev(rfd_.get(), &iov, 1, 0), SyscallFailsWithErrno(ESPIPE)); } -- cgit v1.2.3 From 1a940f2b6c834693d9f85e3b648a3be3d986129d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 20 Apr 2020 08:50:27 -0700 Subject: Resolve issue with file mode for host fds. Instead of plumbing error through kernfs.Inode.Mode, panic if err != nil. The errors that can result from an fstat syscall all indicate that something is fundamentally wrong, and panicking should be acceptable. PiperOrigin-RevId: 307406847 --- pkg/sentry/fsimpl/host/host.go | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 97fa7f7ab..fe14476f1 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -94,7 +94,6 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err isTTY: isTTY, canMap: canMap(uint32(fileType)), ino: fs.NextIno(), - mode: fileMode, // For simplicity, set offset to 0. Technically, we should use the existing // offset on the host if the file is seekable. offset: 0, @@ -149,20 +148,6 @@ type inode struct { // This field is initialized at creation time and is immutable. ino uint64 - // modeMu protects mode. - modeMu sync.Mutex - - // mode is a cached version of the file mode on the host. Note that it may - // become out of date if the mode is changed on the host, e.g. with chmod. - // - // Generally, it is better to retrieve the mode from the host through an - // fstat syscall. We only use this value in inode.Mode(), which cannot - // return an error, if the syscall to host fails. - // - // FIXME(b/152294168): Plumb error into Inode.Mode() return value so we - // can get rid of this. - mode linux.FileMode - // offsetMu protects offset. offsetMu sync.Mutex @@ -195,10 +180,11 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a // Mode implements kernfs.Inode. func (i *inode) Mode() linux.FileMode { mode, _, _, err := i.getPermissions() + // Retrieving the mode from the host fd using fstat(2) should not fail. + // If the syscall does not succeed, something is fundamentally wrong. if err != nil { - return i.mode + panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) } - return linux.FileMode(mode) } @@ -208,11 +194,6 @@ func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) { if err := syscall.Fstat(i.hostFD, &s); err != nil { return 0, 0, 0, err } - - // Update cached mode. - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil } @@ -292,12 +273,6 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro ls.Ino = i.ino } - // Update cached mode. - if (mask&linux.STATX_TYPE != 0) && (mask&linux.STATX_MODE != 0) { - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() - } return ls, nil } @@ -364,9 +339,6 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil { return err } - i.modeMu.Lock() - i.mode = linux.FileMode(s.Mode) - i.modeMu.Unlock() } if m&linux.STATX_SIZE != 0 { if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { -- cgit v1.2.3 From e72ce8cce46ed48af306df50d252853c8790924d Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 20 Apr 2020 10:09:23 -0700 Subject: Change lingering uses of "memfs" in fsimpl/tmpfs to "tmpfs". PiperOrigin-RevId: 307422746 --- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 4 ++-- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 383133e44..651912169 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -168,7 +168,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { } } -func BenchmarkVFS2MemfsStat(b *testing.B) { +func BenchmarkVFS2TmpfsStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) @@ -362,7 +362,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { } } -func BenchmarkVFS2MemfsMountStat(b *testing.B) { +func BenchmarkVFS2TmpfsMountStat(b *testing.B) { for _, depth := range depths { b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { ctx := contexttest.Context(b) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 660f5a29b..452c4e2e0 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -148,7 +148,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if !dir && rp.MustBeDir() { return syserror.ENOENT } - // In memfs, the only way to cause a dentry to be disowned is by removing + // In tmpfs, the only way to cause a dentry to be disowned is by removing // it from the filesystem, so this check is equivalent to checking if // parent has been removed. if parent.vfsd.IsDisowned() { diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index a59b24d45..82c709b43 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -247,7 +247,7 @@ func (i *inode) incLinksLocked() { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink == maxLinks { - panic("memfs.inode.incLinksLocked() called with maximum link count") + panic("tmpfs.inode.incLinksLocked() called with maximum link count") } atomic.AddUint32(&i.nlink, 1) } -- cgit v1.2.3 From 9b5e305e05ef3ad51778981062d6152cea1cd4fb Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 21 Apr 2020 12:16:42 -0700 Subject: Remove filesystem structure from vfs.Dentry. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change: - Drastically simplifies the synchronization model: filesystem structure is both implementation-defined and implementation-synchronized. - Allows implementations of vfs.DentryImpl to use implementation-specific dentry types, reducing casts during path traversal. - Doesn't require dentries representing non-directory files to waste space on a map of children. - Allows dentry revalidation and mount lookup to be correctly ordered (fixed FIXME in fsimpl/gofer/filesystem.go). - Removes the need to have two separate maps in gofer.dentry (dentry.vfsd.children and dentry.negativeChildren) for positive and negative lookups respectively. //pkg/sentry/fsimpl/tmpfs/benchmark_test.go: name old time/op new time/op delta VFS2TmpfsStat/1-112 172ns ± 4% 165ns ± 3% -4.08% (p=0.002 n=9+9) VFS2TmpfsStat/2-112 199ns ± 3% 195ns ±10% ~ (p=0.132 n=8+9) VFS2TmpfsStat/3-112 230ns ± 2% 216ns ± 2% -6.15% (p=0.000 n=8+8) VFS2TmpfsStat/8-112 390ns ± 2% 358ns ± 4% -8.33% (p=0.000 n=9+8) VFS2TmpfsStat/64-112 2.20µs ± 3% 2.01µs ± 3% -8.48% (p=0.000 n=10+8) VFS2TmpfsStat/100-112 3.42µs ± 9% 3.08µs ± 2% -9.82% (p=0.000 n=9+8) VFS2TmpfsMountStat/1-112 278ns ± 1% 286ns ±15% ~ (p=0.712 n=8+10) VFS2TmpfsMountStat/2-112 311ns ± 4% 298ns ± 2% -4.27% (p=0.000 n=9+8) VFS2TmpfsMountStat/3-112 339ns ± 3% 330ns ± 9% ~ (p=0.070 n=8+9) VFS2TmpfsMountStat/8-112 503ns ± 3% 466ns ± 3% -7.38% (p=0.000 n=8+8) VFS2TmpfsMountStat/64-112 2.53µs ±16% 2.17µs ± 7% -14.19% (p=0.000 n=10+9) VFS2TmpfsMountStat/100-112 3.60µs ± 4% 3.30µs ± 8% -8.33% (p=0.001 n=8+9) Updates #1035 PiperOrigin-RevId: 307655892 --- pkg/sentry/fsimpl/ext/BUILD | 12 ++ pkg/sentry/fsimpl/ext/dentry.go | 4 + pkg/sentry/fsimpl/ext/directory.go | 21 ++- pkg/sentry/fsimpl/ext/filesystem.go | 54 ++++-- pkg/sentry/fsimpl/ext/inode.go | 2 +- pkg/sentry/fsimpl/gofer/BUILD | 12 ++ pkg/sentry/fsimpl/gofer/directory.go | 55 +++--- pkg/sentry/fsimpl/gofer/filesystem.go | 202 +++++++++++--------- pkg/sentry/fsimpl/gofer/gofer.go | 66 ++++--- pkg/sentry/fsimpl/gofer/gofer_test.go | 3 +- pkg/sentry/fsimpl/kernfs/BUILD | 12 ++ pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 159 +++++++++------- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 33 ++-- pkg/sentry/fsimpl/proc/tasks_test.go | 16 +- pkg/sentry/fsimpl/tmpfs/BUILD | 12 ++ pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 7 - pkg/sentry/fsimpl/tmpfs/directory.go | 84 ++++++--- pkg/sentry/fsimpl/tmpfs/filesystem.go | 249 +++++++++++++------------ pkg/sentry/fsimpl/tmpfs/stat_test.go | 12 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 82 ++++---- pkg/sentry/vfs/dentry.go | 259 +++++--------------------- pkg/sentry/vfs/file_description.go | 3 +- pkg/sentry/vfs/filesystem.go | 5 +- pkg/sentry/vfs/filesystem_impl_util.go | 26 --- pkg/sentry/vfs/genericfstree/BUILD | 16 ++ pkg/sentry/vfs/genericfstree/genericfstree.go | 80 ++++++++ pkg/sentry/vfs/mount.go | 9 +- pkg/sentry/vfs/pathname.go | 6 +- pkg/sentry/vfs/resolving_path.go | 85 +++------ 31 files changed, 836 insertions(+), 754 deletions(-) create mode 100644 pkg/sentry/vfs/genericfstree/BUILD create mode 100644 pkg/sentry/vfs/genericfstree/genericfstree.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index d83d75b3d..a4947c480 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "ext", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "ext", srcs = [ @@ -26,6 +37,7 @@ go_library( "extent_file.go", "file_description.go", "filesystem.go", + "fstree.go", "inode.go", "regular_file.go", "symlink.go", diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index a080cb189..bfbd7c3d4 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -22,6 +22,10 @@ import ( type dentry struct { vfsd vfs.Dentry + // Protected by filesystem.mu. + parent *dentry + name string + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory Inode (with hard links). inode is // immutable. diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index bd6ede995..12b875c8f 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" - "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -31,6 +30,10 @@ import ( type directory struct { inode inode + // childCache maps filenames to dentries for children for which dentries + // have been instantiated. childCache is protected by filesystem.mu. + childCache map[string]*dentry + // mu serializes the changes to childList. // Lock Order (outermost locks must be taken first): // directory.mu @@ -50,9 +53,13 @@ type directory struct { childMap map[string]*dirent } -// newDirectroy is the directory constructor. -func newDirectroy(inode inode, newDirent bool) (*directory, error) { - file := &directory{inode: inode, childMap: make(map[string]*dirent)} +// newDirectory is the directory constructor. +func newDirectory(inode inode, newDirent bool) (*directory, error) { + file := &directory{ + inode: inode, + childCache: make(map[string]*dentry), + childMap: make(map[string]*dirent), + } file.inode.impl = file // Initialize childList by reading dirents from the underlying file. @@ -299,9 +306,3 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } - -// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { - // mmap(2) specifies that EACCESS should be returned for non-regular file fds. - return syserror.EACCES -} diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index afea58f65..2c22a04af 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -89,14 +89,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } for { - nextVFSD, err := rp.ResolveComponent(vfsd) - if err != nil { - return nil, nil, err + name := rp.Component() + if name == "." { + rp.Advance() + return vfsd, inode, nil } - if nextVFSD == nil { - // Since the Dentry tree is not the sole source of truth for extfs, if it's - // not in the Dentry tree, it might need to be pulled from disk. - childDirent, ok := inode.impl.(*directory).childMap[rp.Component()] + d := vfsd.Impl().(*dentry) + if name == ".." { + isRoot, err := rp.CheckRoot(vfsd) + if err != nil { + return nil, nil, err + } + if isRoot || d.parent == nil { + rp.Advance() + return vfsd, inode, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, nil, err + } + rp.Advance() + return &d.parent.vfsd, d.parent.inode, nil + } + + dir := inode.impl.(*directory) + child, ok := dir.childCache[name] + if !ok { + // We may need to instantiate a new dentry for this child. + childDirent, ok := dir.childMap[name] if !ok { // The underlying inode does not exist on disk. return nil, nil, syserror.ENOENT @@ -115,21 +134,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } // incRef because this is being added to the dentry tree. childInode.incRef() - child := newDentry(childInode) - vfsd.InsertChild(&child.vfsd, rp.Component()) - - // Continue as usual now that nextVFSD is not nil. - nextVFSD = &child.vfsd + child = newDentry(childInode) + child.parent = d + child.name = name + dir.childCache[name] = child + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, nil, err } - nextInode := nextVFSD.Impl().(*dentry).inode - if nextInode.isSymlink() && rp.ShouldFollowSymlink() { - if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil { + if child.inode.isSymlink() && rp.ShouldFollowSymlink() { + if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil { return nil, nil, err } continue } rp.Advance() - return nextVFSD, nextInode, nil + return &child.vfsd, child.inode, nil } } @@ -515,5 +535,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index a39a37318..a98512350 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -136,7 +136,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } return &f.inode, nil case linux.ModeDirectory: - f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType) + f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 99d1e3f8f..acd061905 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -15,12 +15,24 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "gofer", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "gofer", srcs = [ "dentry_list.go", "directory.go", "filesystem.go", + "fstree.go", "gofer.go", "handle.go", "handle_unsafe.go", diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 49d9f859b..d02691232 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -29,13 +29,25 @@ func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } +// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked. +// d.isDir(). child must be a newly-created dentry that has never had a parent. +func (d *dentry) cacheNewChildLocked(child *dentry, name string) { + d.IncRef() // reference held by child on its parent + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*dentry) + } + d.children[name] = child +} + // Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != // InteropModeShared. func (d *dentry) cacheNegativeChildLocked(name string) { - if d.negativeChildren == nil { - d.negativeChildren = make(map[string]struct{}) + if d.children == nil { + d.children = make(map[string]*dentry) } - d.negativeChildren[name] = struct{}{} + d.children[name] = nil } type directoryFD struct { @@ -80,34 +92,32 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Preconditions: d.isDir(). There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { - // 9P2000.L's readdir does not specify behavior in the presence of - // concurrent mutation of an iterated directory, so implementations may - // duplicate or omit entries in this case, which violates POSIX semantics. - // Thus we read all directory entries while holding d.dirMu to exclude - // directory mutations. (Note that it is impossible for the client to - // exclude concurrent mutation from other remote filesystem users. Since - // there is no way to detect if the server has incorrectly omitted - // directory entries, we simply assume that the server is well-behaved - // under InteropModeShared.) This is inconsistent with Linux (which appears - // to assume that directory fids have the correct semantics, and translates - // struct file_operations::readdir calls directly to readdir RPCs), but is - // consistent with VFS1. - // - // NOTE(b/135560623): In particular, some gofer implementations may not - // retain state between calls to Readdir, so may not provide a coherent - // directory stream across in the presence of mutation. - + // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the + // presence of concurrent mutation of an iterated directory, so + // implementations may duplicate or omit entries in this case, which + // violates POSIX semantics. Thus we read all directory entries while + // holding d.dirMu to exclude directory mutations. (Note that it is + // impossible for the client to exclude concurrent mutation from other + // remote filesystem users. Since there is no way to detect if the server + // has incorrectly omitted directory entries, we simply assume that the + // server is well-behaved under InteropModeShared.) This is inconsistent + // with Linux (which appears to assume that directory fids have the correct + // semantics, and translates struct file_operations::readdir calls directly + // to readdir RPCs), but is consistent with VFS1. + + // filesystem.renameMu is needed for d.parent, and must be locked before + // dentry.dirMu. d.fs.renameMu.RLock() - defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() if d.dirents != nil { + d.fs.renameMu.RUnlock() return d.dirents, nil } // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. - parent := d.vfsd.ParentOrSelf().Impl().(*dentry) + parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", @@ -122,6 +132,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } + d.fs.renameMu.RUnlock() off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index cd744bf5e..43e863c61 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -116,6 +116,8 @@ func putDentrySlice(ds *[]*dentry) { // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. // !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached // metadata must be up to date. +// +// Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR @@ -130,39 +132,42 @@ afterSymlink: return d, nil } if name == ".." { - parentVFSD, err := rp.ResolveParent(&d.vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + // We must assume that d.parent is correct, because if d has been moved + // elsewhere in the remote filesystem so that its parent has changed, + // we have no way of determining its new parent's location in the + // filesystem. + // + // Call rp.CheckMount() before updating d.parent's metadata, since if + // we traverse to another mount then d.parent's metadata is irrelevant. + if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } - parent := parentVFSD.Impl().(*dentry) - if fs.opts.interop == InteropModeShared { - // We must assume that parentVFSD is correct, because if d has been - // moved elsewhere in the remote filesystem so that its parent has - // changed, we have no way of determining its new parent's location - // in the filesystem. Get updated metadata for parentVFSD. - _, attrMask, attr, err := parent.file.getAttr(ctx, dentryAttrMask()) + if fs.opts.interop == InteropModeShared && d != d.parent { + _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err } - parent.updateFromP9Attrs(attrMask, &attr) + d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() - return parent, nil + return d.parent, nil } - childVFSD, err := rp.ResolveChild(&d.vfsd, name) - if err != nil { - return nil, err - } - // FIXME(jamieliu): Linux performs revalidation before mount lookup - // (fs/namei.c:lookup_fast() => __d_lookup_rcu(), d_revalidate(), - // __follow_mount_rcu()). - child, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, childVFSD, ds) + child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds) if err != nil { return nil, err } if child == nil { return nil, syserror.ENOENT } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { @@ -177,38 +182,37 @@ afterSymlink: return child, nil } -// revalidateChildLocked must be called after a call to parent.vfsd.Child(name) -// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be -// nil) to verify that the returned child (or lack thereof) is correct. If no file -// exists at name, revalidateChildLocked returns (nil, nil). +// getChildLocked returns a dentry representing the child of parent with the +// given name. If no such child exists, getChildLocked returns (nil, nil). // // Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. // parent.isDir(). name is not "." or "..". // -// Postconditions: If revalidateChildLocked returns a non-nil dentry, its -// cached metadata is up to date. -func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, childVFSD *vfs.Dentry, ds **[]*dentry) (*dentry, error) { - if childVFSD != nil && fs.opts.interop != InteropModeShared { - // We have a cached dentry that is assumed to be correct. - return childVFSD.Impl().(*dentry), nil - } - // We either don't have a cached dentry or need to verify that it's still - // correct, either of which requires a remote lookup. Check if this name is - // valid before performing the lookup. +// Postconditions: If getChildLocked returns a non-nil dentry, its cached +// metadata is up to date. +func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, syserror.ENAMETOOLONG } - // Check if we've already cached this lookup with a negative result. - if _, ok := parent.negativeChildren[name]; ok { - return nil, nil + child, ok := parent.children[name] + if ok && fs.opts.interop != InteropModeShared { + // Whether child is nil or not, it is cached information that is + // assumed to be correct. + return child, nil } - // Perform the remote lookup. + // We either don't have cached information or need to verify that it's + // still correct, either of which requires a remote lookup. Check if this + // name is valid before performing the lookup. + return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) +} + +// Preconditions: As for getChildLocked. +func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { return nil, err } - if childVFSD != nil { - child := childVFSD.Impl().(*dentry) + if child != nil { if !file.isNil() && qid.Path == child.ino { // The file at this path hasn't changed. Just update cached // metadata. @@ -219,9 +223,8 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // The file at this path has changed or no longer exists. Remove // the stale dentry from the tree, and re-evaluate its caching // status (i.e. if it has 0 references, drop it). - vfsObj.ForceDeleteDentry(childVFSD) + vfsObj.InvalidateDentry(&child.vfsd) *ds = appendDentry(*ds, child) - childVFSD = nil } if file.isNil() { // No file exists at this path now. Cache the negative lookup if @@ -232,13 +235,12 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir return nil, nil } // Create a new dentry representing the file. - child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) + child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) return nil, err } - parent.IncRef() // reference held by child on its parent - parent.vfsd.InsertChild(&child.vfsd, name) + parent.cacheNewChildLocked(child, name) // For now, child has 0 references, so our caller should call // child.checkCachingLocked(). *ds = appendDentry(*ds, child) @@ -318,9 +320,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - if parent.isDeleted() { - return syserror.ENOENT - } name := rp.Component() if name == "." || name == ".." { return syserror.EEXIST @@ -331,6 +330,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if !dir && rp.MustBeDir() { return syserror.ENOENT } + if parent.isDeleted() { + return syserror.ENOENT + } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err @@ -348,7 +350,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // it's used. return create(parent, name) } - if parent.vfsd.Child(name) != nil { + if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file @@ -356,10 +358,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if err := create(parent, name); err != nil { return err } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime() - } - delete(parent.negativeChildren, name) + parent.touchCMtime() + // Either parent.children[name] doesn't exist (in which case this is a + // no-op) or is nil (in which case this erases the now-stale information + // that the file doesn't exist). + delete(parent.children, name) parent.dirents = nil return nil } @@ -407,56 +410,55 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() - childVFSD := parent.vfsd.Child(name) - var child *dentry + child, ok := parent.children[name] + if ok && child == nil { + return syserror.ENOENT + } // We only need a dentry representing the file at name if it can be a mount - // point. If childVFSD is nil, then it can't be a mount point. If childVFSD - // is non-nil but stale, the actual file can't be a mount point either; we + // point. If child is nil, then it can't be a mount point. If child is + // non-nil but stale, the actual file can't be a mount point either; we // detect this case by just speculatively calling PrepareDeleteDentry and // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). - if childVFSD != nil { - child = childVFSD.Impl().(*dentry) - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { - child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, childVFSD, &ds) + if child != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + if fs.opts.interop != InteropModeShared { + return err + } + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) if err != nil { return err } if child != nil { - childVFSD = &child.vfsd - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - } else { - childVFSD = nil } } - } else if _, ok := parent.negativeChildren[name]; ok { - return syserror.ENOENT } flags := uint32(0) if dir { if child != nil && !child.isDir() { - vfsObj.AbortDeleteDentry(childVFSD) + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.ENOTDIR } flags = linux.AT_REMOVEDIR } else { if child != nil && child.isDir() { - vfsObj.AbortDeleteDentry(childVFSD) + vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR } if rp.MustBeDir() { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) } return syserror.ENOTDIR } } err = parent.file.unlinkAt(ctx, name, flags) if err != nil { - if childVFSD != nil { - vfsObj.AbortDeleteDentry(childVFSD) + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) } return err } @@ -467,10 +469,12 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } parent.cacheNegativeChildLocked(name) parent.dirents = nil + } else { + delete(parent.children, name) } if child != nil { child.setDeleted() - vfsObj.CommitDeleteDentry(childVFSD) + vfsObj.CommitDeleteDentry(&child.vfsd) ds = appendDentry(ds, child) } return nil @@ -806,16 +810,14 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // eligible for caching yet, so we don't need to append to a dentry slice.) child.refs = 1 // Insert the dentry into the tree. - d.IncRef() // reference held by child on its parent d - d.vfsd.InsertChild(&child.vfsd, name) + d.cacheNewChildLocked(child, name) if d.fs.opts.interop != InteropModeShared { - delete(d.negativeChildren, name) + d.touchCMtime() d.dirents = nil } // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription - mnt.IncRef() if useRegularFileFD { fd := ®ularFileFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ @@ -840,9 +842,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - if d.fs.opts.interop != InteropModeShared { - d.touchCMtime() - } return childVFSFD, nil } @@ -902,7 +901,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() - renamed, err := fs.revalidateChildLocked(ctx, vfsObj, oldParent, oldName, oldParent.vfsd.Child(oldName), &ds) + renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds) if err != nil { return err } @@ -910,7 +909,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } if renamed.isDir() { - if renamed == newParent || renamed.vfsd.IsAncestorOf(&newParent.vfsd) { + if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { @@ -934,16 +933,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry + replaced := newParent.children[newName] // This is similar to unlinkAt, except: // - // - We revalidate the replaced dentry unconditionally for simplicity. + // - If a dentry exists for the file to be replaced, we revalidate it + // unconditionally (instead of only if PrepareRenameDentry fails) for + // simplicity. // // - If rp.MustBeDir(), then we need a dentry representing the replaced // file regardless to confirm that it's a directory. - if replacedVFSD != nil || rp.MustBeDir() { - replaced, err = fs.revalidateChildLocked(ctx, vfsObj, newParent, newName, replacedVFSD, &ds) + if replaced != nil || rp.MustBeDir() { + replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) if err != nil { return err } @@ -957,11 +957,12 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOTDIR } } - replacedVFSD = &replaced.vfsd - } else { - replacedVFSD = nil } } + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } if oldParent == newParent && oldName == newName { return nil @@ -978,7 +979,6 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if fs.opts.interop != InteropModeShared { oldParent.cacheNegativeChildLocked(oldName) oldParent.dirents = nil - delete(newParent.negativeChildren, newName) newParent.dirents = nil if renamed.isDir() { oldParent.decLinks() @@ -987,8 +987,24 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa oldParent.touchCMtime() newParent.touchCMtime() renamed.touchCtime() + } else { + delete(oldParent.children, oldName) + } + if oldParent != newParent { + appendDentry(ds, oldParent) + newParent.IncRef() + } + renamed.parent = newParent + renamed.name = newName + if newParent.children == nil { + newParent.children = make(map[string]*dentry) + } + newParent.children[newName] = renamed + if replaced != nil { + replaced.setDeleted() + appendDentry(ds, replaced) } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, &newParent.vfsd, newName, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) return nil } @@ -1131,5 +1147,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 2485cdb53..293df2545 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -452,6 +452,16 @@ type dentry struct { // fs is the owning filesystem. fs is immutable. fs *filesystem + // parent is this dentry's parent directory. Each dentry holds a reference + // on its parent. If this dentry is a filesystem root, parent is nil. + // parent is protected by filesystem.renameMu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.renameMu. + name string + // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. @@ -469,10 +479,15 @@ type dentry struct { dirMu sync.Mutex - // If this dentry represents a directory, and InteropModeShared is not in - // effect, negativeChildren is a set of child names in this directory that - // are known not to exist. negativeChildren is protected by dirMu. - negativeChildren map[string]struct{} + // If this dentry represents a directory, children contains: + // + // - Mappings of child filenames to dentries representing those children. + // + // - Mappings of child filenames that are known not to exist to nil + // dentries (only if InteropModeShared is not in effect). + // + // children is protected by dirMu. + children map[string]*dentry // If this dentry represents a directory, InteropModeShared is not in // effect, and dirents is not nil, it is a cache of all entries in the @@ -910,9 +925,9 @@ func (d *dentry) checkCachingLocked() { // Dentry has already been destroyed. return } - // Non-child dentries with zero references are no longer reachable by path - // resolution and should be dropped immediately. - if d.vfsd.Parent() == nil || d.vfsd.IsDisowned() { + // Deleted and invalidated dentries with zero references are no longer + // reachable by path resolution and should be dropped immediately. + if d.vfsd.IsDead() { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -937,28 +952,26 @@ func (d *dentry) checkCachingLocked() { d.fs.cachedDentries.Remove(victim) d.fs.cachedDentriesLen-- victim.cached = false - // victim.refs may have become non-zero from an earlier path - // resolution since it was inserted into fs.cachedDentries; see - // dentry.incRefLocked(). Either way, we brought - // fs.cachedDentriesLen back down to fs.opts.maxCachedDentries, so - // we don't loop. + // victim.refs may have become non-zero from an earlier path resolution + // since it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) == 0 { - if victimParentVFSD := victim.vfsd.Parent(); victimParentVFSD != nil { - victimParent := victimParentVFSD.Impl().(*dentry) - victimParent.dirMu.Lock() - if !victim.vfsd.IsDisowned() { - // victim can't be a mount point (in any mount - // namespace), since VFS holds references on mount - // points. - d.fs.vfsfs.VirtualFilesystem().ForceDeleteDentry(&victim.vfsd) + if victim.parent != nil { + victim.parent.dirMu.Lock() + if !victim.vfsd.IsDead() { + // Note that victim can't be a mount point (in any mount + // namespace), since VFS holds references on mount points. + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update // victimParent.dirents etc. } - victimParent.dirMu.Unlock() + victim.parent.dirMu.Unlock() } victim.destroyLocked() } + // Whether or not victim was destroyed, we brought fs.cachedDentriesLen + // back down to fs.opts.maxCachedDentries, so we don't loop. } } @@ -1005,12 +1018,11 @@ func (d *dentry) destroyLocked() { d.fs.syncMu.Lock() delete(d.fs.dentries, d) d.fs.syncMu.Unlock() - // Drop the reference held by d on its parent. - if parentVFSD := d.vfsd.Parent(); parentVFSD != nil { - parent := parentVFSD.Impl().(*dentry) - // This is parent.DecRef() without recursive locking of d.fs.renameMu. - if refs := atomic.AddInt64(&parent.refs, -1); refs == 0 { - parent.checkCachingLocked() + // Drop the reference held by d on its parent without recursively locking + // d.fs.renameMu. + if d.parent != nil { + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkCachingLocked() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 82bc239db..4041fb252 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -48,8 +48,7 @@ func TestDestroyIdempotent(t *testing.T) { if err != nil { t.Fatalf("fs.newDentry(): %v", err) } - parent.IncRef() // reference held by child on its parent. - parent.vfsd.InsertChild(&child.vfsd, "child") + parent.cacheNewChildLocked(child, "child") child.checkCachingLocked() if got := atomic.LoadInt64(&child.refs); got != -1 { diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index b3d6299d0..ef34cb28a 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -3,6 +3,17 @@ load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "kernfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "Dentry", + }, +) + go_template_instance( name = "slot_list", out = "slot_list.go", @@ -21,6 +32,7 @@ go_library( "dynamic_bytes_file.go", "fd_impl_util.go", "filesystem.go", + "fstree.go", "inode_impl_util.go", "kernfs.go", "slot_list.go", diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index bfa786c88..e8a4670b8 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -129,7 +129,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*Dentry).inode + parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode stat, err := parentInode.Stat(vfsFS, opts) if err != nil { return err diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index baf81b4db..01c23d192 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -56,25 +56,28 @@ afterSymlink: return vfsd, nil } if name == ".." { - nextVFSD, err := rp.ResolveParent(vfsd) - if err != nil { + if isRoot, err := rp.CheckRoot(vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return vfsd, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } rp.Advance() - return nextVFSD, nil + return &d.parent.vfsd, nil } if len(name) > linux.NAME_MAX { return nil, syserror.ENAMETOOLONG } d.dirMu.Lock() - nextVFSD, err := rp.ResolveChild(vfsd, name) + next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name]) + d.dirMu.Unlock() if err != nil { - d.dirMu.Unlock() return nil, err } - next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD) - d.dirMu.Unlock() - if err != nil { + if err := rp.CheckMount(&next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -108,17 +111,17 @@ afterSymlink: // parent.dirMu must be locked. parent.isDir(). name is not "." or "..". // // Postconditions: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) { - if childVFSD != nil { +func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) { + if child != nil { // Cached dentry exists, revalidate. - child := childVFSD.Impl().(*Dentry) if !child.inode.Valid(ctx) { - vfsObj.ForceDeleteDentry(childVFSD) - fs.deferDecRef(childVFSD) // Reference from Lookup. - childVFSD = nil + delete(parent.children, name) + vfsObj.InvalidateDentry(&child.vfsd) + fs.deferDecRef(&child.vfsd) // Reference from Lookup. + child = nil } } - if childVFSD == nil { + if child == nil { // Dentry isn't cached; it either doesn't exist or failed // revalidation. Attempt to resolve it via Lookup. // @@ -126,15 +129,15 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes // that all dentries in the filesystem are (kernfs.)Dentry and performs // vfs.DentryImpl casts accordingly. - var err error - childVFSD, err = parent.inode.Lookup(ctx, name) + childVFSD, err := parent.inode.Lookup(ctx, name) if err != nil { return nil, err } // Reference on childVFSD dropped by a corresponding Valid. - parent.insertChildLocked(name, childVFSD) + child = childVFSD.Impl().(*Dentry) + parent.insertChildLocked(name, child) } - return childVFSD.Impl().(*Dentry), nil + return child, nil } // walkExistingLocked resolves rp to an existing file. @@ -203,14 +206,11 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v if len(pc) > linux.NAME_MAX { return "", syserror.ENAMETOOLONG } - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return "", err - } - if childVFSD != nil { + // FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu. + if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok { return "", syserror.EEXIST } - if parentVFSD.IsDisowned() { + if parentVFSD.IsDead() { return "", syserror.ENOENT } return pc, nil @@ -220,14 +220,14 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v // // Preconditions: Filesystem.mu must be locked for at least reading. func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error { - parentVFSD := vfsd.Parent() - if parentVFSD == nil { + parent := vfsd.Impl().(*Dentry).parent + if parent == nil { return syserror.EBUSY } - if parentVFSD.IsDisowned() { + if parent.vfsd.IsDead() { return syserror.ENOENT } - if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } return nil @@ -321,11 +321,11 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EPERM } - child, err := parentInode.NewLink(ctx, pc, d.inode) + childVFSD, err := parentInode.NewLink(ctx, pc, d.inode) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -349,11 +349,11 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewDir(ctx, pc, opts) + childVFSD, err := parentInode.NewDir(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -377,11 +377,11 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } defer rp.Mount().EndWrite() - new, err := parentInode.NewNode(ctx, pc, opts) + newVFSD, err := parentInode.NewNode(ctx, pc, opts) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, new) + parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry)) return nil } @@ -449,11 +449,8 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := rp.ResolveChild(parentVFSD, pc) - if err != nil { - return nil, err - } - if childVFSD == nil { + childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD) + if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -463,21 +460,24 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child, err := parentInode.NewFile(ctx, pc, opts) + childVFSD, err = parentInode.NewFile(ctx, pc, opts) if err != nil { return nil, err } + child := childVFSD.Impl().(*Dentry) parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.Impl().(*Dentry).inode.Open(rp, child, opts) + return child.inode.Open(rp, childVFSD, opts) + } + if err != nil { + return nil, err } // Open existing file or follow symlink. if mustCreate { return nil, syserror.EEXIST } - childDentry := childVFSD.Impl().(*Dentry) - childInode := childDentry.inode - if rp.ShouldFollowSymlink() && childDentry.isSymlink() { - targetVD, targetPathname, err := childInode.Getlink(ctx) + child := childVFSD.Impl().(*Dentry) + if rp.ShouldFollowSymlink() && child.isSymlink() { + targetVD, targetPathname, err := child.inode.Getlink(ctx) if err != nil { return nil, err } @@ -496,10 +496,10 @@ afterTrailingSymlink: // symlink target. goto afterTrailingSymlink } - if err := childInode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { + if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return childInode.Open(rp, childVFSD, opts) + return child.inode.Open(rp, &child.vfsd, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. @@ -526,15 +526,16 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.mu.Lock() + defer fs.processDeferredDecRefsLocked() + defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() if err != nil { return err } + dstDir := dstDirVFSD.Impl().(*Dentry) mnt := rp.Mount() if mnt != oldParentVD.Mount() { return syserror.EXDEV @@ -547,9 +548,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa srcDirVFSD := oldParentVD.Dentry() srcDir := srcDirVFSD.Impl().(*Dentry) srcDir.dirMu.Lock() - src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName)) + src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName]) srcDir.dirMu.Unlock() - fs.processDeferredDecRefsLocked() if err != nil { return err } @@ -561,7 +561,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Can we create the dst dentry? - var dstVFSD *vfs.Dentry + var dst *Dentry pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode) switch err { case nil: @@ -571,38 +571,51 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Won't overwrite existing node since RENAME_NOREPLACE was requested. return syserror.EEXIST } - dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc) - if err != nil { + dst = dstDir.children[pc] + if dst == nil { panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD)) } default: return err } + var dstVFSD *vfs.Dentry + if dst != nil { + dstVFSD = &dst.vfsd + } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() virtfs := rp.VirtualFilesystem() - srcDirDentry := srcDirVFSD.Impl().(*Dentry) - dstDirDentry := dstDirVFSD.Impl().(*Dentry) - // We can't deadlock here due to lock ordering because we're protected from // concurrent renames by fs.mu held for writing. - srcDirDentry.dirMu.Lock() - defer srcDirDentry.dirMu.Unlock() - dstDirDentry.dirMu.Lock() - defer dstDirDentry.dirMu.Unlock() + srcDir.dirMu.Lock() + defer srcDir.dirMu.Unlock() + if srcDir != dstDir { + dstDir.dirMu.Lock() + defer dstDir.dirMu.Unlock() + } if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } - srcDirInode := srcDirDentry.inode - replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD) + replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } - virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced) + delete(srcDir.children, src.name) + if srcDir != dstDir { + fs.deferDecRef(srcDirVFSD) + dstDir.IncRef() + } + src.parent = dstDir + src.name = pc + if dstDir.children == nil { + dstDir.children = make(map[string]*Dentry) + } + dstDir.children[pc] = src + virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) return nil } @@ -622,14 +635,15 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if !vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if !d.isDir() { return syserror.ENOTDIR } if inode.HasChildren() { return syserror.ENOTEMPTY } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() @@ -706,11 +720,11 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return err } defer rp.Mount().EndWrite() - child, err := parentInode.NewSymlink(ctx, pc, target) + childVFSD, err := parentInode.NewSymlink(ctx, pc, target) if err != nil { return err } - parentVFSD.Impl().(*Dentry).InsertChild(pc, child) + parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry)) return nil } @@ -730,11 +744,12 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := checkDeleteLocked(ctx, rp, vfsd); err != nil { return err } - if vfsd.Impl().(*Dentry).isDir() { + d := vfsd.Impl().(*Dentry) + if d.isDir() { return syserror.EISDIR } virtfs := rp.VirtualFilesystem() - parentDentry := vfsd.Parent().Impl().(*Dentry) + parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) @@ -818,5 +833,5 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 65f09af5d..9f526359e 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -370,7 +370,7 @@ func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint3 if err := o.Insert(name, child.VFSDentry()); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d)) } - d.InsertChild(name, child.VFSDentry()) + d.InsertChild(name, child) } return links } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index ad76b9f64..f5041824f 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -168,17 +168,22 @@ const ( // // Must be initialized by Init prior to first use. type Dentry struct { - refs.AtomicRefCount + vfsd vfs.Dentry - vfsd vfs.Dentry - inode Inode + refs.AtomicRefCount // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 - // dirMu protects vfsd.children for directory dentries. - dirMu sync.Mutex + parent *Dentry + name string + + // dirMu protects children and the names of child Dentries. + dirMu sync.Mutex + children map[string]*Dentry + + inode Inode } // Init initializes this dentry. @@ -222,8 +227,8 @@ func (d *Dentry) DecRef() { func (d *Dentry) destroy() { d.inode.DecRef() // IncRef from Init. d.inode = nil - if parent := d.vfsd.Parent(); parent != nil { - parent.DecRef() // IncRef from Dentry.InsertChild. + if d.parent != nil { + d.parent.DecRef() // IncRef from Dentry.InsertChild. } } @@ -233,7 +238,7 @@ func (d *Dentry) destroy() { // updates the link count on d if required. // // Precondition: d must represent a directory inode. -func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { +func (d *Dentry) InsertChild(name string, child *Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() @@ -243,13 +248,17 @@ func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { // preconditions. // // Precondition: d.dirMu must be locked. -func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { +func (d *Dentry) insertChildLocked(name string, child *Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } - vfsDentry := d.VFSDentry() - vfsDentry.IncRef() // DecRef in child's Dentry.destroy. - vfsDentry.InsertChild(child, name) + d.IncRef() // DecRef in child's Dentry.destroy. + child.parent = d + child.name = name + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child } // The Inode interface maps filesystem-level operations that operate on paths to diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index d0f97c137..19abb5034 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -415,36 +415,36 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F if d.Name == "." || d.Name == ".." { continue } - childPath := path.Join(fd.MappedName(ctx), d.Name) + absPath := path.Join(fd.MappedName(ctx), d.Name) if d.Type == linux.DT_LNK { link, err := s.VFS.ReadlinkAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, ) if err != nil { - t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err) } else { - t.Logf("Skipping symlink: /proc%s => %s", childPath, link) + t.Logf("Skipping symlink: %s => %s", absPath, link) } continue } - t.Logf("Opening: /proc%s", childPath) + t.Logf("Opening: %s", absPath) child, err := s.VFS.OpenAt( ctx, auth.CredentialsFromContext(ctx), - &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(childPath)}, + &vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)}, &vfs.OpenOptions{}, ) if err != nil { - t.Errorf("vfsfs.OpenAt(%v) failed: %v", childPath, err) + t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } defer child.DecRef() stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { - t.Errorf("Stat(%v) failed: %v", childPath, err) + t.Errorf("Stat(%v) failed: %v", absPath, err) } if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type { t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type) diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 4e6cd3491..a2d9649e7 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -15,6 +15,17 @@ go_template_instance( }, ) +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "tmpfs", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -22,6 +33,7 @@ go_library( "device_file.go", "directory.go", "filesystem.go", + "fstree.go", "named_pipe.go", "regular_file.go", "socket_file.go", diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 651912169..2fb5c4d84 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -438,13 +438,6 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { filePathBuilder.WriteByte('/') } - // Verify that we didn't create any directories under the mount - // point (i.e. they were all created on the submount). - firstDirName := fmt.Sprintf("%d", depth) - if child := mountPoint.Dentry().Child(firstDirName); child != nil { - b.Fatalf("created directory %q under root mount, not submount", firstDirName) - } - // Create the file that will be stat'd. fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ Root: root, diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 45712c9b9..f2399981b 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -15,35 +15,77 @@ package tmpfs import ( + "sync/atomic" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) type directory struct { - inode inode + // Since directories can't be hard-linked, each directory can only be + // associated with a single dentry, which we can store in the directory + // struct. + dentry dentry + inode inode + + // childMap maps the names of the directory's children to their dentries. + // childMap is protected by filesystem.mu. + childMap map[string]*dentry - // childList is a list containing (1) child Dentries and (2) fake Dentries + // numChildren is len(childMap), but accessed using atomic memory + // operations to avoid locking in inode.statTo(). + numChildren int64 + + // childList is a list containing (1) child dentries and (2) fake dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() - // efficiently. childList is protected by filesystem.mu. + // efficiently. childList is protected by iterMu. + iterMu sync.Mutex childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory { dir := &directory{} dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root - return &dir.inode + dir.dentry.inode = &dir.inode + dir.dentry.vfsd.Init(&dir.dentry) + return dir +} + +// Preconditions: filesystem.mu must be locked for writing. dir must not +// already contain a child with the given name. +func (dir *directory) insertChildLocked(child *dentry, name string) { + child.parent = &dir.dentry + child.name = name + if dir.childMap == nil { + dir.childMap = make(map[string]*dentry) + } + dir.childMap[name] = child + atomic.AddInt64(&dir.numChildren, 1) + dir.iterMu.Lock() + dir.childList.PushBack(child) + dir.iterMu.Unlock() +} + +// Preconditions: filesystem.mu must be locked for writing. +func (dir *directory) removeChildLocked(child *dentry) { + delete(dir.childMap, child.name) + atomic.AddInt64(&dir.numChildren, -1) + dir.iterMu.Lock() + dir.childList.Remove(child) + dir.iterMu.Unlock() } type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl - // Protected by filesystem.mu. + // Protected by directory.iterMu. iter *dentry off int64 } @@ -51,11 +93,10 @@ type directoryFD struct { // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release() { if fd.iter != nil { - fs := fd.filesystem() dir := fd.inode().impl.(*directory) - fs.mu.Lock() + dir.iterMu.Lock() dir.childList.Remove(fd.iter) - fs.mu.Unlock() + dir.iterMu.Unlock() fd.iter = nil } } @@ -63,10 +104,13 @@ func (fd *directoryFD) Release() { // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() - vfsd := fd.vfsfd.VirtualDentry().Dentry() + dir := fd.inode().impl.(*directory) - fs.mu.Lock() - defer fs.mu.Unlock() + // fs.mu is required to read d.parent and dentry.name. + fs.mu.RLock() + defer fs.mu.RUnlock() + dir.iterMu.Lock() + defer dir.iterMu.Unlock() fd.inode().touchAtime(fd.vfsfd.Mount()) @@ -74,15 +118,16 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, - Ino: vfsd.Impl().(*dentry).inode.ino, + Ino: dir.inode.ino, NextOff: 1, }); err != nil { return err } fd.off++ } + if fd.off == 1 { - parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode + parentInode := genericParentOrSelf(&dir.dentry).inode if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), @@ -94,7 +139,6 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.off++ } - dir := vfsd.Impl().(*dentry).inode.impl.(*directory) var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. @@ -109,7 +153,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Skip other directoryFD iterators. if child.inode != nil { if err := cb.Handle(vfs.Dirent{ - Name: child.vfsd.Name(), + Name: child.name, Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, @@ -127,9 +171,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem() - fs.mu.Lock() - defer fs.mu.Unlock() + dir := fd.inode().impl.(*directory) + dir.iterMu.Lock() + defer dir.iterMu.Unlock() switch whence { case linux.SEEK_SET: @@ -157,8 +201,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in remChildren = offset - 2 } - dir := fd.inode().impl.(*directory) - // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { fd.iter = &dentry{} diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 452c4e2e0..5b62f9ebb 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -39,27 +39,43 @@ func (fs *filesystem) Sync(ctx context.Context) error { // // Preconditions: filesystem.mu must be locked. !rp.Done(). func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } afterSymlink: - if len(rp.Component()) > linux.NAME_MAX { - return nil, syserror.ENAMETOOLONG + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil } - nextVFSD, err := rp.ResolveComponent(&d.vfsd) - if err != nil { - return nil, err + if name == ".." { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil } - if nextVFSD == nil { - // Since the Dentry tree is the sole source of truth for tmpfs, if it's - // not in the Dentry tree, it doesn't exist. + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } + child, ok := dir.childMap[name] + if !ok { return nil, syserror.ENOENT } - next := nextVFSD.Impl().(*dentry) - if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // TODO(gvisor.dev/issue/1197): Symlink traversals updates // access time. if err := rp.HandleSymlink(symlink.target); err != nil { @@ -68,7 +84,7 @@ afterSymlink: goto afterSymlink // don't check the current directory again } rp.Advance() - return next, nil + return child, nil } // walkParentDirLocked resolves all but the last path component of rp to an @@ -80,7 +96,7 @@ afterSymlink: // fs/namei.c:path_parentat(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(rp, d) if err != nil { @@ -88,10 +104,11 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { } d = next } - if !d.inode.isDir() { + dir, ok := d.inode.impl.(*directory) + if !ok { return nil, syserror.ENOTDIR } - return d, nil + return dir, nil } // resolveLocked resolves rp to an existing file. @@ -122,14 +139,14 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -139,19 +156,15 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if len(name) > linux.NAME_MAX { return syserror.ENAMETOOLONG } - // Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the child exists we want to return EEXIST immediately instead - // of attempting symlink/mount traversal. - if parent.vfsd.Child(name) != nil { + if _, ok := parentDir.childMap[name]; ok { return syserror.EEXIST } if !dir && rp.MustBeDir() { return syserror.ENOENT } - // In tmpfs, the only way to cause a dentry to be disowned is by removing - // it from the filesystem, so this check is equivalent to checking if - // parent has been removed. - if parent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only + // be dead if it was deleted. + if parentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } mnt := rp.Mount() @@ -159,10 +172,10 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa return err } defer mnt.EndWrite() - if err := create(parent, name); err != nil { + if err := create(parentDir, name); err != nil { return err } - parent.inode.touchCMtime() + parentDir.inode.touchCMtime() return nil } @@ -201,17 +214,17 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } - d.IncRef() - return &d.vfsd, nil + dir.dentry.IncRef() + return &dir.dentry.vfsd, nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -226,30 +239,27 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() - child := fs.newDentry(d.inode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error { - if parent.inode.nlink == maxLinks { + return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + if parentDir.inode.nlink == maxLinks { return syserror.EMLINK } - parent.inode.incLinksLocked() // from child's ".." - child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.inode.incLinksLocked() // from child's ".." + childDir := fs.newDirectory(rp.Credentials(), opts.Mode) + parentDir.insertChildLocked(&childDir.dentry, name) return nil }) } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { var childInode *inode switch opts.Mode.FileType() { case 0, linux.S_IFREG: @@ -266,8 +276,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EINVAL } child := fs.newDentry(childInode) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -306,12 +315,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parent, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(rp, start) if err != nil { return nil, err } // Check for search permission in the parent directory. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. @@ -322,11 +331,14 @@ afterTrailingSymlink: if name == "." || name == ".." { return nil, syserror.EISDIR } + if len(name) > linux.NAME_MAX { + return nil, syserror.ENAMETOOLONG + } // Determine whether or not we need to create a file. - child, err := stepLocked(rp, parent) - if err == syserror.ENOENT { + child, ok := parentDir.childMap[name] + if !ok { // Already checked for searchability above; now check for writability. - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { @@ -335,21 +347,26 @@ afterTrailingSymlink: defer rp.Mount().EndWrite() // Create and open the child. child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) fd, err := child.open(ctx, rp, &opts, true) if err != nil { return nil, err } - parent.inode.touchCMtime() + parentDir.inode.touchCMtime() return fd, nil } - if err != nil { + // Is the file mounted over? + if err := rp.CheckMount(&child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? - if !rp.Done() { - start = parent + if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO(gvisor.dev/issue/1197): Symlink traversals updates + // access time. + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + start = &parentDir.dentry goto afterTrailingSymlink } // Open existing file. @@ -428,7 +445,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -445,23 +462,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } defer mnt.EndWrite() - oldParent := oldParentVD.Dentry().Impl().(*dentry) - if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) + if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - // Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(), - // because if the existing child is a symlink or mount point then we want - // to rename over it rather than follow it. - renamedVFSD := oldParent.vfsd.Child(oldName) - if renamedVFSD == nil { + renamed, ok := oldParentDir.childMap[oldName] + if !ok { return syserror.ENOENT } - renamed := renamedVFSD.Impl().(*dentry) + // Note that we don't need to call rp.CheckMount(), since if renamed is a + // mount point then we want to rename the mount point, not anything in the + // mounted filesystem. if renamed.inode.isDir() { - if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) { + if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { return syserror.EINVAL } - if oldParent != newParent { + if oldParentDir != newParentDir { // Writability is needed to change renamed's "..". if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err @@ -473,18 +489,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } - if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } - replacedVFSD := newParent.vfsd.Child(newName) - var replaced *dentry - if replacedVFSD != nil { - replaced = replacedVFSD.Impl().(*dentry) - if replaced.inode.isDir() { + replaced, ok := newParentDir.childMap[newName] + if ok { + replacedDir, ok := replaced.inode.impl.(*directory) + if ok { if !renamed.inode.isDir() { return syserror.EISDIR } - if replaced.vfsd.HasChildren() { + if len(replacedDir.childMap) != 0 { return syserror.ENOTEMPTY } } else { @@ -496,11 +511,13 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } } } else { - if renamed.inode.isDir() && newParent.inode.nlink == maxLinks { + if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks { return syserror.EMLINK } } - if newParent.vfsd.IsDisowned() { + // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can + // only be dead if it was deleted. + if newParentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } @@ -508,36 +525,38 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // simplicity, under the assumption that applications are not intentionally // doing noop renames expecting them to succeed where non-noop renames // would fail. - if renamedVFSD == replacedVFSD { + if renamed == replaced { return nil } vfsObj := rp.VirtualFilesystem() - oldParentDir := oldParent.inode.impl.(*directory) - newParentDir := newParent.inode.impl.(*directory) mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareRenameDentry(mntns, renamedVFSD, replacedVFSD); err != nil { + var replacedVFSD *vfs.Dentry + if replaced != nil { + replacedVFSD = &replaced.vfsd + } + if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } if replaced != nil { - newParentDir.childList.Remove(replaced) + newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParent.inode.decLinksLocked() // from replaced's ".." + newParentDir.inode.decLinksLocked() // from replaced's ".." } replaced.inode.decLinksLocked() } - oldParentDir.childList.Remove(renamed) - newParentDir.childList.PushBack(renamed) - if renamed.inode.isDir() { - oldParent.inode.decLinksLocked() - newParent.inode.incLinksLocked() + oldParentDir.removeChildLocked(renamed) + newParentDir.insertChildLocked(renamed, newName) + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + oldParentDir.inode.touchCMtime() + if oldParentDir != newParentDir { + if renamed.inode.isDir() { + oldParentDir.inode.decLinksLocked() + newParentDir.inode.incLinksLocked() + } + newParentDir.inode.touchCMtime() } - oldParent.inode.touchCMtime() - newParent.inode.touchCMtime() renamed.inode.touchCtime() - // TODO(gvisor.dev/issue/1197): Update timestamps and parent directory - // sizes. - vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD) return nil } @@ -545,11 +564,11 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() @@ -559,15 +578,15 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if name == ".." { return syserror.ENOTEMPTY } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) - if !child.inode.isDir() { + childDir, ok := child.inode.impl.(*directory) + if !ok { return syserror.ENOTDIR } - if childVFSD.HasChildren() { + if len(childDir.childMap) != 0 { return syserror.ENOTEMPTY } mnt := rp.Mount() @@ -578,14 +597,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) - parent.inode.decLinksLocked() // from child's ".." + parentDir.removeChildLocked(child) + parentDir.inode.decLinksLocked() // from child's ".." child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - parent.inode.touchCMtime() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } @@ -627,10 +646,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error { + return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) - parent.vfsd.InsertChild(&child.vfsd, name) - parent.inode.impl.(*directory).childList.PushBack(child) + parentDir.insertChildLocked(child, name) return nil }) } @@ -639,22 +657,21 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) if err != nil { return err } - if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return syserror.EISDIR } - childVFSD := parent.vfsd.Child(name) - if childVFSD == nil { + child, ok := parentDir.childMap[name] + if !ok { return syserror.ENOENT } - child := childVFSD.Impl().(*dentry) if child.inode.isDir() { return syserror.EISDIR } @@ -669,13 +686,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef() - if err := vfsObj.PrepareDeleteDentry(mntns, childVFSD); err != nil { + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } - parent.inode.impl.(*directory).childList.Remove(child) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(childVFSD) - parent.inode.touchCMtime() + vfsObj.CommitDeleteDentry(&child.vfsd) + parentDir.inode.touchCMtime() return nil } @@ -743,5 +760,5 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return vfs.GenericPrependPath(vfsroot, vd, b) + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index d4f59ee5b..60c2c980e 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -71,9 +71,15 @@ func TestStatAfterCreate(t *testing.T) { t.Errorf("got btime %d, want 0", got.Btime.ToNsec()) } - // Size should be 0. - if got.Size != 0 { - t.Errorf("got size %d, want 0", got.Size) + // Size should be 0 (except for directories, which make up a size + // of 20 per entry, including the "." and ".." entries present in + // otherwise-empty directories). + wantSize := uint64(0) + if typ == "dir" { + wantSize = 40 + } + if got.Size != wantSize { + t.Errorf("got size %d, want %d", got.Size, wantSize) } // Nlink should be 1 for files, 2 for dirs. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 82c709b43..efc931468 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -12,16 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Package tmpfs provides a filesystem implementation that behaves like tmpfs: -// the Dentry tree is the sole source of truth for the state of the filesystem. +// Package tmpfs provides an in-memory filesystem whose contents are +// application-mutable, consistent with Linux's tmpfs. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu +// *** "memmap.Mappable locks" below this point // regularFile.mapsMu +// *** "memmap.Mappable locks taken by Translate" below this point // regularFile.dataMu +// directory.iterMu package tmpfs import ( @@ -41,6 +44,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Name is the default filesystem name. @@ -112,18 +116,18 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, newFSType, &fs) - var root *inode + var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newRegularFile(creds, 0777) + root = fs.newDentry(fs.newRegularFile(creds, 0777)) case linux.S_IFLNK: - root = fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget) + root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = fs.newDirectory(creds, 01777) + root = &fs.newDirectory(creds, 01777).dentry default: return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } - return &fs.vfsfs, &fs.newDentry(root).vfsd, nil + return &fs.vfsfs, &root.vfsd, nil } // Release implements vfs.FilesystemImpl.Release. @@ -134,20 +138,29 @@ func (fs *filesystem) Release() { type dentry struct { vfsd vfs.Dentry + // parent is this dentry's parent directory. Each referenced dentry holds a + // reference on parent.dentry. If this dentry is a filesystem root, parent + // is nil. parent is protected by filesystem.mu. + parent *dentry + + // name is the name of this dentry in its parent. If this dentry is a + // filesystem root, name is the empty string. name is protected by + // filesystem.mu. + name string + + // dentryEntry (ugh) links dentries into their parent directory.childList. + dentryEntry + // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. - inode *inode - + // // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. - // (tmpfs doesn't really have resources to release, but we implement - // reference counting because tmpfs regular files will.) - - // dentryEntry (ugh) links dentries into their parent directory.childList. - dentryEntry + // dentry therefore forwards reference counting directly to inode. + inode *inode } func (fs *filesystem) newDentry(inode *inode) *dentry { @@ -207,10 +220,6 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - // Only meaningful for device special files. - rdevMajor uint32 - rdevMinor uint32 - // Advisory file locks, which lock at the inode level. locks lock.FileLocks @@ -230,7 +239,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. - now := i.clock.Now().Nanoseconds() + now := fs.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now @@ -283,14 +292,10 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { if regFile, ok := i.impl.(*regularFile); ok { - // Hold inode.mu and regFile.dataMu while mutating - // size. - i.mu.Lock() - regFile.dataMu.Lock() + // Release memory used by regFile to store data. Since regFile is + // no longer usable, we don't need to grab any locks or update any + // metadata. regFile.data.DropAll(regFile.memFile) - atomic.StoreUint64(®File.size, 0) - regFile.dataMu.Unlock() - i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") @@ -310,15 +315,15 @@ func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) e // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | - linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | - linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME - stat.Blksize = 1 // usermem.PageSize in tmpfs + linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | + linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | + linux.STATX_MTIME + stat.Blksize = usermem.PageSize stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino - // Linux's tmpfs has no concept of btime, so zero-value is returned. stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) @@ -327,19 +332,22 @@ func (i *inode) statTo(stat *linux.Statx) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) - // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in - // a uint64 accessed using atomic memory operations to avoid taking - // locks). + // TODO(jamieliu): This should be impl.data.Span() / 512, but this is + // too expensive to compute here. Cache it in regularFile. stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. + stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren))) + // stat.Blocks is 0. case *symlink: - stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) - stat.Blocks = allocatedBlocksForSize(stat.Size) + // stat.Blocks is 0. + case *namedPipe, *socketFile: + // stat.Size and stat.Blocks are 0. case *deviceFile: + // stat.Size and stat.Blocks are 0. stat.RdevMajor = impl.major stat.RdevMinor = impl.minor - case *socketFile, *directory, *namedPipe: - // Nothing to do. default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 35b208721..8624dbd5d 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -15,34 +15,17 @@ package vfs import ( - "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) -// Dentry represents a node in a Filesystem tree which may represent a file. +// Dentry represents a node in a Filesystem tree at which a file exists. // // Dentries are reference-counted. Unless otherwise specified, all Dentry // methods require that a reference is held. // -// A Dentry transitions through up to 3 different states through its lifetime: -// -// - Dentries are initially "independent". Independent Dentries have no parent, -// and consequently no name. -// -// - Dentry.InsertChild() causes an independent Dentry to become a "child" of -// another Dentry. A child node has a parent node, and a name in that parent, -// both of which are mutable by DentryMoveChild(). Each child Dentry's name is -// unique within its parent. -// -// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A -// disowned Dentry can still refer to its former parent and its former name in -// said parent, but the disowned Dentry is no longer reachable from its parent, -// and a new Dentry with the same name may become a child of the parent. (This -// is analogous to a struct dentry being "unhashed" in Linux.) -// // Dentry is loosely analogous to Linux's struct dentry, but: // // - VFS does not associate Dentries with inodes. gVisor interacts primarily @@ -57,9 +40,6 @@ import ( // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do // support inodes may store appropriate state in implementations of DentryImpl. // -// - VFS does not provide synchronization for mutable Dentry fields, other than -// mount-related ones. -// // - VFS does not require that Dentries are instantiated for all paths accessed // through VFS, only those that are tracked beyond the scope of a single // Filesystem operation. This includes file descriptions, mount points, mount @@ -67,6 +47,10 @@ import ( // of Dentries for operations on mutable remote filesystems that can't actually // cache any state in the Dentry. // +// - VFS does not track filesystem structure (i.e. relationships between +// Dentries), since both the relevant state and synchronization are +// filesystem-specific. +// // - For the reasons above, VFS is not directly responsible for managing Dentry // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard @@ -74,36 +58,23 @@ import ( // // +stateify savable type Dentry struct { - // parent is this Dentry's parent in this Filesystem. If this Dentry is - // independent, parent is nil. - parent *Dentry - - // name is this Dentry's name in parent. - name string + // mu synchronizes deletion/invalidation and mounting over this Dentry. + mu sync.Mutex `state:"nosave"` - flags uint32 + // dead is true if the file represented by this Dentry has been deleted (by + // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by + // InvalidateDentry). dead is protected by mu. + dead bool // mounts is the number of Mounts for which this Dentry is Mount.point. // mounts is accessed using atomic memory operations. mounts uint32 - // children are child Dentries. - children map[string]*Dentry - - // mu synchronizes disowning and mounting over this Dentry. - mu sync.Mutex `state:"nosave"` - // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. impl DentryImpl } -const ( - // dflagsDisownedMask is set in Dentry.flags if the Dentry has been - // disowned. - dflagsDisownedMask = 1 << iota -) - // Init must be called before first use of d. func (d *Dentry) Init(impl DentryImpl) { d.impl = impl @@ -134,20 +105,6 @@ type DentryImpl interface { DecRef() } -// IsDisowned returns true if d is disowned. -func (d *Dentry) IsDisowned() bool { - return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 -} - -// Preconditions: !d.IsDisowned(). -func (d *Dentry) setDisowned() { - atomic.AddUint32(&d.flags, dflagsDisownedMask) -} - -func (d *Dentry) isMounted() bool { - return atomic.LoadUint32(&d.mounts) != 0 -} - // IncRef increments d's reference count. func (d *Dentry) IncRef() { d.impl.IncRef() @@ -164,104 +121,26 @@ func (d *Dentry) DecRef() { d.impl.DecRef() } -// These functions are exported so that filesystem implementations can use -// them. The vfs package, and users of VFS, should not call these functions. -// Unless otherwise specified, these methods require that there are no -// concurrent mutators of d. - -// Name returns d's name in its parent in its owning Filesystem. If d is -// independent, Name returns an empty string. -func (d *Dentry) Name() string { - return d.name -} - -// Parent returns d's parent in its owning Filesystem. It does not take a -// reference on the returned Dentry. If d is independent, Parent returns nil. -func (d *Dentry) Parent() *Dentry { - return d.parent -} - -// ParentOrSelf is equivalent to Parent, but returns d if d is independent. -func (d *Dentry) ParentOrSelf() *Dentry { - if d.parent == nil { - return d - } - return d.parent -} - -// Child returns d's child with the given name in its owning Filesystem. It -// does not take a reference on the returned Dentry. If no such child exists, -// Child returns nil. -func (d *Dentry) Child(name string) *Dentry { - return d.children[name] -} - -// HasChildren returns true if d has any children. -func (d *Dentry) HasChildren() bool { - return len(d.children) != 0 -} - -// Children returns a map containing all of d's children. -func (d *Dentry) Children() map[string]*Dentry { - if !d.HasChildren() { - return nil - } - m := make(map[string]*Dentry) - for name, child := range d.children { - m[name] = child - } - return m +// IsDead returns true if d has been deleted or invalidated by its owning +// filesystem. +func (d *Dentry) IsDead() bool { + d.mu.Lock() + defer d.mu.Unlock() + return d.dead } -// InsertChild makes child a child of d with the given name. -// -// InsertChild is a mutator of d and child. -// -// Preconditions: child must be an independent Dentry. d and child must be from -// the same Filesystem. d must not already have a child with the given name. -func (d *Dentry) InsertChild(child *Dentry, name string) { - if checkInvariants { - if _, ok := d.children[name]; ok { - panic(fmt.Sprintf("parent already contains a child named %q", name)) - } - if child.parent != nil || child.name != "" { - panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) - } - } - if d.children == nil { - d.children = make(map[string]*Dentry) - } - d.children[name] = child - child.parent = d - child.name = name +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 } -// IsAncestorOf returns true if d is an ancestor of d2; that is, d is either -// d2's parent or an ancestor of d2's parent. -func (d *Dentry) IsAncestorOf(d2 *Dentry) bool { - for d2.parent != nil { - if d2.parent == d { - return true - } - d2 = d2.parent - } - return false -} +// The following functions are exported so that filesystem implementations can +// use them. The vfs package, and users of VFS, should not call these +// functions. // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. -// -// Preconditions: d is a child Dentry. func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { - if checkInvariants { - if d.parent == nil { - panic("d is independent") - } - if d.IsDisowned() { - panic("d is already disowned") - } - } vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() @@ -280,42 +159,27 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } -// CommitDeleteDentry must be called after the file represented by d is -// deleted, and causes d to become disowned. -// -// CommitDeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: PrepareDeleteDentry was previously called on d. +// CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion +// succeeds. func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { - if d.parent != nil { - delete(d.parent.children, d.name) - } - d.setDisowned() + d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDisownedMountpoint(d) + vfs.forgetDeadMountpoint(d) } } -// ForceDeleteDentry causes d to become disowned. It should only be used in -// cases where VFS has no ability to stop the deletion (e.g. d represents the -// local state of a file on a remote filesystem on which the file has already -// been deleted). -// -// ForceDeleteDentry is a mutator of d and d.Parent(). -// -// Preconditions: d is a child Dentry. -func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { - if checkInvariants { - if d.parent == nil { - panic("d is independent") - } - if d.IsDisowned() { - panic("d is already disowned") - } - } +// InvalidateDentry is called when d ceases to represent the file it formerly +// did for reasons outside of VFS' control (e.g. d represents the local state +// of a file on a remote filesystem on which the file has already been +// deleted). +func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { d.mu.Lock() - vfs.CommitDeleteDentry(d) + d.dead = true + d.mu.Unlock() + if d.isMounted() { + vfs.forgetDeadMountpoint(d) + } } // PrepareRenameDentry must be called before attempting to rename the file @@ -324,25 +188,9 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) { // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // -// Preconditions: from is a child Dentry. If to is not nil, it must be a child -// Dentry from the same Filesystem. from != to. +// Preconditions: If to is not nil, it must be a child Dentry from the same +// Filesystem. from != to. func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { - if checkInvariants { - if from.parent == nil { - panic("from is independent") - } - if from.IsDisowned() { - panic("from is already disowned") - } - if to != nil { - if to.parent == nil { - panic("to is independent") - } - if to.IsDisowned() { - panic("to is already disowned") - } - } - } vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() @@ -376,24 +224,14 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. // -// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and -// to.Parent(). -// // Preconditions: PrepareRenameDentry was previously called on from and to. -// newParent.Child(newName) == to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { - if newParent.children == nil { - newParent.children = make(map[string]*Dentry) - } - newParent.children[newName] = from - from.parent = newParent - from.name = newName +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { - to.setDisowned() + to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDisownedMountpoint(to) + vfs.forgetDeadMountpoint(to) } } } @@ -401,25 +239,18 @@ func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // -// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and -// to.Parent(). -// // Preconditions: PrepareRenameDentry was previously called on from and to. func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { - from.parent, to.parent = to.parent, from.parent - from.name, to.name = to.name, from.name - from.parent.children[from.name] = from - to.parent.children[to.name] = to from.mu.Unlock() to.mu.Unlock() } -// forgetDisownedMountpoint is called when a mount point is deleted to umount -// all mounts using it in all other mount namespaces. +// forgetDeadMountpoint is called when a mount point is deleted or invalidated +// to umount all mounts using it in all other mount namespaces. // -// forgetDisownedMountpoint is analogous to Linux's +// forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). -func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5976b5ccd..15cc091e2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -127,7 +127,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn mount: mnt, dentry: d, } - fd.vd.IncRef() + mnt.IncRef() + d.IncRef() fd.opts = *opts fd.readable = MayReadFileWithOpenFlags(statusFlags) fd.writable = writable diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a537a29d1..74577bc2f 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -346,7 +346,10 @@ type FilesystemImpl interface { // ENOTEMPTY. // // Preconditions: !rp.Done(). For the final path component in rp, - // !rp.ShouldFollowSymlink(). oldName is not "." or "..". + // !rp.ShouldFollowSymlink(). oldParentVD.Dentry() was obtained from a + // previous call to + // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). oldName is + // not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 7315a588e..465e610e0 100644 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go @@ -16,8 +16,6 @@ package vfs import ( "strings" - - "gvisor.dev/gvisor/pkg/fspath" ) // GenericParseMountOptions parses a comma-separated list of options of the @@ -43,27 +41,3 @@ func GenericParseMountOptions(str string) map[string]string { } return m } - -// GenericPrependPath may be used by implementations of -// FilesystemImpl.PrependPath() for which a single statically-determined lock -// or set of locks is sufficient to ensure its preconditions (as opposed to -// e.g. per-Dentry locks). -// -// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for -// vd.Dentry() and all of its ancestors. -func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error { - mnt, d := vd.mount, vd.dentry - for { - if mnt == vfsroot.mount && d == vfsroot.dentry { - return PrependPathAtVFSRootError{} - } - if d == mnt.root { - return nil - } - if d.parent == nil { - return PrependPathAtNonMountRootError{} - } - b.PrependComponent(d.name) - d = d.parent - } -} diff --git a/pkg/sentry/vfs/genericfstree/BUILD b/pkg/sentry/vfs/genericfstree/BUILD new file mode 100644 index 000000000..d8fd92677 --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/BUILD @@ -0,0 +1,16 @@ +load("//tools/go_generics:defs.bzl", "go_template") + +package( + default_visibility = ["//:sandbox"], + licenses = ["notice"], +) + +go_template( + name = "generic_fstree", + srcs = [ + "genericfstree.go", + ], + types = [ + "Dentry", + ], +) diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go new file mode 100644 index 000000000..286510195 --- /dev/null +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -0,0 +1,80 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package genericfstree provides tools for implementing vfs.FilesystemImpls +// where a single statically-determined lock or set of locks is sufficient to +// ensure that a Dentry's name and parent are contextually immutable. +// +// Clients using this package must use the go_template_instance rule in +// tools/go_generics/defs.bzl to create an instantiation of this template +// package, providing types to use in place of Dentry. +package genericfstree + +import ( + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Dentry is a required type parameter that is a struct with the given fields. +type Dentry struct { + // vfsd is the embedded vfs.Dentry corresponding to this vfs.DentryImpl. + vfsd vfs.Dentry + + // parent is the parent of this Dentry in the filesystem's tree. If this + // Dentry is a filesystem root, parent is nil. + parent *Dentry + + // name is the name of this Dentry in its parent. If this Dentry is a + // filesystem root, name is unspecified. + name string +} + +// IsAncestorDentry returns true if d is an ancestor of d2; that is, d is +// either d2's parent or an ancestor of d2's parent. +func IsAncestorDentry(d, d2 *Dentry) bool { + for { + if d2.parent == d { + return true + } + if d2.parent == d2 { + return false + } + d2 = d2.parent + } +} + +// ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. +func ParentOrSelf(d *Dentry) *Dentry { + if d.parent != nil { + return d.parent + } + return d +} + +// PrependPath is a generic implementation of FilesystemImpl.PrependPath(). +func PrependPath(vfsroot vfs.VirtualDentry, mnt *vfs.Mount, d *Dentry, b *fspath.Builder) error { + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index f06946103..02850b65c 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -188,6 +188,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia if err != nil { return err } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to // lock ordering. vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) @@ -199,7 +200,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia vfs.mountMu.Lock() vd.dentry.mu.Lock() for { - if vd.dentry.IsDisowned() { + if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() vd.DecRef() @@ -665,6 +666,12 @@ func (mnt *Mount) submountsLocked() []*Mount { return mounts } +// Root returns the mount's root. It does not take a reference on the returned +// Dentry. +func (mnt *Mount) Root() *Dentry { + return mnt.root +} + // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index f21a88034..cd78d66bc 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -58,7 +58,7 @@ loop: switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { - // GenericPrependPath() will have returned + // genericfstree.PrependPath() will have returned // PrependPathAtVFSRootError in this case since it checks // against vfsroot before mnt.root, but other implementations // of FilesystemImpl.PrependPath() may return nil instead. @@ -84,7 +84,7 @@ loop: } } b.PrependByte('/') - if origD.IsDisowned() { + if origD.IsDead() { b.AppendString(" (deleted)") } return b.String(), nil @@ -136,7 +136,7 @@ loop: // PathnameForGetcwd returns an absolute pathname to vd, consistent with // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { - if vd.dentry.IsDisowned() { + if vd.dentry.IsDead() { return "", syserror.ENOENT } diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8f31495da..9d047ff88 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -29,7 +29,9 @@ import ( // // From the perspective of FilesystemImpl methods, a ResolvingPath represents a // starting Dentry on the associated Filesystem (on which a reference is -// already held) and a stream of path components relative to that Dentry. +// already held), a stream of path components relative to that Dentry, and +// elements of the invoking Context that are commonly required by +// FilesystemImpl methods. // // ResolvingPath is loosely analogous to Linux's struct nameidata. type ResolvingPath struct { @@ -251,18 +253,17 @@ func (rp *ResolvingPath) relpathCommit() { rp.origParts[rp.curPart] = rp.pit } -// ResolveParent returns the VFS parent of d. It does not take a reference on -// the returned Dentry. -// -// Preconditions: There are no concurrent mutators of d. -// -// Postconditions: If the returned error is nil, then the returned Dentry is -// not nil. -func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { - var parent *Dentry +// CheckRoot is called before resolving the parent of the Dentry d. If the +// Dentry is contextually a VFS root, such that path resolution should treat +// d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the +// root of a non-root mount, such that path resolution should switch to another +// Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path +// resolution should resolve d's parent normally, and CheckRoot returns (false, +// nil). +func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { - // At contextual VFS root. - parent = d + // At contextual VFS root (due to e.g. chroot(2)). + return true, nil } else if d == rp.mount.root { // At mount root ... vd := rp.vfs.getMountpointAt(rp.mount, rp.root) @@ -270,59 +271,27 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { // ... of non-root mount. rp.nextMount = vd.mount rp.nextStart = vd.dentry - return nil, resolveMountRootOrJumpError{} + return false, resolveMountRootOrJumpError{} } // ... of root mount. - parent = d - } else if d.parent == nil { - // At filesystem root. - parent = d - } else { - parent = d.parent + return true, nil } - if parent.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return parent, nil + return false, nil } -// ResolveChild returns the VFS child of d with the given name. It does not -// take a reference on the returned Dentry. If no such child exists, -// ResolveChild returns (nil, nil). -// -// Preconditions: There are no concurrent mutators of d. -func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { - child := d.children[name] - if child == nil { - return nil, nil +// CheckMount is called after resolving the parent or child of another Dentry +// to d. If d is a mount point, such that path resolution should switch to +// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount +// returns nil. +func (rp *ResolvingPath) CheckMount(d *Dentry) error { + if !d.isMounted() { + return nil } - if child.isMounted() { - if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { - rp.nextMount = mnt - return nil, resolveMountPointError{} - } - } - return child, nil -} - -// ResolveComponent returns the Dentry reached by starting at d and resolving -// the current path component in the stream represented by rp. It does not -// advance the stream. It does not take a reference on the returned Dentry. If -// no such Dentry exists, ResolveComponent returns (nil, nil). -// -// Preconditions: !rp.Done(). There are no concurrent mutators of d. -func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { - switch pc := rp.Component(); pc { - case ".": - return d, nil - case "..": - return rp.ResolveParent(d) - default: - return rp.ResolveChild(d, pc) + if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + rp.nextMount = mnt + return resolveMountPointError{} } + return nil } // ShouldFollowSymlink returns true if, supposing that the current path -- cgit v1.2.3 From 37e01fd2ea6a0e67637975863317be9aae1b02f0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 21 Apr 2020 16:30:26 -0700 Subject: Misc VFS2 fixes - Fix defer operation ordering in kernfs.Filesystem.AccessAt() - Add AT_NULL entry in proc/pid/auvx - Fix line padding in /proc/pid/maps - Fix linux_dirent serialization for getdents(2) - Remove file creation flags from vfs.FileDescription.statusFlags() Updates #1193, #1035 PiperOrigin-RevId: 307704159 --- pkg/sentry/fs/proc/task.go | 3 +-- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/proc/task_files.go | 13 +++++++------ pkg/sentry/fsimpl/proc/tasks_sys.go | 2 +- pkg/sentry/mm/procfs.go | 4 ++-- pkg/sentry/syscalls/linux/vfs2/getdents.go | 6 +++--- pkg/sentry/vfs/file_description.go | 15 +++++++++------ 7 files changed, 24 insertions(+), 21 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4d42eac83..4bbe90198 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -73,8 +73,7 @@ func checkTaskState(t *kernel.Task) error { type taskDir struct { ramfs.Dir - t *kernel.Task - pidns *kernel.PIDNamespace + t *kernel.Task } var _ fs.InodeOperations = (*taskDir)(nil) diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 01c23d192..3164d022c 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -246,8 +246,8 @@ func (fs *Filesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() - defer fs.mu.RUnlock() defer fs.processDeferredDecRefs() + defer fs.mu.RUnlock() _, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 2c6f8bdfc..f3173e197 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -111,17 +111,18 @@ func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error { } defer m.DecUsers(ctx) - // Space for buffer with AT_NULL (0) terminator at the end. auxv := m.Auxv() + // Space for buffer with AT_NULL (0) terminator at the end. buf.Grow((len(auxv) + 1) * 16) for _, e := range auxv { - var tmp [8]byte - usermem.ByteOrder.PutUint64(tmp[:], e.Key) - buf.Write(tmp[:]) - - usermem.ByteOrder.PutUint64(tmp[:], uint64(e.Value)) + var tmp [16]byte + usermem.ByteOrder.PutUint64(tmp[:8], e.Key) + usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value)) buf.Write(tmp[:]) } + var atNull [16]byte + buf.Write(atNull[:]) + return nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 3d5dc463c..f08668ca2 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -39,7 +39,7 @@ func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *k "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), }), "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{}), + "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}), "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), }), "net": newSysNetDir(root, inoGen, k), diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go index 1ab92f046..6efe5102b 100644 --- a/pkg/sentry/mm/procfs.go +++ b/pkg/sentry/mm/procfs.go @@ -148,7 +148,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => // stack_guard_page_start(). - fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", + lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) // Figure out our filename or hint. @@ -165,7 +165,7 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI } if s != "" { // Per linux, we pad until the 74th character. - if pad := 73 - b.Len(); pad > 0 { + if pad := 73 - lineLen; pad > 0 { b.WriteString(strings.Repeat(" ", pad)) } b.WriteString(s) diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index 62e98817d..c7c7bf7ce 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -130,7 +130,7 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { if cb.t.Arch().Width() != 8 { panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) } - size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name) + size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of sizeof(long) if size > cb.remaining { return syserror.EINVAL @@ -143,11 +143,11 @@ func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { // Zero out all remaining bytes in buf, including the NUL terminator // after dirent.Name and the zero padding byte between the name and // dirent type. - bufTail := buf[18+len(dirent.Name):] + bufTail := buf[18+len(dirent.Name) : size-1] for i := range bufTail { bufTail[i] = 0 } - bufTail[2] = dirent.Type + buf[size-1] = dirent.Type } n, err := cb.t.CopyOutBytes(cb.addr, buf) if err != nil { diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 15cc091e2..418d69b96 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -111,10 +111,10 @@ type FileDescriptionOptions struct { } // Init must be called before first use of fd. If it succeeds, it takes -// references on mnt and d. statusFlags is the initial file description status -// flags, which is usually the full set of flags passed to open(2). -func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { - writable := MayWriteFileWithOpenFlags(statusFlags) +// references on mnt and d. flags is the initial file description flags, which +// is usually the full set of flags passed to open(2). +func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error { + writable := MayWriteFileWithOpenFlags(flags) if writable { if err := mnt.CheckBeginWrite(); err != nil { return err @@ -122,7 +122,10 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn } fd.refs = 1 - fd.statusFlags = statusFlags + + // Remove "file creation flags" to mirror the behavior from file.f_flags in + // fs/open.c:do_dentry_open + fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC) fd.vd = VirtualDentry{ mount: mnt, dentry: d, @@ -130,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn mnt.IncRef() d.IncRef() fd.opts = *opts - fd.readable = MayReadFileWithOpenFlags(statusFlags) + fd.readable = MayReadFileWithOpenFlags(flags) fd.writable = writable fd.impl = impl return nil -- cgit v1.2.3 From 80d0a958199cc6095e2d580e403d50ac1c3b5206 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 21 Apr 2020 17:58:43 -0700 Subject: Update gofer.filesystem.BoundEndpointAt() to allow path resolution. Even though BoundEndpointAt is not yet implemented for gofer fs, allow path resolution errors to be returned so that we can jump to tmpfs, where it is implemented. Updates #1476. PiperOrigin-RevId: 307718335 --- pkg/sentry/fsimpl/gofer/filesystem.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 43e863c61..eba4aabe8 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1089,9 +1089,15 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -// -// TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckCaching(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + // TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. return nil, syserror.ECONNREFUSED } -- cgit v1.2.3 From e0c67014cb2200ad58cd28b12fddb3f55652a21b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Apr 2020 11:06:59 -0700 Subject: Factor fsimpl/gofer.host{Preadv,Pwritev} out of fsimpl/gofer. Also fix returning EOF when 0 bytes are read. PiperOrigin-RevId: 308089875 --- pkg/sentry/fsimpl/gofer/BUILD | 2 +- pkg/sentry/fsimpl/gofer/handle.go | 5 +- pkg/sentry/fsimpl/gofer/handle_unsafe.go | 66 ------------------- pkg/sentry/fsimpl/host/BUILD | 3 +- pkg/sentry/fsimpl/host/host.go | 31 ++------- pkg/sentry/hostfd/BUILD | 17 +++++ pkg/sentry/hostfd/hostfd.go | 84 ++++++++++++++++++++++++ pkg/sentry/hostfd/hostfd_unsafe.go | 107 +++++++++++++++++++++++++++++++ 8 files changed, 218 insertions(+), 97 deletions(-) delete mode 100644 pkg/sentry/fsimpl/gofer/handle_unsafe.go create mode 100644 pkg/sentry/hostfd/BUILD create mode 100644 pkg/sentry/hostfd/hostfd.go create mode 100644 pkg/sentry/hostfd/hostfd_unsafe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index acd061905..b9c4beee4 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -35,7 +35,6 @@ go_library( "fstree.go", "gofer.go", "handle.go", - "handle_unsafe.go", "p9file.go", "pagemath.go", "regular_file.go", @@ -53,6 +52,7 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index cfe66f797..724a3f1f7 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // handle represents a remote "open file descriptor", consisting of an opened @@ -77,7 +78,7 @@ func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offs } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPreadv(h.fd, dsts, int64(offset)) + n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } @@ -103,7 +104,7 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) - n, err := hostPwritev(h.fd, srcs, int64(offset)) + n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } diff --git a/pkg/sentry/fsimpl/gofer/handle_unsafe.go b/pkg/sentry/fsimpl/gofer/handle_unsafe.go deleted file mode 100644 index 19560ab26..000000000 --- a/pkg/sentry/fsimpl/gofer/handle_unsafe.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "syscall" - "unsafe" - - "gvisor.dev/gvisor/pkg/safemem" -) - -// Preconditions: !dsts.IsEmpty(). -func hostPreadv(fd int32, dsts safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if dsts.NumBlocks() == 1 { - // Use pread() instead of preadv() to avoid iovec allocation and - // copying. - dst := dsts.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(dsts) - n, _, e := syscall.Syscall6(syscall.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} - -// Preconditions: !srcs.IsEmpty(). -func hostPwritev(fd int32, srcs safemem.BlockSeq, off int64) (uint64, error) { - // No buffering is necessary regardless of safecopy; host syscalls will - // return EFAULT if appropriate, instead of raising SIGBUS. - if srcs.NumBlocks() == 1 { - // Use pwrite() instead of pwritev() to avoid iovec allocation and - // copying. - src := srcs.Head() - n, _, e := syscall.Syscall6(syscall.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil - } - iovs := safemem.IovecsFromBlockSeq(srcs) - n, _, e := syscall.Syscall6(syscall.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(off), 0, 0) - if e != 0 { - return 0, e - } - return uint64(n), nil -} diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 82e1fb74b..44dd9f672 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -15,12 +15,11 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", - "//pkg/fd", "//pkg/log", "//pkg/refs", - "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index fe14476f1..ae94cfa6e 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -25,11 +25,10 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -492,19 +491,9 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off if flags != 0 { return 0, syserror.EOPNOTSUPP } - - var reader safemem.Reader - if offset == -1 { - reader = safemem.FromIOReader{fd.NewReadWriter(hostFD)} - } else { - reader = safemem.FromVecReaderFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Preadv(hostFD, srcs, offset) - return int64(n), err - }, - } - } + reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) return int64(n), err } @@ -542,19 +531,9 @@ func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offs if flags != 0 { return 0, syserror.EOPNOTSUPP } - - var writer safemem.Writer - if offset == -1 { - writer = safemem.FromIOWriter{fd.NewReadWriter(hostFD)} - } else { - writer = safemem.FromVecWriterFunc{ - func(srcs [][]byte) (int64, error) { - n, err := unix.Pwritev(hostFD, srcs, offset) - return int64(n), err - }, - } - } + writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) return int64(n), err } diff --git a/pkg/sentry/hostfd/BUILD b/pkg/sentry/hostfd/BUILD new file mode 100644 index 000000000..364a78306 --- /dev/null +++ b/pkg/sentry/hostfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "hostfd", + srcs = [ + "hostfd.go", + "hostfd_unsafe.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/safemem", + "//pkg/sync", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/hostfd/hostfd.go b/pkg/sentry/hostfd/hostfd.go new file mode 100644 index 000000000..70dd9cafb --- /dev/null +++ b/pkg/sentry/hostfd/hostfd.go @@ -0,0 +1,84 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package hostfd provides efficient I/O with host file descriptors. +package hostfd + +import ( + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sync" +) + +// ReadWriterAt implements safemem.Reader and safemem.Writer by reading from +// and writing to a host file descriptor respectively. ReadWriterAts should be +// obtained by calling GetReadWriterAt. +// +// Clients should usually prefer to use Preadv2 and Pwritev2 directly. +type ReadWriterAt struct { + fd int32 + offset int64 + flags uint32 +} + +var rwpool = sync.Pool{ + New: func() interface{} { + return &ReadWriterAt{} + }, +} + +// GetReadWriterAt returns a ReadWriterAt that reads from / writes to the given +// host file descriptor, starting at the given offset and using the given +// preadv2(2)/pwritev2(2) flags. If offset is -1, the host file descriptor's +// offset is used instead. Users are responsible for ensuring that fd remains +// valid for the lifetime of the returned ReadWriterAt, and must call +// PutReadWriterAt when it is no longer needed. +func GetReadWriterAt(fd int32, offset int64, flags uint32) *ReadWriterAt { + rw := rwpool.Get().(*ReadWriterAt) + *rw = ReadWriterAt{ + fd: fd, + offset: offset, + flags: flags, + } + return rw +} + +// PutReadWriterAt releases a ReadWriterAt returned by a previous call to +// GetReadWriterAt that is no longer in use. +func PutReadWriterAt(rw *ReadWriterAt) { + rwpool.Put(rw) +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *ReadWriterAt) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + if dsts.IsEmpty() { + return 0, nil + } + n, err := Preadv2(rw.fd, dsts, rw.offset, rw.flags) + if rw.offset >= 0 { + rw.offset += int64(n) + } + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *ReadWriterAt) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + if srcs.IsEmpty() { + return 0, nil + } + n, err := Pwritev2(rw.fd, srcs, rw.offset, rw.flags) + if rw.offset >= 0 { + rw.offset += int64(n) + } + return n, err +} diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go new file mode 100644 index 000000000..5e9e60fc4 --- /dev/null +++ b/pkg/sentry/hostfd/hostfd_unsafe.go @@ -0,0 +1,107 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostfd + +import ( + "io" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/safemem" +) + +// Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into +// dsts. offset and flags are interpreted as for preadv2(2). +// +// Preconditions: !dsts.IsEmpty(). +func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + var ( + n uintptr + e syscall.Errno + ) + // Avoid preadv2(2) if possible, since it's relatively new and thus least + // likely to be supported by the host kernel. + if flags == 0 { + if dsts.NumBlocks() == 1 { + // Use read() or pread() to avoid iovec allocation and copying. + dst := dsts.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len())) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } else { + iovs := safemem.IovecsFromBlockSeq(dsts) + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_READV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PREADV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } + } else { + iovs := safemem.IovecsFromBlockSeq(dsts) + n, _, e = syscall.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) + } + if e != 0 { + return 0, e + } + if n == 0 { + return 0, io.EOF + } + return uint64(n), nil +} + +// Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor +// fd. offset and flags are interpreted as for pwritev2(2). +// +// Preconditions: !srcs.IsEmpty(). +func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) { + // No buffering is necessary regardless of safecopy; host syscalls will + // return EFAULT if appropriate, instead of raising SIGBUS. + var ( + n uintptr + e syscall.Errno + ) + // Avoid pwritev2(2) if possible, since it's relatively new and thus least + // likely to be supported by the host kernel. + if flags == 0 { + if srcs.NumBlocks() == 1 { + // Use write() or pwrite() to avoid iovec allocation and copying. + src := srcs.Head() + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len())) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } else { + iovs := safemem.IovecsFromBlockSeq(srcs) + if offset == -1 { + n, _, e = syscall.Syscall(unix.SYS_WRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs))) + } else { + n, _, e = syscall.Syscall6(unix.SYS_PWRITEV, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, 0 /* unused */) + } + } + } else { + iovs := safemem.IovecsFromBlockSeq(srcs) + n, _, e = syscall.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags)) + } + if e != 0 { + return 0, e + } + return uint64(n), nil +} -- cgit v1.2.3 From 1481499fe27157ad2716c00682f6ad819115a6c7 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Thu, 23 Apr 2020 11:32:08 -0700 Subject: Simplify Docker test infrastructure. This change adds a layer of abstraction around the internal Docker APIs, and eliminates all direct dependencies on Dockerfiles in the infrastructure. A subsequent change will automated the generation of local images (with efficient caching). Note that this change drops the use of bazel container rules, as that experiment does not seem to be viable. PiperOrigin-RevId: 308095430 --- WORKSPACE | 39 - pkg/sentry/fsimpl/ext/BUILD | 2 +- pkg/sentry/fsimpl/ext/ext_test.go | 3 +- pkg/tcpip/transport/tcp/BUILD | 2 +- pkg/tcpip/transport/tcp/tcp_noracedetector_test.go | 2 +- pkg/test/criutil/BUILD | 14 + pkg/test/criutil/criutil.go | 306 +++ pkg/test/dockerutil/BUILD | 14 + pkg/test/dockerutil/dockerutil.go | 581 ++++++ pkg/test/testutil/BUILD | 20 + pkg/test/testutil/testutil.go | 550 ++++++ pkg/test/testutil/testutil_runfiles.go | 75 + runsc/boot/BUILD | 1 + runsc/cmd/BUILD | 2 +- runsc/cmd/capability_test.go | 9 +- runsc/container/BUILD | 6 +- runsc/container/console_test.go | 115 +- runsc/container/container.go | 2 +- runsc/container/container_norace_test.go | 20 + runsc/container/container_race_test.go | 20 + runsc/container/container_test.go | 2046 ++++++++++---------- runsc/container/multi_container_test.go | 1161 +++++------ runsc/container/shared_volume_test.go | 18 +- runsc/container/test_app/BUILD | 21 - runsc/container/test_app/fds.go | 185 -- runsc/container/test_app/test_app.go | 394 ---- runsc/criutil/BUILD | 11 - runsc/criutil/criutil.go | 277 --- runsc/dockerutil/BUILD | 14 - runsc/dockerutil/dockerutil.go | 486 ----- runsc/testutil/BUILD | 21 - runsc/testutil/testutil.go | 433 ----- runsc/testutil/testutil_runfiles.go | 75 - scripts/iptables_tests.sh | 13 +- test/cmd/test_app/BUILD | 21 + test/cmd/test_app/fds.go | 185 ++ test/cmd/test_app/test_app.go | 394 ++++ test/e2e/BUILD | 4 +- test/e2e/exec_test.go | 193 +- test/e2e/integration_test.go | 233 ++- test/e2e/regression_test.go | 18 +- test/image/BUILD | 4 +- test/image/image_test.go | 195 +- test/image/ruby.sh | 0 test/iptables/BUILD | 8 +- test/iptables/README.md | 2 +- test/iptables/iptables.go | 7 + test/iptables/iptables_test.go | 271 +-- test/iptables/iptables_util.go | 2 +- test/iptables/runner/BUILD | 17 +- test/iptables/runner/main.go | 3 + test/packetdrill/packetdrill_test.sh | 25 +- test/packetimpact/testbench/dut.go | 2 +- test/packetimpact/tests/test_runner.sh | 24 +- test/root/BUILD | 8 +- test/root/cgroup_test.go | 114 +- test/root/chroot_test.go | 20 +- test/root/crictl_test.go | 192 +- test/root/main_test.go | 2 +- test/root/oom_score_adj_test.go | 78 +- test/root/runsc_test.go | 2 +- test/root/testdata/BUILD | 18 - test/root/testdata/busybox.go | 32 - test/root/testdata/containerd_config.go | 39 - test/root/testdata/httpd.go | 32 - test/root/testdata/httpd_mount_paths.go | 53 - test/root/testdata/sandbox.go | 30 - test/root/testdata/simple.go | 41 - test/runner/BUILD | 2 +- test/runner/runner.go | 12 +- test/runtimes/BUILD | 22 +- test/runtimes/README.md | 56 - test/runtimes/blacklist_test.go | 37 - test/runtimes/build_defs.bzl | 75 - test/runtimes/defs.bzl | 79 + test/runtimes/images/proctor/BUILD | 26 - test/runtimes/images/proctor/go.go | 90 - test/runtimes/images/proctor/java.go | 71 - test/runtimes/images/proctor/nodejs.go | 46 - test/runtimes/images/proctor/php.go | 42 - test/runtimes/images/proctor/proctor.go | 163 -- test/runtimes/images/proctor/proctor_test.go | 127 -- test/runtimes/images/proctor/python.go | 49 - test/runtimes/proctor/BUILD | 27 + test/runtimes/proctor/go.go | 90 + test/runtimes/proctor/java.go | 71 + test/runtimes/proctor/nodejs.go | 46 + test/runtimes/proctor/php.go | 42 + test/runtimes/proctor/proctor.go | 163 ++ test/runtimes/proctor/proctor_test.go | 127 ++ test/runtimes/proctor/python.go | 49 + test/runtimes/runner.go | 196 -- test/runtimes/runner.sh | 35 - test/runtimes/runner/BUILD | 21 + test/runtimes/runner/blacklist_test.go | 37 + test/runtimes/runner/main.go | 189 ++ tools/bazeldefs/defs.bzl | 4 - tools/defs.bzl | 4 +- 98 files changed, 5512 insertions(+), 5693 deletions(-) create mode 100644 pkg/test/criutil/BUILD create mode 100644 pkg/test/criutil/criutil.go create mode 100644 pkg/test/dockerutil/BUILD create mode 100644 pkg/test/dockerutil/dockerutil.go create mode 100644 pkg/test/testutil/BUILD create mode 100644 pkg/test/testutil/testutil.go create mode 100644 pkg/test/testutil/testutil_runfiles.go create mode 100644 runsc/container/container_norace_test.go create mode 100644 runsc/container/container_race_test.go delete mode 100644 runsc/container/test_app/BUILD delete mode 100644 runsc/container/test_app/fds.go delete mode 100644 runsc/container/test_app/test_app.go delete mode 100644 runsc/criutil/BUILD delete mode 100644 runsc/criutil/criutil.go delete mode 100644 runsc/dockerutil/BUILD delete mode 100644 runsc/dockerutil/dockerutil.go delete mode 100644 runsc/testutil/BUILD delete mode 100644 runsc/testutil/testutil.go delete mode 100644 runsc/testutil/testutil_runfiles.go create mode 100644 test/cmd/test_app/BUILD create mode 100644 test/cmd/test_app/fds.go create mode 100644 test/cmd/test_app/test_app.go mode change 100644 => 100755 test/image/ruby.sh delete mode 100644 test/root/testdata/BUILD delete mode 100644 test/root/testdata/busybox.go delete mode 100644 test/root/testdata/containerd_config.go delete mode 100644 test/root/testdata/httpd.go delete mode 100644 test/root/testdata/httpd_mount_paths.go delete mode 100644 test/root/testdata/sandbox.go delete mode 100644 test/root/testdata/simple.go delete mode 100644 test/runtimes/README.md delete mode 100644 test/runtimes/blacklist_test.go delete mode 100644 test/runtimes/build_defs.bzl create mode 100644 test/runtimes/defs.bzl delete mode 100644 test/runtimes/images/proctor/BUILD delete mode 100644 test/runtimes/images/proctor/go.go delete mode 100644 test/runtimes/images/proctor/java.go delete mode 100644 test/runtimes/images/proctor/nodejs.go delete mode 100644 test/runtimes/images/proctor/php.go delete mode 100644 test/runtimes/images/proctor/proctor.go delete mode 100644 test/runtimes/images/proctor/proctor_test.go delete mode 100644 test/runtimes/images/proctor/python.go create mode 100644 test/runtimes/proctor/BUILD create mode 100644 test/runtimes/proctor/go.go create mode 100644 test/runtimes/proctor/java.go create mode 100644 test/runtimes/proctor/nodejs.go create mode 100644 test/runtimes/proctor/php.go create mode 100644 test/runtimes/proctor/proctor.go create mode 100644 test/runtimes/proctor/proctor_test.go create mode 100644 test/runtimes/proctor/python.go delete mode 100644 test/runtimes/runner.go delete mode 100755 test/runtimes/runner.sh create mode 100644 test/runtimes/runner/BUILD create mode 100644 test/runtimes/runner/blacklist_test.go create mode 100644 test/runtimes/runner/main.go (limited to 'pkg/sentry/fsimpl') diff --git a/WORKSPACE b/WORKSPACE index b895647fb..3bf5cc9c1 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -127,45 +127,6 @@ load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") rules_pkg_dependencies() -# Container rules. -http_archive( - name = "io_bazel_rules_docker", - sha256 = "14ac30773fdb393ddec90e158c9ec7ebb3f8a4fd533ec2abbfd8789ad81a284b", - strip_prefix = "rules_docker-0.12.1", - urls = ["https://github.com/bazelbuild/rules_docker/releases/download/v0.12.1/rules_docker-v0.12.1.tar.gz"], -) - -load( - "@io_bazel_rules_docker//repositories:repositories.bzl", - container_repositories = "repositories", -) - -container_repositories() - -load("@io_bazel_rules_docker//repositories:deps.bzl", container_deps = "deps") - -container_deps() - -load( - "@io_bazel_rules_docker//container:container.bzl", - "container_pull", -) - -# This container is built from the Dockerfile in test/iptables/runner. -container_pull( - name = "iptables-test", - digest = "sha256:a137d692a2eb9fc7bf95c5f4a568da090e2c31098e93634421ed88f3a3f1db65", - registry = "gcr.io", - repository = "gvisor-presubmit/iptables-test", -) - -load( - "@io_bazel_rules_docker//go:image.bzl", - _go_image_repos = "repositories", -) - -_go_image_repos() - # Load C++ grpc rules. http_archive( name = "com_github_grpc_grpc", diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index a4947c480..ff861d0fe 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -93,8 +93,8 @@ go_test( "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", + "//pkg/test/testutil", "//pkg/usermem", - "//runsc/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", ], diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 29bb73765..64e9a579f 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -32,9 +32,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/usermem" - - "gvisor.dev/gvisor/runsc/testutil" ) const ( diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index 61426623c..f2aa69069 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -105,8 +105,8 @@ go_test( "//pkg/tcpip/seqnum", "//pkg/tcpip/stack", "//pkg/tcpip/transport/tcp/testing/context", + "//pkg/test/testutil", "//pkg/waiter", - "//runsc/testutil", ], ) diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go index 359a75e73..5fe23113b 100644 --- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go +++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go @@ -31,7 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context" - "gvisor.dev/gvisor/runsc/testutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) func TestFastRecovery(t *testing.T) { diff --git a/pkg/test/criutil/BUILD b/pkg/test/criutil/BUILD new file mode 100644 index 000000000..a7b082cee --- /dev/null +++ b/pkg/test/criutil/BUILD @@ -0,0 +1,14 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "criutil", + testonly = 1, + srcs = ["criutil.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/dockerutil", + "//pkg/test/testutil", + ], +) diff --git a/pkg/test/criutil/criutil.go b/pkg/test/criutil/criutil.go new file mode 100644 index 000000000..bebebb48e --- /dev/null +++ b/pkg/test/criutil/criutil.go @@ -0,0 +1,306 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package criutil contains utility functions for interacting with the +// Container Runtime Interface (CRI), principally via the crictl command line +// tool. This requires critools to be installed on the local system. +package criutil + +import ( + "encoding/json" + "fmt" + "os" + "os/exec" + "strings" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +// Crictl contains information required to run the crictl utility. +type Crictl struct { + logger testutil.Logger + endpoint string + cleanup []func() +} + +// resolvePath attempts to find binary paths. It may set the path to invalid, +// which will cause the execution to fail with a sensible error. +func resolvePath(executable string) string { + guess, err := exec.LookPath(executable) + if err != nil { + guess = fmt.Sprintf("/usr/local/bin/%s", executable) + } + return guess +} + +// NewCrictl returns a Crictl configured with a timeout and an endpoint over +// which it will talk to containerd. +func NewCrictl(logger testutil.Logger, endpoint string) *Crictl { + // Attempt to find the executable, but don't bother propagating the + // error at this point. The first command executed will return with a + // binary not found error. + return &Crictl{ + logger: logger, + endpoint: endpoint, + } +} + +// CleanUp executes cleanup functions. +func (cc *Crictl) CleanUp() { + for _, c := range cc.cleanup { + c() + } + cc.cleanup = nil +} + +// RunPod creates a sandbox. It corresponds to `crictl runp`. +func (cc *Crictl) RunPod(sbSpecFile string) (string, error) { + podID, err := cc.run("runp", sbSpecFile) + if err != nil { + return "", fmt.Errorf("runp failed: %v", err) + } + // Strip the trailing newline from crictl output. + return strings.TrimSpace(podID), nil +} + +// Create creates a container within a sandbox. It corresponds to `crictl +// create`. +func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) { + podID, err := cc.run("create", podID, contSpecFile, sbSpecFile) + if err != nil { + return "", fmt.Errorf("create failed: %v", err) + } + // Strip the trailing newline from crictl output. + return strings.TrimSpace(podID), nil +} + +// Start starts a container. It corresponds to `crictl start`. +func (cc *Crictl) Start(contID string) (string, error) { + output, err := cc.run("start", contID) + if err != nil { + return "", fmt.Errorf("start failed: %v", err) + } + return output, nil +} + +// Stop stops a container. It corresponds to `crictl stop`. +func (cc *Crictl) Stop(contID string) error { + _, err := cc.run("stop", contID) + return err +} + +// Exec execs a program inside a container. It corresponds to `crictl exec`. +func (cc *Crictl) Exec(contID string, args ...string) (string, error) { + a := []string{"exec", contID} + a = append(a, args...) + output, err := cc.run(a...) + if err != nil { + return "", fmt.Errorf("exec failed: %v", err) + } + return output, nil +} + +// Rm removes a container. It corresponds to `crictl rm`. +func (cc *Crictl) Rm(contID string) error { + _, err := cc.run("rm", contID) + return err +} + +// StopPod stops a pod. It corresponds to `crictl stopp`. +func (cc *Crictl) StopPod(podID string) error { + _, err := cc.run("stopp", podID) + return err +} + +// containsConfig is a minimal copy of +// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto +// It only contains fields needed for testing. +type containerConfig struct { + Status containerStatus +} + +type containerStatus struct { + Network containerNetwork +} + +type containerNetwork struct { + IP string +} + +// PodIP returns a pod's IP address. +func (cc *Crictl) PodIP(podID string) (string, error) { + output, err := cc.run("inspectp", podID) + if err != nil { + return "", err + } + conf := &containerConfig{} + if err := json.Unmarshal([]byte(output), conf); err != nil { + return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output) + } + if conf.Status.Network.IP == "" { + return "", fmt.Errorf("no IP found in config: %s", output) + } + return conf.Status.Network.IP, nil +} + +// RmPod removes a container. It corresponds to `crictl rmp`. +func (cc *Crictl) RmPod(podID string) error { + _, err := cc.run("rmp", podID) + return err +} + +// Import imports the given container from the local Docker instance. +func (cc *Crictl) Import(image string) error { + // Note that we provide a 10 minute timeout after connect because we may + // be pushing a lot of bytes in order to import the image. The connect + // timeout stays the same and is inherited from the Crictl instance. + cmd := testutil.Command(cc.logger, + resolvePath("ctr"), + fmt.Sprintf("--connect-timeout=%s", 30*time.Second), + fmt.Sprintf("--address=%s", cc.endpoint), + "-n", "k8s.io", "images", "import", "-") + cmd.Stderr = os.Stderr // Pass through errors. + + // Create a pipe and start the program. + w, err := cmd.StdinPipe() + if err != nil { + return err + } + if err := cmd.Start(); err != nil { + return err + } + + // Save the image on the other end. + if err := dockerutil.Save(cc.logger, image, w); err != nil { + cmd.Wait() + return err + } + + // Close our pipe reference & see if it was loaded. + if err := w.Close(); err != nil { + return w.Close() + } + + return cmd.Wait() +} + +// StartContainer pulls the given image ands starts the container in the +// sandbox with the given podID. +// +// Note that the image will always be imported from the local docker daemon. +func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) { + if err := cc.Import(image); err != nil { + return "", err + } + + // Write the specs to files that can be read by crictl. + sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec) + if err != nil { + return "", fmt.Errorf("failed to write sandbox spec: %v", err) + } + cc.cleanup = append(cc.cleanup, cleanup) + contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec) + if err != nil { + return "", fmt.Errorf("failed to write container spec: %v", err) + } + cc.cleanup = append(cc.cleanup, cleanup) + + return cc.startContainer(podID, image, sbSpecFile, contSpecFile) +} + +func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) { + contID, err := cc.Create(podID, contSpecFile, sbSpecFile) + if err != nil { + return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err) + } + + if _, err := cc.Start(contID); err != nil { + return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err) + } + + return contID, nil +} + +// StopContainer stops and deletes the container with the given container ID. +func (cc *Crictl) StopContainer(contID string) error { + if err := cc.Stop(contID); err != nil { + return fmt.Errorf("failed to stop container %q: %v", contID, err) + } + + if err := cc.Rm(contID); err != nil { + return fmt.Errorf("failed to remove container %q: %v", contID, err) + } + + return nil +} + +// StartPodAndContainer starts a sandbox and container in that sandbox. It +// returns the pod ID and container ID. +func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) { + if err := cc.Import(image); err != nil { + return "", "", err + } + + // Write the specs to files that can be read by crictl. + sbSpecFile, cleanup, err := testutil.WriteTmpFile("sbSpec", sbSpec) + if err != nil { + return "", "", fmt.Errorf("failed to write sandbox spec: %v", err) + } + cc.cleanup = append(cc.cleanup, cleanup) + contSpecFile, cleanup, err := testutil.WriteTmpFile("contSpec", contSpec) + if err != nil { + return "", "", fmt.Errorf("failed to write container spec: %v", err) + } + cc.cleanup = append(cc.cleanup, cleanup) + + podID, err := cc.RunPod(sbSpecFile) + if err != nil { + return "", "", err + } + + contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile) + + return podID, contID, err +} + +// StopPodAndContainer stops a container and pod. +func (cc *Crictl) StopPodAndContainer(podID, contID string) error { + if err := cc.StopContainer(contID); err != nil { + return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err) + } + + if err := cc.StopPod(podID); err != nil { + return fmt.Errorf("failed to stop pod %q: %v", podID, err) + } + + if err := cc.RmPod(podID); err != nil { + return fmt.Errorf("failed to remove pod %q: %v", podID, err) + } + + return nil +} + +// run runs crictl with the given args. +func (cc *Crictl) run(args ...string) (string, error) { + defaultArgs := []string{ + resolvePath("crictl"), + "--image-endpoint", fmt.Sprintf("unix://%s", cc.endpoint), + "--runtime-endpoint", fmt.Sprintf("unix://%s", cc.endpoint), + } + fullArgs := append(defaultArgs, args...) + out, err := testutil.Command(cc.logger, fullArgs...).CombinedOutput() + return string(out), err +} diff --git a/pkg/test/dockerutil/BUILD b/pkg/test/dockerutil/BUILD new file mode 100644 index 000000000..7c8758e35 --- /dev/null +++ b/pkg/test/dockerutil/BUILD @@ -0,0 +1,14 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "dockerutil", + testonly = 1, + srcs = ["dockerutil.go"], + visibility = ["//:sandbox"], + deps = [ + "//pkg/test/testutil", + "@com_github_kr_pty//:go_default_library", + ], +) diff --git a/pkg/test/dockerutil/dockerutil.go b/pkg/test/dockerutil/dockerutil.go new file mode 100644 index 000000000..baa8fc2f2 --- /dev/null +++ b/pkg/test/dockerutil/dockerutil.go @@ -0,0 +1,581 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package dockerutil is a collection of utility functions. +package dockerutil + +import ( + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "net" + "os" + "os/exec" + "path" + "regexp" + "strconv" + "strings" + "syscall" + "time" + + "github.com/kr/pty" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var ( + // runtime is the runtime to use for tests. This will be applied to all + // containers. Note that the default here ("runsc") corresponds to the + // default used by the installations. This is important, because the + // default installer for vm_tests (in tools/installers:head, invoked + // via tools/vm:defs.bzl) will install with this name. So without + // changing anything, tests should have a runsc runtime available to + // them. Otherwise installers should update the existing runtime + // instead of installing a new one. + runtime = flag.String("runtime", "runsc", "specify which runtime to use") + + // config is the default Docker daemon configuration path. + config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths") +) + +// EnsureSupportedDockerVersion checks if correct docker is installed. +// +// This logs directly to stderr, as it is typically called from a Main wrapper. +func EnsureSupportedDockerVersion() { + cmd := exec.Command("docker", "version") + out, err := cmd.CombinedOutput() + if err != nil { + log.Fatalf("error running %q: %v", "docker version", err) + } + re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`) + matches := re.FindStringSubmatch(string(out)) + if len(matches) != 3 { + log.Fatalf("Invalid docker output: %s", out) + } + major, _ := strconv.Atoi(matches[1]) + minor, _ := strconv.Atoi(matches[2]) + if major < 17 || (major == 17 && minor < 9) { + log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor) + } +} + +// RuntimePath returns the binary path for the current runtime. +func RuntimePath() (string, error) { + // Read the configuration data; the file must exist. + configBytes, err := ioutil.ReadFile(*config) + if err != nil { + return "", err + } + + // Unmarshal the configuration. + c := make(map[string]interface{}) + if err := json.Unmarshal(configBytes, &c); err != nil { + return "", err + } + + // Decode the expected configuration. + r, ok := c["runtimes"] + if !ok { + return "", fmt.Errorf("no runtimes declared: %v", c) + } + rs, ok := r.(map[string]interface{}) + if !ok { + // The runtimes are not a map. + return "", fmt.Errorf("unexpected format: %v", c) + } + r, ok = rs[*runtime] + if !ok { + // The expected runtime is not declared. + return "", fmt.Errorf("runtime %q not found: %v", *runtime, c) + } + rs, ok = r.(map[string]interface{}) + if !ok { + // The runtime is not a map. + return "", fmt.Errorf("unexpected format: %v", c) + } + p, ok := rs["path"].(string) + if !ok { + // The runtime does not declare a path. + return "", fmt.Errorf("unexpected format: %v", c) + } + return p, nil +} + +// Save exports a container image to the given Writer. +// +// Note that the writer should be actively consuming the output, otherwise it +// is not guaranteed that the Save will make any progress and the call may +// stall indefinitely. +// +// This is called by criutil in order to import imports. +func Save(logger testutil.Logger, image string, w io.Writer) error { + cmd := testutil.Command(logger, "docker", "save", testutil.ImageByName(image)) + cmd.Stdout = w // Send directly to the writer. + return cmd.Run() +} + +// MountMode describes if the mount should be ro or rw. +type MountMode int + +const ( + // ReadOnly is what the name says. + ReadOnly MountMode = iota + // ReadWrite is what the name says. + ReadWrite +) + +// String returns the mount mode argument for this MountMode. +func (m MountMode) String() string { + switch m { + case ReadOnly: + return "ro" + case ReadWrite: + return "rw" + } + panic(fmt.Sprintf("invalid mode: %d", m)) +} + +// Docker contains the name and the runtime of a docker container. +type Docker struct { + logger testutil.Logger + Runtime string + Name string + copyErr error + mounts []string + cleanups []func() +} + +// MakeDocker sets up the struct for a Docker container. +// +// Names of containers will be unique. +func MakeDocker(logger testutil.Logger) *Docker { + return &Docker{ + logger: logger, + Name: testutil.RandomID(logger.Name()), + Runtime: *runtime, + } +} + +// Mount mounts the given source and makes it available in the container. +func (d *Docker) Mount(target, source string, mode MountMode) { + d.mounts = append(d.mounts, fmt.Sprintf("-v=%s:%s:%v", source, target, mode)) +} + +// CopyFiles copies in and mounts the given files. They are always ReadOnly. +func (d *Docker) CopyFiles(target string, sources ...string) { + dir, err := ioutil.TempDir("", d.Name) + if err != nil { + d.copyErr = fmt.Errorf("ioutil.TempDir failed: %v", err) + return + } + d.cleanups = append(d.cleanups, func() { os.RemoveAll(dir) }) + if err := os.Chmod(dir, 0755); err != nil { + d.copyErr = fmt.Errorf("os.Chmod(%q, 0755) failed: %v", dir, err) + return + } + for _, name := range sources { + src, err := testutil.FindFile(name) + if err != nil { + d.copyErr = fmt.Errorf("testutil.FindFile(%q) failed: %v", name, err) + return + } + dst := path.Join(dir, path.Base(name)) + if err := testutil.Copy(src, dst); err != nil { + d.copyErr = fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) + return + } + d.logger.Logf("copy: %s -> %s", src, dst) + } + d.Mount(target, dir, ReadOnly) +} + +// Link links the given target. +func (d *Docker) Link(target string, source *Docker) { + d.mounts = append(d.mounts, fmt.Sprintf("--link=%s:%s", source.Name, target)) +} + +// RunOpts are options for running a container. +type RunOpts struct { + // Image is the image relative to images/. This will be mangled + // appropriately, to ensure that only first-party images are used. + Image string + + // Memory is the memory limit in kB. + Memory int + + // Ports are the ports to be allocated. + Ports []int + + // WorkDir sets the working directory. + WorkDir string + + // ReadOnly sets the read-only flag. + ReadOnly bool + + // Env are additional environment variables. + Env []string + + // User is the user to use. + User string + + // Privileged enables privileged mode. + Privileged bool + + // CapAdd are the extra set of capabilities to add. + CapAdd []string + + // CapDrop are the extra set of capabilities to drop. + CapDrop []string + + // Pty indicates that a pty will be allocated. If this is non-nil, then + // this will run after start-up with the *exec.Command and Pty file + // passed in to the function. + Pty func(*exec.Cmd, *os.File) + + // Foreground indicates that the container should be run in the + // foreground. If this is true, then the output will be available as a + // return value from the Run function. + Foreground bool + + // Extra are extra arguments that may be passed. + Extra []string +} + +// args returns common arguments. +// +// Note that this does not define the complete behavior. +func (d *Docker) argsFor(r *RunOpts, command string, p []string) (rv []string) { + isExec := command == "exec" + isRun := command == "run" + + if isRun || isExec { + rv = append(rv, "-i") + } + if r.Pty != nil { + rv = append(rv, "-t") + } + if r.User != "" { + rv = append(rv, fmt.Sprintf("--user=%s", r.User)) + } + if r.Privileged { + rv = append(rv, "--privileged") + } + for _, c := range r.CapAdd { + rv = append(rv, fmt.Sprintf("--cap-add=%s", c)) + } + for _, c := range r.CapDrop { + rv = append(rv, fmt.Sprintf("--cap-drop=%s", c)) + } + for _, e := range r.Env { + rv = append(rv, fmt.Sprintf("--env=%s", e)) + } + if r.WorkDir != "" { + rv = append(rv, fmt.Sprintf("--workdir=%s", r.WorkDir)) + } + if !isExec { + if r.Memory != 0 { + rv = append(rv, fmt.Sprintf("--memory=%dk", r.Memory)) + } + for _, p := range r.Ports { + rv = append(rv, fmt.Sprintf("--publish=%d", p)) + } + if r.ReadOnly { + rv = append(rv, fmt.Sprintf("--read-only")) + } + if len(p) > 0 { + rv = append(rv, "--entrypoint=") + } + } + + // Always attach the test environment & Extra. + rv = append(rv, fmt.Sprintf("--env=RUNSC_TEST_NAME=%s", d.Name)) + rv = append(rv, r.Extra...) + + // Attach necessary bits. + if isExec { + rv = append(rv, d.Name) + } else { + rv = append(rv, d.mounts...) + rv = append(rv, fmt.Sprintf("--runtime=%s", d.Runtime)) + rv = append(rv, fmt.Sprintf("--name=%s", d.Name)) + rv = append(rv, testutil.ImageByName(r.Image)) + } + + // Attach other arguments. + rv = append(rv, p...) + return rv +} + +// run runs a complete command. +func (d *Docker) run(r RunOpts, command string, p ...string) (string, error) { + if d.copyErr != nil { + return "", d.copyErr + } + basicArgs := []string{"docker"} + if command == "spawn" { + command = "run" + basicArgs = append(basicArgs, command) + basicArgs = append(basicArgs, "-d") + } else { + basicArgs = append(basicArgs, command) + } + customArgs := d.argsFor(&r, command, p) + cmd := testutil.Command(d.logger, append(basicArgs, customArgs...)...) + if r.Pty != nil { + // If allocating a terminal, then we just ignore the output + // from the command. + ptmx, err := pty.Start(cmd.Cmd) + if err != nil { + return "", err + } + defer cmd.Wait() // Best effort. + r.Pty(cmd.Cmd, ptmx) + } else { + // Can't support PTY or streaming. + out, err := cmd.CombinedOutput() + return string(out), err + } + return "", nil +} + +// Create calls 'docker create' with the arguments provided. +func (d *Docker) Create(r RunOpts, args ...string) error { + _, err := d.run(r, "create", args...) + return err +} + +// Start calls 'docker start'. +func (d *Docker) Start() error { + return testutil.Command(d.logger, "docker", "start", d.Name).Run() +} + +// Stop calls 'docker stop'. +func (d *Docker) Stop() error { + return testutil.Command(d.logger, "docker", "stop", d.Name).Run() +} + +// Run calls 'docker run' with the arguments provided. +func (d *Docker) Run(r RunOpts, args ...string) (string, error) { + return d.run(r, "run", args...) +} + +// Spawn starts the container and detaches. +func (d *Docker) Spawn(r RunOpts, args ...string) error { + _, err := d.run(r, "spawn", args...) + return err +} + +// Logs calls 'docker logs'. +func (d *Docker) Logs() (string, error) { + // Don't capture the output; since it will swamp the logs. + out, err := exec.Command("docker", "logs", d.Name).CombinedOutput() + return string(out), err +} + +// Exec calls 'docker exec' with the arguments provided. +func (d *Docker) Exec(r RunOpts, args ...string) (string, error) { + return d.run(r, "exec", args...) +} + +// Pause calls 'docker pause'. +func (d *Docker) Pause() error { + return testutil.Command(d.logger, "docker", "pause", d.Name).Run() +} + +// Unpause calls 'docker pause'. +func (d *Docker) Unpause() error { + return testutil.Command(d.logger, "docker", "unpause", d.Name).Run() +} + +// Checkpoint calls 'docker checkpoint'. +func (d *Docker) Checkpoint(name string) error { + return testutil.Command(d.logger, "docker", "checkpoint", "create", d.Name, name).Run() +} + +// Restore calls 'docker start --checkname [name]'. +func (d *Docker) Restore(name string) error { + return testutil.Command(d.logger, "docker", "start", fmt.Sprintf("--checkpoint=%s", name), d.Name).Run() +} + +// Kill calls 'docker kill'. +func (d *Docker) Kill() error { + // Skip logging this command, it will likely be an error. + out, err := exec.Command("docker", "kill", d.Name).CombinedOutput() + if err != nil && !strings.Contains(string(out), "is not running") { + return err + } + return nil +} + +// Remove calls 'docker rm'. +func (d *Docker) Remove() error { + return testutil.Command(d.logger, "docker", "rm", d.Name).Run() +} + +// CleanUp kills and deletes the container (best effort). +func (d *Docker) CleanUp() { + // Kill the container. + if err := d.Kill(); err != nil { + // Just log; can't do anything here. + d.logger.Logf("error killing container %q: %v", d.Name, err) + } + // Remove the image. + if err := d.Remove(); err != nil { + d.logger.Logf("error removing container %q: %v", d.Name, err) + } + // Forget all mounts. + d.mounts = nil + // Execute all cleanups. + for _, c := range d.cleanups { + c() + } + d.cleanups = nil +} + +// FindPort returns the host port that is mapped to 'sandboxPort'. This calls +// docker to allocate a free port in the host and prevent conflicts. +func (d *Docker) FindPort(sandboxPort int) (int, error) { + format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) + out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput() + if err != nil { + return -1, fmt.Errorf("error retrieving port: %v", err) + } + port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + return -1, fmt.Errorf("error parsing port %q: %v", out, err) + } + return port, nil +} + +// FindIP returns the IP address of the container. +func (d *Docker) FindIP() (net.IP, error) { + const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}` + out, err := testutil.Command(d.logger, "docker", "inspect", "-f", format, d.Name).CombinedOutput() + if err != nil { + return net.IP{}, fmt.Errorf("error retrieving IP: %v", err) + } + ip := net.ParseIP(strings.TrimSpace(string(out))) + if ip == nil { + return net.IP{}, fmt.Errorf("invalid IP: %q", string(out)) + } + return ip, nil +} + +// SandboxPid returns the PID to the sandbox process. +func (d *Docker) SandboxPid() (int, error) { + out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.State.Pid}}", d.Name).CombinedOutput() + if err != nil { + return -1, fmt.Errorf("error retrieving pid: %v", err) + } + pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + return -1, fmt.Errorf("error parsing pid %q: %v", out, err) + } + return pid, nil +} + +// ID returns the container ID. +func (d *Docker) ID() (string, error) { + out, err := testutil.Command(d.logger, "docker", "inspect", "-f={{.Id}}", d.Name).CombinedOutput() + if err != nil { + return "", fmt.Errorf("error retrieving ID: %v", err) + } + return strings.TrimSpace(string(out)), nil +} + +// Wait waits for container to exit, up to the given timeout. Returns error if +// wait fails or timeout is hit. Returns the application return code otherwise. +// Note that the application may have failed even if err == nil, always check +// the exit code. +func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) { + timeoutChan := time.After(timeout) + waitChan := make(chan (syscall.WaitStatus)) + errChan := make(chan (error)) + + go func() { + out, err := testutil.Command(d.logger, "docker", "wait", d.Name).CombinedOutput() + if err != nil { + errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err) + } + exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) + if err != nil { + errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err) + } + waitChan <- syscall.WaitStatus(uint32(exit)) + }() + + select { + case ws := <-waitChan: + return ws, nil + case err := <-errChan: + return syscall.WaitStatus(1), err + case <-timeoutChan: + return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name) + } +} + +// WaitForOutput calls 'docker logs' to retrieve containers output and searches +// for the given pattern. +func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { + matches, err := d.WaitForOutputSubmatch(pattern, timeout) + if err != nil { + return "", err + } + if len(matches) == 0 { + return "", nil + } + return matches[0], nil +} + +// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and +// searches for the given pattern. It returns any regexp submatches as well. +func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) { + re := regexp.MustCompile(pattern) + var ( + lastOut string + stopped bool + ) + for exp := time.Now().Add(timeout); time.Now().Before(exp); { + out, err := d.Logs() + if err != nil { + return nil, err + } + if out != lastOut { + if lastOut == "" { + d.logger.Logf("output (start): %s", out) + } else if strings.HasPrefix(out, lastOut) { + d.logger.Logf("output (contn): %s", out[len(lastOut):]) + } else { + d.logger.Logf("output (trunc): %s", out) + } + lastOut = out // Save for future. + if matches := re.FindStringSubmatch(lastOut); matches != nil { + return matches, nil // Success! + } + } else if stopped { + // The sandbox stopped and we looked at the + // logs at least once since determining that. + return nil, fmt.Errorf("no longer running: %v", err) + } else if pid, err := d.SandboxPid(); pid == 0 || err != nil { + // The sandbox may have stopped, but it's + // possible that it has emitted the terminal + // line between the last call to Logs and here. + stopped = true + } + time.Sleep(100 * time.Millisecond) + } + return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), lastOut) +} diff --git a/pkg/test/testutil/BUILD b/pkg/test/testutil/BUILD new file mode 100644 index 000000000..03b1b4677 --- /dev/null +++ b/pkg/test/testutil/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "testutil", + testonly = 1, + srcs = [ + "testutil.go", + "testutil_runfiles.go", + ], + visibility = ["//:sandbox"], + deps = [ + "//pkg/sync", + "//runsc/boot", + "//runsc/specutils", + "@com_github_cenkalti_backoff//:go_default_library", + "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", + ], +) diff --git a/pkg/test/testutil/testutil.go b/pkg/test/testutil/testutil.go new file mode 100644 index 000000000..d75ceca3d --- /dev/null +++ b/pkg/test/testutil/testutil.go @@ -0,0 +1,550 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package testutil contains utility functions for runsc tests. +package testutil + +import ( + "bufio" + "context" + "debug/elf" + "encoding/base32" + "encoding/json" + "flag" + "fmt" + "io" + "io/ioutil" + "log" + "math" + "math/rand" + "net/http" + "os" + "os/exec" + "os/signal" + "path" + "path/filepath" + "strconv" + "strings" + "sync/atomic" + "syscall" + "testing" + "time" + + "github.com/cenkalti/backoff" + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/runsc/specutils" +) + +var ( + checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support") +) + +// IsCheckpointSupported returns the relevant command line flag. +func IsCheckpointSupported() bool { + return *checkpoint +} + +// nameToActual is used by ImageByName (for now). +var nameToActual = map[string]string{ + "basic/alpine": "alpine", + "basic/busybox": "busybox:1.31.1", + "basic/httpd": "httpd", + "basic/mysql": "mysql", + "basic/nginx": "nginx", + "basic/python": "gcr.io/gvisor-presubmit/python-hello", + "basic/resolv": "k8s.gcr.io/busybox", + "basic/ruby": "ruby", + "basic/tomcat": "tomcat:8.0", + "basic/ubuntu": "ubuntu:trusty", + "iptables": "gcr.io/gvisor-presubmit/iptables-test", + "packetdrill": "gcr.io/gvisor-presubmit/packetdrill", + "packetimpact": "gcr.io/gvisor-presubmit/packetimpact", + "runtimes/go1.12": "gcr.io/gvisor-presubmit/go1.12", + "runtimes/java11": "gcr.io/gvisor-presubmit/java11", + "runtimes/nodejs12.4.0": "gcr.io/gvisor-presubmit/nodejs12.4.0", + "runtimes/php7.3.6": "gcr.io/gvisor-presubmit/php7.3.6", + "runtimes/python3.7.3": "gcr.io/gvisor-presubmit/python3.7.3", +} + +// ImageByName mangles the image name used locally. +// +// For now, this is implemented as a static lookup table. In a subsequent +// change, this will be used to reference a locally-generated image. +func ImageByName(name string) string { + actual, ok := nameToActual[name] + if !ok { + panic(fmt.Sprintf("unknown image: %v", name)) + } + // A terrible hack, for now execute a manual pull. + if out, err := exec.Command("docker", "pull", actual).CombinedOutput(); err != nil { + panic(fmt.Sprintf("error pulling image %q -> %q: %v, out: %s", name, actual, err, string(out))) + } + return actual +} + +// ConfigureExePath configures the executable for runsc in the test environment. +func ConfigureExePath() error { + path, err := FindFile("runsc/runsc") + if err != nil { + return err + } + specutils.ExePath = path + return nil +} + +// TmpDir returns the absolute path to a writable directory that can be used as +// scratch by the test. +func TmpDir() string { + dir := os.Getenv("TEST_TMPDIR") + if dir == "" { + dir = "/tmp" + } + return dir +} + +// Logger is a simple logging wrapper. +// +// This is designed to be implemented by *testing.T. +type Logger interface { + Name() string + Logf(fmt string, args ...interface{}) +} + +// DefaultLogger logs using the log package. +type DefaultLogger string + +// Name implements Logger.Name. +func (d DefaultLogger) Name() string { + return string(d) +} + +// Logf implements Logger.Logf. +func (d DefaultLogger) Logf(fmt string, args ...interface{}) { + log.Printf(fmt, args...) +} + +// Cmd is a simple wrapper. +type Cmd struct { + logger Logger + *exec.Cmd +} + +// CombinedOutput returns the output and logs. +func (c *Cmd) CombinedOutput() ([]byte, error) { + out, err := c.Cmd.CombinedOutput() + if len(out) > 0 { + c.logger.Logf("output: %s", string(out)) + } + if err != nil { + c.logger.Logf("error: %v", err) + } + return out, err +} + +// Command is a simple wrapper around exec.Command, that logs. +func Command(logger Logger, args ...string) *Cmd { + logger.Logf("command: %s", strings.Join(args, " ")) + return &Cmd{ + logger: logger, + Cmd: exec.Command(args[0], args[1:]...), + } +} + +// TestConfig returns the default configuration to use in tests. Note that +// 'RootDir' must be set by caller if required. +func TestConfig(t *testing.T) *boot.Config { + logDir := os.TempDir() + if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok { + logDir = dir + "/" + } + return &boot.Config{ + Debug: true, + DebugLog: path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"), + LogFormat: "text", + DebugLogFormat: "text", + LogPackets: true, + Network: boot.NetworkNone, + Strace: true, + Platform: "ptrace", + FileAccess: boot.FileAccessExclusive, + NumNetworkChannels: 1, + + TestOnlyAllowRunAsCurrentUserWithoutChroot: true, + } +} + +// NewSpecWithArgs creates a simple spec with the given args suitable for use +// in tests. +func NewSpecWithArgs(args ...string) *specs.Spec { + return &specs.Spec{ + // The host filesystem root is the container root. + Root: &specs.Root{ + Path: "/", + Readonly: true, + }, + Process: &specs.Process{ + Args: args, + Env: []string{ + "PATH=" + os.Getenv("PATH"), + }, + Capabilities: specutils.AllCapabilities(), + }, + Mounts: []specs.Mount{ + // Hide the host /etc to avoid any side-effects. + // For example, bash reads /etc/passwd and if it is + // very big, tests can fail by timeout. + { + Type: "tmpfs", + Destination: "/etc", + }, + // Root is readonly, but many tests want to write to tmpdir. + // This creates a writable mount inside the root. Also, when tmpdir points + // to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs + // inside the sentry. + { + Type: "bind", + Destination: TmpDir(), + Source: TmpDir(), + }, + }, + Hostname: "runsc-test-hostname", + } +} + +// SetupRootDir creates a root directory for containers. +func SetupRootDir() (string, func(), error) { + rootDir, err := ioutil.TempDir(TmpDir(), "containers") + if err != nil { + return "", nil, fmt.Errorf("error creating root dir: %v", err) + } + return rootDir, func() { os.RemoveAll(rootDir) }, nil +} + +// SetupContainer creates a bundle and root dir for the container, generates a +// test config, and writes the spec to config.json in the bundle dir. +func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, cleanup func(), err error) { + rootDir, rootCleanup, err := SetupRootDir() + if err != nil { + return "", "", nil, err + } + conf.RootDir = rootDir + bundleDir, bundleCleanup, err := SetupBundleDir(spec) + if err != nil { + rootCleanup() + return "", "", nil, err + } + return rootDir, bundleDir, func() { + bundleCleanup() + rootCleanup() + }, err +} + +// SetupBundleDir creates a bundle dir and writes the spec to config.json. +func SetupBundleDir(spec *specs.Spec) (string, func(), error) { + bundleDir, err := ioutil.TempDir(TmpDir(), "bundle") + if err != nil { + return "", nil, fmt.Errorf("error creating bundle dir: %v", err) + } + cleanup := func() { os.RemoveAll(bundleDir) } + if err := writeSpec(bundleDir, spec); err != nil { + cleanup() + return "", nil, fmt.Errorf("error writing spec: %v", err) + } + return bundleDir, cleanup, nil +} + +// writeSpec writes the spec to disk in the given directory. +func writeSpec(dir string, spec *specs.Spec) error { + b, err := json.Marshal(spec) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) +} + +// RandomID returns 20 random bytes following the given prefix. +func RandomID(prefix string) string { + // Read 20 random bytes. + b := make([]byte, 20) + // "[Read] always returns len(p) and a nil error." --godoc + if _, err := rand.Read(b); err != nil { + panic("rand.Read failed: " + err.Error()) + } + return fmt.Sprintf("%s-%s", prefix, base32.StdEncoding.EncodeToString(b)) +} + +// RandomContainerID generates a random container id for each test. +// +// The container id is used to create an abstract unix domain socket, which +// must be unique. While the container forbids creating two containers with the +// same name, sometimes between test runs the socket does not get cleaned up +// quickly enough, causing container creation to fail. +func RandomContainerID() string { + return RandomID("test-container-") +} + +// Copy copies file from src to dst. +func Copy(src, dst string) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + st, err := in.Stat() + if err != nil { + return err + } + + out, err := os.OpenFile(dst, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, st.Mode().Perm()) + if err != nil { + return err + } + defer out.Close() + + // Mirror the local user's permissions across all users. This is + // because as we inject things into the container, the UID/GID will + // change. Also, the build system may generate artifacts with different + // modes. At the top-level (volume mapping) we have a big read-only + // knob that can be applied to prevent modifications. + // + // Note that this must be done via a separate Chmod call, otherwise the + // current process's umask will get in the way. + var mode os.FileMode + if st.Mode()&0100 != 0 { + mode |= 0111 + } + if st.Mode()&0200 != 0 { + mode |= 0222 + } + if st.Mode()&0400 != 0 { + mode |= 0444 + } + if err := os.Chmod(dst, mode); err != nil { + return err + } + + _, err = io.Copy(out, in) + return err +} + +// Poll is a shorthand function to poll for something with given timeout. +func Poll(cb func() error, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) + return backoff.Retry(cb, b) +} + +// WaitForHTTP tries GET requests on a port until the call succeeds or timeout. +func WaitForHTTP(port int, timeout time.Duration) error { + cb := func() error { + c := &http.Client{ + // Calculate timeout to be able to do minimum 5 attempts. + Timeout: timeout / 5, + } + url := fmt.Sprintf("http://localhost:%d/", port) + resp, err := c.Get(url) + if err != nil { + log.Printf("Waiting %s: %v", url, err) + return err + } + resp.Body.Close() + return nil + } + return Poll(cb, timeout) +} + +// Reaper reaps child processes. +type Reaper struct { + // mu protects ch, which will be nil if the reaper is not running. + mu sync.Mutex + ch chan os.Signal +} + +// Start starts reaping child processes. +func (r *Reaper) Start() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ch != nil { + panic("reaper.Start called on a running reaper") + } + + r.ch = make(chan os.Signal, 1) + signal.Notify(r.ch, syscall.SIGCHLD) + + go func() { + for { + r.mu.Lock() + ch := r.ch + r.mu.Unlock() + if ch == nil { + return + } + + _, ok := <-ch + if !ok { + // Channel closed. + return + } + for { + cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil) + if cpid < 1 { + break + } + } + } + }() +} + +// Stop stops reaping child processes. +func (r *Reaper) Stop() { + r.mu.Lock() + defer r.mu.Unlock() + + if r.ch == nil { + panic("reaper.Stop called on a stopped reaper") + } + + signal.Stop(r.ch) + close(r.ch) + r.ch = nil +} + +// StartReaper is a helper that starts a new Reaper and returns a function to +// stop it. +func StartReaper() func() { + r := &Reaper{} + r.Start() + return r.Stop +} + +// WaitUntilRead reads from the given reader until the wanted string is found +// or until timeout. +func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { + sc := bufio.NewScanner(r) + if split != nil { + sc.Split(split) + } + // done must be accessed atomically. A value greater than 0 indicates + // that the read loop can exit. + var done uint32 + doneCh := make(chan struct{}) + go func() { + for sc.Scan() { + t := sc.Text() + if strings.Contains(t, want) { + atomic.StoreUint32(&done, 1) + close(doneCh) + break + } + if atomic.LoadUint32(&done) > 0 { + break + } + } + }() + select { + case <-time.After(timeout): + atomic.StoreUint32(&done, 1) + return fmt.Errorf("timeout waiting to read %q", want) + case <-doneCh: + return nil + } +} + +// KillCommand kills the process running cmd unless it hasn't been started. It +// returns an error if it cannot kill the process unless the reason is that the +// process has already exited. +// +// KillCommand will also reap the process. +func KillCommand(cmd *exec.Cmd) error { + if cmd.Process == nil { + return nil + } + if err := cmd.Process.Kill(); err != nil { + if !strings.Contains(err.Error(), "process already finished") { + return fmt.Errorf("failed to kill process %v: %v", cmd, err) + } + } + return cmd.Wait() +} + +// WriteTmpFile writes text to a temporary file, closes the file, and returns +// the name of the file. A cleanup function is also returned. +func WriteTmpFile(pattern, text string) (string, func(), error) { + file, err := ioutil.TempFile(TmpDir(), pattern) + if err != nil { + return "", nil, err + } + defer file.Close() + if _, err := file.Write([]byte(text)); err != nil { + return "", nil, err + } + return file.Name(), func() { os.RemoveAll(file.Name()) }, nil +} + +// IsStatic returns true iff the given file is a static binary. +func IsStatic(filename string) (bool, error) { + f, err := elf.Open(filename) + if err != nil { + return false, err + } + for _, prog := range f.Progs { + if prog.Type == elf.PT_INTERP { + return false, nil // Has interpreter. + } + } + return true, nil +} + +// TestIndicesForShard returns indices for this test shard based on the +// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. +// +// If either of the env vars are not present, then the function will return all +// tests. If there are more shards than there are tests, then the returned list +// may be empty. +func TestIndicesForShard(numTests int) ([]int, error) { + var ( + shardIndex = 0 + shardTotal = 1 + ) + + indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS") + if indexStr != "" && totalStr != "" { + // Parse index and total to ints. + var err error + shardIndex, err = strconv.Atoi(indexStr) + if err != nil { + return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err) + } + shardTotal, err = strconv.Atoi(totalStr) + if err != nil { + return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err) + } + } + + // Calculate! + var indices []int + numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal))) + for i := 0; i < numBlocks; i++ { + pick := i*shardTotal + shardIndex + if pick < numTests { + indices = append(indices, pick) + } + } + return indices, nil +} diff --git a/pkg/test/testutil/testutil_runfiles.go b/pkg/test/testutil/testutil_runfiles.go new file mode 100644 index 000000000..ece9ea9a1 --- /dev/null +++ b/pkg/test/testutil/testutil_runfiles.go @@ -0,0 +1,75 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package testutil + +import ( + "fmt" + "os" + "path/filepath" +) + +// FindFile searchs for a file inside the test run environment. It returns the +// full path to the file. It fails if none or more than one file is found. +func FindFile(path string) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", err + } + + // The test root is demarcated by a path element called "__main__". Search for + // it backwards from the working directory. + root := wd + for { + dir, name := filepath.Split(root) + if name == "__main__" { + break + } + if len(dir) == 0 { + return "", fmt.Errorf("directory __main__ not found in %q", wd) + } + // Remove ending slash to loop around. + root = dir[:len(dir)-1] + } + + // Annoyingly, bazel adds the build type to the directory path for go + // binaries, but not for c++ binaries. We use two different patterns to + // to find our file. + patterns := []string{ + // Try the obvious path first. + filepath.Join(root, path), + // If it was a go binary, use a wildcard to match the build + // type. The pattern is: /test-path/__main__/directories/*/file. + filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), + } + + for _, p := range patterns { + matches, err := filepath.Glob(p) + if err != nil { + // "The only possible returned error is ErrBadPattern, + // when pattern is malformed." -godoc + return "", fmt.Errorf("error globbing %q: %v", p, err) + } + switch len(matches) { + case 0: + // Try the next pattern. + case 1: + // We found it. + return matches[0], nil + default: + return "", fmt.Errorf("more than one match found for %q: %s", path, matches) + } + } + return "", fmt.Errorf("file %q not found", path) +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 72c2fe381..69dcc74f2 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -23,6 +23,7 @@ go_library( "vfs.go", ], visibility = [ + "//pkg/test:__subpackages__", "//runsc:__subpackages__", "//test:__subpackages__", ], diff --git a/runsc/cmd/BUILD b/runsc/cmd/BUILD index 4900fbe16..af3538ef0 100644 --- a/runsc/cmd/BUILD +++ b/runsc/cmd/BUILD @@ -82,11 +82,11 @@ go_test( "//pkg/log", "//pkg/sentry/control", "//pkg/sentry/kernel/auth", + "//pkg/test/testutil", "//pkg/urpc", "//runsc/boot", "//runsc/container", "//runsc/specutils", - "//runsc/testutil", "@com_github_google_go-cmp//cmp:go_default_library", "@com_github_google_go-cmp//cmp/cmpopts:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/cmd/capability_test.go b/runsc/cmd/capability_test.go index 9360d7442..a84067112 100644 --- a/runsc/cmd/capability_test.go +++ b/runsc/cmd/capability_test.go @@ -23,10 +23,10 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/syndtr/gocapability/capability" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" ) func init() { @@ -90,16 +90,15 @@ func TestCapabilities(t *testing.T) { // Use --network=host to make sandbox use spec's capabilities. conf.Network = boot.NetworkHost - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := container.Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } diff --git a/runsc/container/BUILD b/runsc/container/BUILD index 0aaeea3a8..331b8e866 100644 --- a/runsc/container/BUILD +++ b/runsc/container/BUILD @@ -33,13 +33,15 @@ go_test( size = "large", srcs = [ "console_test.go", + "container_norace_test.go", + "container_race_test.go", "container_test.go", "multi_container_test.go", "shared_volume_test.go", ], data = [ "//runsc", - "//runsc/container/test_app", + "//test/cmd/test_app", ], library = ":container", shard_count = 5, @@ -54,12 +56,12 @@ go_test( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sync", + "//pkg/test/testutil", "//pkg/unet", "//pkg/urpc", "//runsc/boot", "//runsc/boot/platforms", "//runsc/specutils", - "//runsc/testutil", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_kr_pty//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/runsc/container/console_test.go b/runsc/container/console_test.go index af245b6d8..294dca5e7 100644 --- a/runsc/container/console_test.go +++ b/runsc/container/console_test.go @@ -29,9 +29,9 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/urpc" - "gvisor.dev/gvisor/runsc/testutil" ) // socketPath creates a path inside bundleDir and ensures that the returned @@ -58,25 +58,26 @@ func socketPath(bundleDir string) (string, error) { } // createConsoleSocket creates a socket at the given path that will receive a -// console fd from the sandbox. If no error occurs, it returns the server -// socket and a cleanup function. -func createConsoleSocket(path string) (*unet.ServerSocket, func() error, error) { +// console fd from the sandbox. If an error occurs, t.Fatalf will be called. +// The function returning should be deferred as cleanup. +func createConsoleSocket(t *testing.T, path string) (*unet.ServerSocket, func()) { + t.Helper() srv, err := unet.BindAndListen(path, false) if err != nil { - return nil, nil, fmt.Errorf("error binding and listening to socket %q: %v", path, err) + t.Fatalf("error binding and listening to socket %q: %v", path, err) } - cleanup := func() error { + cleanup := func() { + // Log errors; nothing can be done. if err := srv.Close(); err != nil { - return fmt.Errorf("error closing socket %q: %v", path, err) + t.Logf("error closing socket %q: %v", path, err) } if err := os.Remove(path); err != nil { - return fmt.Errorf("error removing socket %q: %v", path, err) + t.Logf("error removing socket %q: %v", path, err) } - return nil } - return srv, cleanup, nil + return srv, cleanup } // receiveConsolePTY accepts a connection on the server socket and reads fds. @@ -118,45 +119,42 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) { // Test that an pty FD is sent over the console socket if one is provided. func TestConsoleSocket(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - spec := testutil.NewSpecWithArgs("true") - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("true") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - sock, err := socketPath(bundleDir) - if err != nil { - t.Fatalf("error getting socket path: %v", err) - } - srv, cleanup, err := createConsoleSocket(sock) - if err != nil { - t.Fatalf("error creating socket at %q: %v", sock, err) - } - defer cleanup() - - // Create the container and pass the socket name. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - ConsoleSocket: sock, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() + sock, err := socketPath(bundleDir) + if err != nil { + t.Fatalf("error getting socket path: %v", err) + } + srv, cleanup := createConsoleSocket(t, sock) + defer cleanup() + + // Create the container and pass the socket name. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + ConsoleSocket: sock, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() - // Make sure we get a console PTY. - ptyMaster, err := receiveConsolePTY(srv) - if err != nil { - t.Fatalf("error receiving console FD: %v", err) - } - ptyMaster.Close() + // Make sure we get a console PTY. + ptyMaster, err := receiveConsolePTY(srv) + if err != nil { + t.Fatalf("error receiving console FD: %v", err) + } + ptyMaster.Close() + }) } } @@ -165,16 +163,15 @@ func TestJobControlSignalExec(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -292,26 +289,22 @@ func TestJobControlSignalRootContainer(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/bash", "--noprofile", "--norc") spec.Process.Terminal = true - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() sock, err := socketPath(bundleDir) if err != nil { t.Fatalf("error getting socket path: %v", err) } - srv, cleanup, err := createConsoleSocket(sock) - if err != nil { - t.Fatalf("error creating socket at %q: %v", sock, err) - } + srv, cleanup := createConsoleSocket(t, sock) defer cleanup() // Create the container and pass the socket name. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, ConsoleSocket: sock, @@ -368,7 +361,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { {PID: 1, Cmd: "bash", Threads: []kernel.ThreadID{1}}, } if err := waitForProcessList(c, expectedPL); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } // Execute sleep via the terminal. @@ -377,7 +370,7 @@ func TestJobControlSignalRootContainer(t *testing.T) { // Wait for sleep to start. expectedPL = append(expectedPL, &control.Process{PID: 2, PPID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{2}}) if err := waitForProcessList(c, expectedPL); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } // Reset the pty buffer, so there is less output for us to scan later. diff --git a/runsc/container/container.go b/runsc/container/container.go index 7233659b1..117ea7d7b 100644 --- a/runsc/container/container.go +++ b/runsc/container/container.go @@ -274,7 +274,7 @@ func New(conf *boot.Config, args Args) (*Container, error) { } if err := os.MkdirAll(conf.RootDir, 0711); err != nil { - return nil, fmt.Errorf("creating container root directory: %v", err) + return nil, fmt.Errorf("creating container root directory %q: %v", conf.RootDir, err) } c := &Container{ diff --git a/runsc/container/container_norace_test.go b/runsc/container/container_norace_test.go new file mode 100644 index 000000000..838c1e20a --- /dev/null +++ b/runsc/container/container_norace_test.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !race + +package container + +// Allow both kvm and ptrace for non-race builds. +var platformOptions = []configOption{ptrace, kvm} diff --git a/runsc/container/container_race_test.go b/runsc/container/container_race_test.go new file mode 100644 index 000000000..9fb4c4fc0 --- /dev/null +++ b/runsc/container/container_race_test.go @@ -0,0 +1,20 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build race + +package container + +// Only enabled ptrace with race builds. +var platformOptions = []configOption{ptrace} diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index 5db6d64aa..3ff89f38c 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -39,10 +39,10 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/boot/platforms" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" ) // waitForProcessList waits for the given process list to show up in the container. @@ -215,16 +215,15 @@ func readOutputNum(file string, position int) (int, error) { // run starts the sandbox and waits for it to exit, checking that the // application succeeded. func run(spec *specs.Spec, conf *boot.Config) error { - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { return fmt.Errorf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create, start and wait for the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, Attached: true, @@ -243,35 +242,41 @@ type configOption int const ( overlay configOption = iota + ptrace kvm nonExclusiveFS ) -var noOverlay = []configOption{kvm, nonExclusiveFS} -var all = append(noOverlay, overlay) +var ( + noOverlay = append(platformOptions, nonExclusiveFS) + all = append(noOverlay, overlay) +) // configs generates different configurations to run tests. -func configs(t *testing.T, opts ...configOption) []*boot.Config { +func configs(t *testing.T, opts ...configOption) map[string]*boot.Config { // Always load the default config. - cs := []*boot.Config{testutil.TestConfig(t)} - + cs := make(map[string]*boot.Config) for _, o := range opts { - c := testutil.TestConfig(t) switch o { case overlay: + c := testutil.TestConfig(t) c.Overlay = true + cs["overlay"] = c + case ptrace: + c := testutil.TestConfig(t) + c.Platform = platforms.Ptrace + cs["ptrace"] = c case kvm: - // TODO(b/112165693): KVM tests are flaky. Disable until fixed. - continue - + c := testutil.TestConfig(t) c.Platform = platforms.KVM + cs["kvm"] = c case nonExclusiveFS: + c := testutil.TestConfig(t) c.FileAccess = boot.FileAccessShared + cs["non-exclusive"] = c default: panic(fmt.Sprintf("unknown config option %v", o)) - } - cs = append(cs, c) } return cs } @@ -285,133 +290,133 @@ func TestLifecycle(t *testing.T) { childReaper.Start() defer childReaper.Stop() - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - // The container will just sleep for a long time. We will kill it before - // it finishes sleeping. - spec := testutil.NewSpecWithArgs("sleep", "100") - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) - - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, - }, - } - // Create the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + // The container will just sleep for a long time. We will kill it before + // it finishes sleeping. + spec := testutil.NewSpecWithArgs("sleep", "100") - // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := c.Status, Created; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // List should return the container id. - ids, err := List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) { - t.Errorf("container list got %v, want %v", got, want) - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, + }, + } + // Create the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() - // Start the container. - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := c.Status, Created; got != want { + t.Errorf("container status got %v, want %v", got, want) + } - // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := c.Status, Running; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // List should return the container id. + ids, err := List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if got, want := ids, []string{args.ID}; !reflect.DeepEqual(got, want) { + t.Errorf("container list got %v, want %v", got, want) + } - // Verify that "sleep 100" is running. - if err := waitForProcessList(c, expectedPL); err != nil { - t.Error(err) - } + // Start the container. + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Wait on the container. - var wg sync.WaitGroup - wg.Add(1) - ch := make(chan struct{}) - go func() { - ch <- struct{}{} - ws, err := c.Wait() + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) if err != nil { - t.Fatalf("error waiting on container: %v", err) + t.Fatalf("error loading container: %v", err) } - if got, want := ws.Signal(), syscall.SIGTERM; got != want { - t.Fatalf("got signal %v, want %v", got, want) + if got, want := c.Status, Running; got != want { + t.Errorf("container status got %v, want %v", got, want) } - wg.Done() - }() - // Wait a bit to ensure that we've started waiting on the - // container before we signal. - <-ch - time.Sleep(100 * time.Millisecond) - // Send the container a SIGTERM which will cause it to stop. - if err := c.SignalContainer(syscall.SIGTERM, false); err != nil { - t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) - } - // Wait for it to die. - wg.Wait() + // Verify that "sleep 100" is running. + if err := waitForProcessList(c, expectedPL); err != nil { + t.Error(err) + } - // Load the container from disk and check the status. - c, err = Load(rootDir, args.ID) - if err != nil { - t.Fatalf("error loading container: %v", err) - } - if got, want := c.Status, Stopped; got != want { - t.Errorf("container status got %v, want %v", got, want) - } + // Wait on the container. + ch := make(chan error) + go func() { + ws, err := c.Wait() + if err != nil { + ch <- err + } + if got, want := ws.Signal(), syscall.SIGTERM; got != want { + ch <- fmt.Errorf("got signal %v, want %v", got, want) + } + ch <- nil + }() - // Destroy the container. - if err := c.Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + // Wait a bit to ensure that we've started waiting on + // the container before we signal. + time.Sleep(time.Second) - // List should not return the container id. - ids, err = List(rootDir) - if err != nil { - t.Fatalf("error listing containers: %v", err) - } - if len(ids) != 0 { - t.Errorf("expected container list to be empty, but got %v", ids) - } + // Send the container a SIGTERM which will cause it to stop. + if err := c.SignalContainer(syscall.SIGTERM, false); err != nil { + t.Fatalf("error sending signal %v to container: %v", syscall.SIGTERM, err) + } - // Loading the container by id should fail. - if _, err = Load(rootDir, args.ID); err == nil { - t.Errorf("expected loading destroyed container to fail, but it did not") - } + // Wait for it to die. + if err := <-ch; err != nil { + t.Fatalf("error waiting for container: %v", err) + } + + // Load the container from disk and check the status. + c, err = Load(rootDir, args.ID) + if err != nil { + t.Fatalf("error loading container: %v", err) + } + if got, want := c.Status, Stopped; got != want { + t.Errorf("container status got %v, want %v", got, want) + } + + // Destroy the container. + if err := c.Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } + + // List should not return the container id. + ids, err = List(rootDir) + if err != nil { + t.Fatalf("error listing containers: %v", err) + } + if len(ids) != 0 { + t.Errorf("expected container list to be empty, but got %v", ids) + } + + // Loading the container by id should fail. + if _, err = Load(rootDir, args.ID); err == nil { + t.Errorf("expected loading destroyed container to fail, but it did not") + } + }) } } @@ -420,12 +425,14 @@ func TestExePath(t *testing.T) { // Create two directories that will be prepended to PATH. firstPath, err := ioutil.TempDir(testutil.TmpDir(), "first") if err != nil { - t.Fatal(err) + t.Fatalf("error creating temporary directory: %v", err) } + defer os.RemoveAll(firstPath) secondPath, err := ioutil.TempDir(testutil.TmpDir(), "second") if err != nil { - t.Fatal(err) + t.Fatalf("error creating temporary directory: %v", err) } + defer os.RemoveAll(secondPath) // Create two minimal executables in the second path, two of which // will be masked by files in first path. @@ -433,11 +440,11 @@ func TestExePath(t *testing.T) { path := filepath.Join(secondPath, p) f, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0777) if err != nil { - t.Fatal(err) + t.Fatalf("error opening path: %v", err) } defer f.Close() if _, err := io.WriteString(f, "#!/bin/true\n"); err != nil { - t.Fatal(err) + t.Fatalf("error writing contents: %v", err) } } @@ -446,7 +453,7 @@ func TestExePath(t *testing.T) { nonExecutable := filepath.Join(firstPath, "masked1") f2, err := os.OpenFile(nonExecutable, os.O_CREATE|os.O_EXCL, 0666) if err != nil { - t.Fatal(err) + t.Fatalf("error opening file: %v", err) } f2.Close() @@ -454,68 +461,69 @@ func TestExePath(t *testing.T) { // executable in the second. nonRegular := filepath.Join(firstPath, "masked2") if err := os.Mkdir(nonRegular, 0777); err != nil { - t.Fatal(err) - } - - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - for _, test := range []struct { - path string - success bool - }{ - {path: "true", success: true}, - {path: "bin/true", success: true}, - {path: "/bin/true", success: true}, - {path: "thisfiledoesntexit", success: false}, - {path: "bin/thisfiledoesntexit", success: false}, - {path: "/bin/thisfiledoesntexit", success: false}, - - {path: "unmasked", success: true}, - {path: filepath.Join(firstPath, "unmasked"), success: false}, - {path: filepath.Join(secondPath, "unmasked"), success: true}, - - {path: "masked1", success: true}, - {path: filepath.Join(firstPath, "masked1"), success: false}, - {path: filepath.Join(secondPath, "masked1"), success: true}, - - {path: "masked2", success: true}, - {path: filepath.Join(firstPath, "masked2"), success: false}, - {path: filepath.Join(secondPath, "masked2"), success: true}, - } { - spec := testutil.NewSpecWithArgs(test.path) - spec.Process.Env = []string{ - fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")), - } - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("exec: %s, error setting up container: %v", test.path, err) - } - - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - Attached: true, - } - ws, err := Run(conf, args) + t.Fatalf("error making directory: %v", err) + } + + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + for _, test := range []struct { + path string + success bool + }{ + {path: "true", success: true}, + {path: "bin/true", success: true}, + {path: "/bin/true", success: true}, + {path: "thisfiledoesntexit", success: false}, + {path: "bin/thisfiledoesntexit", success: false}, + {path: "/bin/thisfiledoesntexit", success: false}, + + {path: "unmasked", success: true}, + {path: filepath.Join(firstPath, "unmasked"), success: false}, + {path: filepath.Join(secondPath, "unmasked"), success: true}, + + {path: "masked1", success: true}, + {path: filepath.Join(firstPath, "masked1"), success: false}, + {path: filepath.Join(secondPath, "masked1"), success: true}, + + {path: "masked2", success: true}, + {path: filepath.Join(firstPath, "masked2"), success: false}, + {path: filepath.Join(secondPath, "masked2"), success: true}, + } { + t.Run(fmt.Sprintf("path=%s,success=%t", test.path, test.success), func(t *testing.T) { + spec := testutil.NewSpecWithArgs(test.path) + spec.Process.Env = []string{ + fmt.Sprintf("PATH=%s:%s:%s", firstPath, secondPath, os.Getenv("PATH")), + } - os.RemoveAll(rootDir) - os.RemoveAll(bundleDir) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("exec: error setting up container: %v", err) + } + defer cleanup() - if test.success { - if err != nil { - t.Errorf("exec: %s, error running container: %v", test.path, err) - } - if ws.ExitStatus() != 0 { - t.Errorf("exec: %s, got exit status %v want %v", test.path, ws.ExitStatus(), 0) - } - } else { - if err == nil { - t.Errorf("exec: %s, got: no error, want: error", test.path) - } + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + Attached: true, + } + ws, err := Run(conf, args) + + if test.success { + if err != nil { + t.Errorf("exec: error running container: %v", err) + } + if ws.ExitStatus() != 0 { + t.Errorf("exec: got exit status %v want %v", ws.ExitStatus(), 0) + } + } else { + if err == nil { + t.Errorf("exec: got: no error, want: error") + } + } + }) } - } + }) } } @@ -534,15 +542,14 @@ func doAppExitStatus(t *testing.T, vfs2 bool) { succSpec := testutil.NewSpecWithArgs("true") conf := testutil.TestConfig(t) conf.VFS2 = vfs2 - rootDir, bundleDir, err := testutil.SetupContainer(succSpec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(succSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: succSpec, BundleDir: bundleDir, Attached: true, @@ -559,15 +566,14 @@ func doAppExitStatus(t *testing.T, vfs2 bool) { wantStatus := 123 errSpec := testutil.NewSpecWithArgs("bash", "-c", fmt.Sprintf("exit %d", wantStatus)) - rootDir2, bundleDir2, err := testutil.SetupContainer(errSpec, conf) + _, bundleDir2, cleanup2, err := testutil.SetupContainer(errSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir2) - defer os.RemoveAll(bundleDir2) + defer cleanup2() args2 := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: errSpec, BundleDir: bundleDir2, Attached: true, @@ -583,166 +589,163 @@ func doAppExitStatus(t *testing.T, vfs2 bool) { // TestExec verifies that a container can exec a new program. func TestExec(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + const uid = 343 + spec := testutil.NewSpecWithArgs("sleep", "100") - const uid = 343 - spec := testutil.NewSpecWithArgs("sleep", "100") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Create and start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{2}, + }, + } - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{2}, - }, - } + // Verify that "sleep 100" is running. + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { + t.Error(err) + } - // Verify that "sleep 100" is running. - if err := waitForProcessList(cont, expectedPL[:1]); err != nil { - t.Error(err) - } + execArgs := &control.ExecArgs{ + Filename: "/bin/sleep", + Argv: []string{"/bin/sleep", "5"}, + WorkingDirectory: "/", + KUID: uid, + } - execArgs := &control.ExecArgs{ - Filename: "/bin/sleep", - Argv: []string{"/bin/sleep", "5"}, - WorkingDirectory: "/", - KUID: uid, - } + // Verify that "sleep 100" and "sleep 5" are running + // after exec. First, start running exec (whick + // blocks). + ch := make(chan error) + go func() { + exitStatus, err := cont.executeSync(execArgs) + if err != nil { + ch <- err + } else if exitStatus != 0 { + ch <- fmt.Errorf("failed with exit status: %v", exitStatus) + } else { + ch <- nil + } + }() - // Verify that "sleep 100" and "sleep 5" are running after exec. - // First, start running exec (whick blocks). - status := make(chan error, 1) - go func() { - exitStatus, err := cont.executeSync(execArgs) - if err != nil { - log.Debugf("error executing: %v", err) - status <- err - } else if exitStatus != 0 { - log.Debugf("bad status: %d", exitStatus) - status <- fmt.Errorf("failed with exit status: %v", exitStatus) - } else { - status <- nil + if err := waitForProcessList(cont, expectedPL); err != nil { + t.Fatalf("error waiting for processes: %v", err) } - }() - - if err := waitForProcessList(cont, expectedPL); err != nil { - t.Fatal(err) - } - // Ensure that exec finished without error. - select { - case <-time.After(10 * time.Second): - t.Fatalf("container timed out waiting for exec to finish.") - case st := <-status: - if st != nil { - t.Errorf("container failed to exec %v: %v", args, err) + // Ensure that exec finished without error. + select { + case <-time.After(10 * time.Second): + t.Fatalf("container timed out waiting for exec to finish.") + case err := <-ch: + if err != nil { + t.Errorf("container failed to exec %v: %v", args, err) + } } - } + }) } } // TestKillPid verifies that we can signal individual exec'd processes. func TestKillPid(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - - app, err := testutil.FindFile("runsc/container/test_app/test_app") - if err != nil { - t.Fatal("error finding test_app:", err) - } + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } - const nProcs = 4 - spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true") - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + const nProcs = 4 + spec := testutil.NewSpecWithArgs(app, "task-tree", "--depth", strconv.Itoa(nProcs-1), "--width=1", "--pause=true") + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create and start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Verify that all processes are running. - if err := waitForProcessCount(cont, nProcs); err != nil { - t.Fatalf("timed out waiting for processes to start: %v", err) - } + // Verify that all processes are running. + if err := waitForProcessCount(cont, nProcs); err != nil { + t.Fatalf("timed out waiting for processes to start: %v", err) + } - // Kill the child process with the largest PID. - procs, err := cont.Processes() - if err != nil { - t.Fatalf("failed to get process list: %v", err) - } - var pid int32 - for _, p := range procs { - if pid < int32(p.PID) { - pid = int32(p.PID) + // Kill the child process with the largest PID. + procs, err := cont.Processes() + if err != nil { + t.Fatalf("failed to get process list: %v", err) + } + var pid int32 + for _, p := range procs { + if pid < int32(p.PID) { + pid = int32(p.PID) + } + } + if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil { + t.Fatalf("failed to signal process %d: %v", pid, err) } - } - if err := cont.SignalProcess(syscall.SIGKILL, pid); err != nil { - t.Fatalf("failed to signal process %d: %v", pid, err) - } - // Verify that one process is gone. - if err := waitForProcessCount(cont, nProcs-1); err != nil { - t.Fatal(err) - } + // Verify that one process is gone. + if err := waitForProcessCount(cont, nProcs-1); err != nil { + t.Fatalf("error waiting for processes: %v", err) + } - procs, err = cont.Processes() - if err != nil { - t.Fatalf("failed to get process list: %v", err) - } - for _, p := range procs { - if pid == int32(p.PID) { - t.Fatalf("pid %d is still alive, which should be killed", pid) + procs, err = cont.Processes() + if err != nil { + t.Fatalf("failed to get process list: %v", err) } - } + for _, p := range procs { + if pid == int32(p.PID) { + t.Fatalf("pid %d is still alive, which should be killed", pid) + } + } + }) } } @@ -753,160 +756,160 @@ func TestKillPid(t *testing.T) { // be the next consecutive number after the last number from the checkpointed container. func TestCheckpointRestore(t *testing.T) { // Skip overlay because test requires writing to host file. - for _, conf := range configs(t, noOverlay...) { - t.Logf("Running test with conf: %+v", conf) - - dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") - if err != nil { - t.Fatalf("ioutil.TempDir failed: %v", err) - } - if err := os.Chmod(dir, 0777); err != nil { - t.Fatalf("error chmoding file: %q, %v", dir, err) - } + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "checkpoint-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + defer os.RemoveAll(dir) + if err := os.Chmod(dir, 0777); err != nil { + t.Fatalf("error chmoding file: %q, %v", dir, err) + } - outputPath := filepath.Join(dir, "output") - outputFile, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile.Close() + outputPath := filepath.Join(dir, "output") + outputFile, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() - script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath) - spec := testutil.NewSpecWithArgs("bash", "-c", script) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + script := fmt.Sprintf("for ((i=0; ;i++)); do echo $i >> %q; sleep 1; done", outputPath) + spec := testutil.NewSpecWithArgs("bash", "-c", script) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create and start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Set the image path, which is where the checkpoint image will be saved. - imagePath := filepath.Join(dir, "test-image-file") + // Set the image path, which is where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") - // Create the image file and open for writing. - file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) - if err != nil { - t.Fatalf("error opening new file at imagePath: %v", err) - } - defer file.Close() + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() - // Wait until application has ran. - if err := waitForFileNotEmpty(outputFile); err != nil { - t.Fatalf("Failed to wait for output file: %v", err) - } + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } - // Checkpoint running container; save state into new file. - if err := cont.Checkpoint(file); err != nil { - t.Fatalf("error checkpointing container to empty file: %v", err) - } - defer os.RemoveAll(imagePath) + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } + defer os.RemoveAll(imagePath) - lastNum, err := readOutputNum(outputPath, -1) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile2, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile2.Close() + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() - // Restore into a new container. - args2 := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont2, err := New(conf, args2) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont2.Destroy() + // Restore into a new container. + args2 := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont2, err := New(conf, args2) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont2.Destroy() - if err := cont2.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont2.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - // Wait until application has ran. - if err := waitForFileNotEmpty(outputFile2); err != nil { - t.Fatalf("Failed to wait for output file: %v", err) - } + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } - firstNum, err := readOutputNum(outputPath, 0) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks - // up from where it left off. - if lastNum+1 != firstNum { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) - } - cont2.Destroy() + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. + if lastNum+1 != firstNum { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum) + } + cont2.Destroy() - // Restore into another container! - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile3, err := createWriteableOutputFile(outputPath) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile3.Close() + // Restore into another container! + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile3, err := createWriteableOutputFile(outputPath) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile3.Close() - // Restore into a new container. - args3 := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont3, err := New(conf, args3) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont3.Destroy() + // Restore into a new container. + args3 := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont3, err := New(conf, args3) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont3.Destroy() - if err := cont3.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := cont3.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - // Wait until application has ran. - if err := waitForFileNotEmpty(outputFile3); err != nil { - t.Fatalf("Failed to wait for output file: %v", err) - } + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile3); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } - firstNum2, err := readOutputNum(outputPath, 0) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + firstNum2, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum and that the container picks - // up from where it left off. - if lastNum+1 != firstNum2 { - t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) - } - cont3.Destroy() + // Check that lastNum is one less than firstNum and that the container picks + // up from where it left off. + if lastNum+1 != firstNum2 { + t.Errorf("error numbers not in order, previous: %d, next: %d", lastNum, firstNum2) + } + cont3.Destroy() + }) } } @@ -914,135 +917,134 @@ func TestCheckpointRestore(t *testing.T) { // with filesystem Unix Domain Socket use. func TestUnixDomainSockets(t *testing.T) { // Skip overlay because test requires writing to host file. - for _, conf := range configs(t, noOverlay...) { - t.Logf("Running test with conf: %+v", conf) - - // UDS path is limited to 108 chars for compatibility with older systems. - // Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is - // not exceeded. Assumes '/tmp' exists in the system. - dir, err := ioutil.TempDir("/tmp", "uds-test") - if err != nil { - t.Fatalf("ioutil.TempDir failed: %v", err) - } - defer os.RemoveAll(dir) + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + // UDS path is limited to 108 chars for compatibility with older systems. + // Use '/tmp' (instead of testutil.TmpDir) to ensure the size limit is + // not exceeded. Assumes '/tmp' exists in the system. + dir, err := ioutil.TempDir("/tmp", "uds-test") + if err != nil { + t.Fatalf("ioutil.TempDir failed: %v", err) + } + defer os.RemoveAll(dir) - outputPath := filepath.Join(dir, "uds_output") - outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile.Close() + outputPath := filepath.Join(dir, "uds_output") + outputFile, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile.Close() - app, err := testutil.FindFile("runsc/container/test_app/test_app") - if err != nil { - t.Fatal("error finding test_app:", err) - } + app, err := testutil.FindFile("test/cmd/test_app/test_app") + if err != nil { + t.Fatal("error finding test_app:", err) + } - socketPath := filepath.Join(dir, "uds_socket") - defer os.Remove(socketPath) + socketPath := filepath.Join(dir, "uds_socket") + defer os.Remove(socketPath) - spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath) - spec.Process.User = specs.User{ - UID: uint32(os.Getuid()), - GID: uint32(os.Getgid()), - } - spec.Mounts = []specs.Mount{{ - Type: "bind", - Destination: dir, - Source: dir, - }} + spec := testutil.NewSpecWithArgs(app, "uds", "--file", outputPath, "--socket", socketPath) + spec.Process.User = specs.User{ + UID: uint32(os.Getuid()), + GID: uint32(os.Getgid()), + } + spec.Mounts = []specs.Mount{{ + Type: "bind", + Destination: dir, + Source: dir, + }} - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create and start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Set the image path, the location where the checkpoint image will be saved. - imagePath := filepath.Join(dir, "test-image-file") + // Set the image path, the location where the checkpoint image will be saved. + imagePath := filepath.Join(dir, "test-image-file") - // Create the image file and open for writing. - file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) - if err != nil { - t.Fatalf("error opening new file at imagePath: %v", err) - } - defer file.Close() - defer os.RemoveAll(imagePath) + // Create the image file and open for writing. + file, err := os.OpenFile(imagePath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0644) + if err != nil { + t.Fatalf("error opening new file at imagePath: %v", err) + } + defer file.Close() + defer os.RemoveAll(imagePath) - // Wait until application has ran. - if err := waitForFileNotEmpty(outputFile); err != nil { - t.Fatalf("Failed to wait for output file: %v", err) - } + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } - // Checkpoint running container; save state into new file. - if err := cont.Checkpoint(file); err != nil { - t.Fatalf("error checkpointing container to empty file: %v", err) - } + // Checkpoint running container; save state into new file. + if err := cont.Checkpoint(file); err != nil { + t.Fatalf("error checkpointing container to empty file: %v", err) + } - // Read last number outputted before checkpoint. - lastNum, err := readOutputNum(outputPath, -1) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + // Read last number outputted before checkpoint. + lastNum, err := readOutputNum(outputPath, -1) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Delete and recreate file before restoring. - if err := os.Remove(outputPath); err != nil { - t.Fatalf("error removing file") - } - outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) - if err != nil { - t.Fatalf("error creating output file: %v", err) - } - defer outputFile2.Close() + // Delete and recreate file before restoring. + if err := os.Remove(outputPath); err != nil { + t.Fatalf("error removing file") + } + outputFile2, err := os.OpenFile(outputPath, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666) + if err != nil { + t.Fatalf("error creating output file: %v", err) + } + defer outputFile2.Close() - // Restore into a new container. - argsRestore := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - contRestore, err := New(conf, argsRestore) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer contRestore.Destroy() + // Restore into a new container. + argsRestore := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + contRestore, err := New(conf, argsRestore) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer contRestore.Destroy() - if err := contRestore.Restore(spec, conf, imagePath); err != nil { - t.Fatalf("error restoring container: %v", err) - } + if err := contRestore.Restore(spec, conf, imagePath); err != nil { + t.Fatalf("error restoring container: %v", err) + } - // Wait until application has ran. - if err := waitForFileNotEmpty(outputFile2); err != nil { - t.Fatalf("Failed to wait for output file: %v", err) - } + // Wait until application has ran. + if err := waitForFileNotEmpty(outputFile2); err != nil { + t.Fatalf("Failed to wait for output file: %v", err) + } - // Read first number outputted after restore. - firstNum, err := readOutputNum(outputPath, 0) - if err != nil { - t.Fatalf("error with outputFile: %v", err) - } + // Read first number outputted after restore. + firstNum, err := readOutputNum(outputPath, 0) + if err != nil { + t.Fatalf("error with outputFile: %v", err) + } - // Check that lastNum is one less than firstNum. - if lastNum+1 != firstNum { - t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum) - } - contRestore.Destroy() + // Check that lastNum is one less than firstNum. + if lastNum+1 != firstNum { + t.Errorf("error numbers not consecutive, previous: %d, next: %d", lastNum, firstNum) + } + contRestore.Destroy() + }) } } @@ -1052,10 +1054,8 @@ func TestUnixDomainSockets(t *testing.T) { // recreated. Then it resumes the container, verify that the file gets created // again. func TestPauseResume(t *testing.T) { - for _, conf := range configs(t, noOverlay...) { - t.Run(fmt.Sprintf("conf: %+v", conf), func(t *testing.T) { - t.Logf("Running test with conf: %+v", conf) - + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "lock") if err != nil { t.Fatalf("error creating temp dir: %v", err) @@ -1066,16 +1066,15 @@ func TestPauseResume(t *testing.T) { script := fmt.Sprintf("while [[ true ]]; do touch %q; sleep 0.1; done", running) spec := testutil.NewSpecWithArgs("/bin/bash", "-c", script) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1134,16 +1133,15 @@ func TestPauseResume(t *testing.T) { func TestPauseResumeStatus(t *testing.T) { spec := testutil.NewSpecWithArgs("sleep", "20") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1199,359 +1197,356 @@ func TestCapabilities(t *testing.T) { uid := auth.KUID(os.Getuid() + 1) gid := auth.KGID(os.Getgid() + 1) - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - spec := testutil.NewSpecWithArgs("sleep", "100") - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("sleep", "100") + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create and start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer cont.Destroy() + if err := cont.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // expectedPL lists the expected process state of the container. - expectedPL := []*control.Process{ - { - UID: 0, - PID: 1, - PPID: 0, - C: 0, - Cmd: "sleep", - Threads: []kernel.ThreadID{1}, - }, - { - UID: uid, - PID: 2, - PPID: 0, - C: 0, - Cmd: "exe", - Threads: []kernel.ThreadID{2}, - }, - } - if err := waitForProcessList(cont, expectedPL[:1]); err != nil { - t.Fatalf("Failed to wait for sleep to start, err: %v", err) - } + // expectedPL lists the expected process state of the container. + expectedPL := []*control.Process{ + { + UID: 0, + PID: 1, + PPID: 0, + C: 0, + Cmd: "sleep", + Threads: []kernel.ThreadID{1}, + }, + { + UID: uid, + PID: 2, + PPID: 0, + C: 0, + Cmd: "exe", + Threads: []kernel.ThreadID{2}, + }, + } + if err := waitForProcessList(cont, expectedPL[:1]); err != nil { + t.Fatalf("Failed to wait for sleep to start, err: %v", err) + } - // Create an executable that can't be run with the specified UID:GID. - // This shouldn't be callable within the container until we add the - // CAP_DAC_OVERRIDE capability to skip the access check. - exePath := filepath.Join(rootDir, "exe") - if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { - t.Fatalf("couldn't create executable: %v", err) - } - defer os.Remove(exePath) - - // Need to traverse the intermediate directory. - os.Chmod(rootDir, 0755) - - execArgs := &control.ExecArgs{ - Filename: exePath, - Argv: []string{exePath}, - WorkingDirectory: "/", - KUID: uid, - KGID: gid, - Capabilities: &auth.TaskCapabilities{}, - } + // Create an executable that can't be run with the specified UID:GID. + // This shouldn't be callable within the container until we add the + // CAP_DAC_OVERRIDE capability to skip the access check. + exePath := filepath.Join(rootDir, "exe") + if err := ioutil.WriteFile(exePath, []byte("#!/bin/sh\necho hello"), 0770); err != nil { + t.Fatalf("couldn't create executable: %v", err) + } + defer os.Remove(exePath) + + // Need to traverse the intermediate directory. + os.Chmod(rootDir, 0755) + + execArgs := &control.ExecArgs{ + Filename: exePath, + Argv: []string{exePath}, + WorkingDirectory: "/", + KUID: uid, + KGID: gid, + Capabilities: &auth.TaskCapabilities{}, + } - // "exe" should fail because we don't have the necessary permissions. - if _, err := cont.executeSync(execArgs); err == nil { - t.Fatalf("container executed without error, but an error was expected") - } + // "exe" should fail because we don't have the necessary permissions. + if _, err := cont.executeSync(execArgs); err == nil { + t.Fatalf("container executed without error, but an error was expected") + } - // Now we run with the capability enabled and should succeed. - execArgs.Capabilities = &auth.TaskCapabilities{ - EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), - } - // "exe" should not fail this time. - if _, err := cont.executeSync(execArgs); err != nil { - t.Fatalf("container failed to exec %v: %v", args, err) - } + // Now we run with the capability enabled and should succeed. + execArgs.Capabilities = &auth.TaskCapabilities{ + EffectiveCaps: auth.CapabilitySetOf(linux.CAP_DAC_OVERRIDE), + } + // "exe" should not fail this time. + if _, err := cont.executeSync(execArgs); err != nil { + t.Fatalf("container failed to exec %v: %v", args, err) + } + }) } } // TestRunNonRoot checks that sandbox can be configured when running as // non-privileged user. func TestRunNonRoot(t *testing.T) { - for _, conf := range configs(t, noOverlay...) { - t.Logf("Running test with conf: %+v", conf) - - spec := testutil.NewSpecWithArgs("/bin/true") - - // Set a random user/group with no access to "blocked" dir. - spec.Process.User.UID = 343 - spec.Process.User.GID = 2401 - spec.Process.Capabilities = nil + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/true") + + // Set a random user/group with no access to "blocked" dir. + spec.Process.User.UID = 343 + spec.Process.User.GID = 2401 + spec.Process.Capabilities = nil + + // User running inside container can't list '$TMP/blocked' and would fail to + // mount it. + dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + if err := os.Chmod(dir, 0700); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } + dir = path.Join(dir, "test") + if err := os.Mkdir(dir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", dir, err) + } - // User running inside container can't list '$TMP/blocked' and would fail to - // mount it. - dir, err := ioutil.TempDir(testutil.TmpDir(), "blocked") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - if err := os.Chmod(dir, 0700); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } - dir = path.Join(dir, "test") - if err := os.Mkdir(dir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", dir, err) - } + src, err := ioutil.TempDir(testutil.TmpDir(), "src") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } - src, err := ioutil.TempDir(testutil.TmpDir(), "src") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: src, + Type: "bind", + }) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: src, - Type: "bind", + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } }) - - if err := run(spec, conf); err != nil { - t.Fatalf("error running sandbox: %v", err) - } } } // TestMountNewDir checks that runsc will create destination directory if it // doesn't exit. func TestMountNewDir(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + root, err := ioutil.TempDir(testutil.TmpDir(), "root") + if err != nil { + t.Fatal("ioutil.TempDir() failed:", err) + } - root, err := ioutil.TempDir(testutil.TmpDir(), "root") - if err != nil { - t.Fatal("ioutil.TempDir() failed:", err) - } + srcDir := path.Join(root, "src", "dir", "anotherdir") + if err := os.MkdirAll(srcDir, 0755); err != nil { + t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) + } - srcDir := path.Join(root, "src", "dir", "anotherdir") - if err := os.MkdirAll(srcDir, 0755); err != nil { - t.Fatalf("os.MkDir(%q) failed: %v", srcDir, err) - } + mountDir := path.Join(root, "dir", "anotherdir") - mountDir := path.Join(root, "dir", "anotherdir") + spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: mountDir, + Source: srcDir, + Type: "bind", + }) - spec := testutil.NewSpecWithArgs("/bin/ls", mountDir) - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: mountDir, - Source: srcDir, - Type: "bind", + if err := run(spec, conf); err != nil { + t.Fatalf("error running sandbox: %v", err) + } }) - - if err := run(spec, conf); err != nil { - t.Fatalf("error running sandbox: %v", err) - } } } func TestReadonlyRoot(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - - spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") - spec.Root.Readonly = true - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + spec := testutil.NewSpecWithArgs("/bin/touch", "/foo") + spec.Root.Readonly = true + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create, start and wait for the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) - } + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } + }) } } func TestUIDMap(t *testing.T) { - for _, conf := range configs(t, noOverlay...) { - t.Logf("Running test with conf: %+v", conf) - testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(testDir) - testFile := path.Join(testDir, "testfile") - - spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile") - uid := os.Getuid() - gid := os.Getgid() - spec.Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{ - {Type: specs.UserNamespace}, - {Type: specs.PIDNamespace}, - {Type: specs.MountNamespace}, - }, - UIDMappings: []specs.LinuxIDMapping{ - { - ContainerID: 0, - HostID: uint32(uid), - Size: 1, + for name, conf := range configs(t, noOverlay...) { + t.Run(name, func(t *testing.T) { + testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + defer os.RemoveAll(testDir) + testFile := path.Join(testDir, "testfile") + + spec := testutil.NewSpecWithArgs("touch", "/tmp/testfile") + uid := os.Getuid() + gid := os.Getgid() + spec.Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + {Type: specs.UserNamespace}, + {Type: specs.PIDNamespace}, + {Type: specs.MountNamespace}, }, - }, - GIDMappings: []specs.LinuxIDMapping{ - { - ContainerID: 0, - HostID: uint32(gid), - Size: 1, + UIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(uid), + Size: 1, + }, }, - }, - } + GIDMappings: []specs.LinuxIDMapping{ + { + ContainerID: 0, + HostID: uint32(gid), + Size: 1, + }, + }, + } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: "/tmp", - Source: testDir, - Type: "bind", - }) + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: "/tmp", + Source: testDir, + Type: "bind", + }) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create, start and wait for the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || ws.ExitStatus() != 0 { - t.Fatalf("container failed, waitStatus: %v", ws) - } - st := syscall.Stat_t{} - if err := syscall.Stat(testFile, &st); err != nil { - t.Fatalf("error stat /testfile: %v", err) - } + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || ws.ExitStatus() != 0 { + t.Fatalf("container failed, waitStatus: %v", ws) + } + st := syscall.Stat_t{} + if err := syscall.Stat(testFile, &st); err != nil { + t.Fatalf("error stat /testfile: %v", err) + } - if st.Uid != uint32(uid) || st.Gid != uint32(gid) { - t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid) - } + if st.Uid != uint32(uid) || st.Gid != uint32(gid) { + t.Fatalf("UID: %d (%d) GID: %d (%d)", st.Uid, uid, st.Gid, gid) + } + }) } } func TestReadonlyMount(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - - dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") - spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - spec.Mounts = append(spec.Mounts, specs.Mount{ - Destination: dir, - Source: dir, - Type: "bind", - Options: []string{"ro"}, - }) - spec.Root.Readonly = false - - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "ro-mount") + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + spec.Mounts = append(spec.Mounts, specs.Mount{ + Destination: dir, + Source: dir, + Type: "bind", + Options: []string{"ro"}, + }) + spec.Root.Readonly = false + + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create, start and wait for the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create, start and wait for the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { - t.Fatalf("container failed, waitStatus: %v", ws) - } + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if !ws.Exited() || syscall.Errno(ws.ExitStatus()) != syscall.EPERM { + t.Fatalf("container failed, waitStatus: %v", ws) + } + }) } } // TestAbbreviatedIDs checks that runsc supports using abbreviated container // IDs in place of full IDs. func TestAbbreviatedIDs(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir cids := []string{ - "foo-" + testutil.UniqueContainerID(), - "bar-" + testutil.UniqueContainerID(), - "baz-" + testutil.UniqueContainerID(), + "foo-" + testutil.RandomContainerID(), + "bar-" + testutil.RandomContainerID(), + "baz-" + testutil.RandomContainerID(), } for _, cid := range cids { spec := testutil.NewSpecWithArgs("sleep", "100") - bundleDir, err := testutil.SetupBundleDir(spec) + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ @@ -1596,16 +1591,15 @@ func TestAbbreviatedIDs(t *testing.T) { func TestGoferExits(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "10000") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1634,7 +1628,7 @@ func TestGoferExits(t *testing.T) { } func TestRootNotMount(t *testing.T) { - appSym, err := testutil.FindFile("runsc/container/test_app/test_app") + appSym, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -1671,7 +1665,7 @@ func TestRootNotMount(t *testing.T) { } func TestUserLog(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -1679,12 +1673,11 @@ func TestUserLog(t *testing.T) { // sched_rr_get_interval = 148 - not implemented in gvisor. spec := testutil.NewSpecWithArgs(app, "syscall", "--syscall=148") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() dir, err := ioutil.TempDir(testutil.TmpDir(), "user_log_test") if err != nil { @@ -1694,7 +1687,7 @@ func TestUserLog(t *testing.T) { // Create, start and wait for the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, UserLog: userLog, @@ -1718,72 +1711,70 @@ func TestUserLog(t *testing.T) { } func TestWaitOnExitedSandbox(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - // Run a shell that sleeps for 1 second and then exits with a - // non-zero code. - const wantExit = 17 - cmd := fmt.Sprintf("sleep 1; exit %d", wantExit) - spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + // Run a shell that sleeps for 1 second and then exits with a + // non-zero code. + const wantExit = 17 + cmd := fmt.Sprintf("sleep 1; exit %d", wantExit) + spec := testutil.NewSpecWithArgs("/bin/sh", "-c", cmd) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Create and Start the container. - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + // Create and Start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - // Wait on the sandbox. This will make an RPC to the sandbox - // and get the actual exit status of the application. - ws, err := c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if got := ws.ExitStatus(); got != wantExit { - t.Errorf("got exit status %d, want %d", got, wantExit) - } + // Wait on the sandbox. This will make an RPC to the sandbox + // and get the actual exit status of the application. + ws, err := c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } - // Now the sandbox has exited, but the zombie sandbox process - // still exists. Calling Wait() now will return the sandbox - // exit status. - ws, err = c.Wait() - if err != nil { - t.Fatalf("error waiting on container: %v", err) - } - if got := ws.ExitStatus(); got != wantExit { - t.Errorf("got exit status %d, want %d", got, wantExit) - } + // Now the sandbox has exited, but the zombie sandbox process + // still exists. Calling Wait() now will return the sandbox + // exit status. + ws, err = c.Wait() + if err != nil { + t.Fatalf("error waiting on container: %v", err) + } + if got := ws.ExitStatus(); got != wantExit { + t.Errorf("got exit status %d, want %d", got, wantExit) + } + }) } } func TestDestroyNotStarted(t *testing.T) { spec := testutil.NewSpecWithArgs("/bin/sleep", "100") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create the container and check that it can be destroyed. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1801,16 +1792,15 @@ func TestDestroyStarting(t *testing.T) { for i := 0; i < 10; i++ { spec := testutil.NewSpecWithArgs("/bin/sleep", "100") conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create the container and check that it can be destroyed. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1845,23 +1835,23 @@ func TestDestroyStarting(t *testing.T) { } func TestCreateWorkingDir(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - - tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } - dir := path.Join(tmpDir, "new/working/dir") + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + dir := path.Join(tmpDir, "new/working/dir") - // touch will fail if the directory doesn't exist. - spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) - spec.Process.Cwd = dir - spec.Root.Readonly = true + // touch will fail if the directory doesn't exist. + spec := testutil.NewSpecWithArgs("/bin/touch", path.Join(dir, "file")) + spec.Process.Cwd = dir + spec.Root.Readonly = true - if err := run(spec, conf); err != nil { - t.Fatalf("Error running container: %v", err) - } + if err := run(spec, conf); err != nil { + t.Fatalf("Error running container: %v", err) + } + }) } } @@ -1919,15 +1909,14 @@ func TestMountPropagation(t *testing.T) { } conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -1969,81 +1958,81 @@ func TestMountPropagation(t *testing.T) { } func TestMountSymlink(t *testing.T) { - for _, conf := range configs(t, overlay) { - t.Logf("Running test with conf: %+v", conf) - - dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") - if err != nil { - t.Fatalf("ioutil.TempDir() failed: %v", err) - } + for name, conf := range configs(t, overlay) { + t.Run(name, func(t *testing.T) { + dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink") + if err != nil { + t.Fatalf("ioutil.TempDir() failed: %v", err) + } + defer os.RemoveAll(dir) - source := path.Join(dir, "source") - target := path.Join(dir, "target") - for _, path := range []string{source, target} { - if err := os.MkdirAll(path, 0777); err != nil { - t.Fatalf("os.MkdirAll(): %v", err) + source := path.Join(dir, "source") + target := path.Join(dir, "target") + for _, path := range []string{source, target} { + if err := os.MkdirAll(path, 0777); err != nil { + t.Fatalf("os.MkdirAll(): %v", err) + } } - } - f, err := os.Create(path.Join(source, "file")) - if err != nil { - t.Fatalf("os.Create(): %v", err) - } - f.Close() + f, err := os.Create(path.Join(source, "file")) + if err != nil { + t.Fatalf("os.Create(): %v", err) + } + f.Close() - link := path.Join(dir, "link") - if err := os.Symlink(target, link); err != nil { - t.Fatalf("os.Symlink(%q, %q): %v", target, link, err) - } + link := path.Join(dir, "link") + if err := os.Symlink(target, link); err != nil { + t.Fatalf("os.Symlink(%q, %q): %v", target, link, err) + } - spec := testutil.NewSpecWithArgs("/bin/sleep", "1000") + spec := testutil.NewSpecWithArgs("/bin/sleep", "1000") - // Mount to a symlink to ensure the mount code will follow it and mount - // at the symlink target. - spec.Mounts = append(spec.Mounts, specs.Mount{ - Type: "bind", - Destination: link, - Source: source, - }) + // Mount to a symlink to ensure the mount code will follow it and mount + // at the symlink target. + spec.Mounts = append(spec.Mounts, specs.Mount{ + Type: "bind", + Destination: link, + Source: source, + }) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - args := Args{ - ID: testutil.UniqueContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - cont, err := New(conf, args) - if err != nil { - t.Fatalf("creating container: %v", err) - } - defer cont.Destroy() + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + cont, err := New(conf, args) + if err != nil { + t.Fatalf("creating container: %v", err) + } + defer cont.Destroy() - if err := cont.Start(conf); err != nil { - t.Fatalf("starting container: %v", err) - } + if err := cont.Start(conf); err != nil { + t.Fatalf("starting container: %v", err) + } - // Check that symlink was resolved and mount was created where the symlink - // is pointing to. - file := path.Join(target, "file") - execArgs := &control.ExecArgs{ - Filename: "/usr/bin/test", - Argv: []string{"test", "-f", file}, - } - if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { - t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) - } + // Check that symlink was resolved and mount was created where the symlink + // is pointing to. + file := path.Join(target, "file") + execArgs := &control.ExecArgs{ + Filename: "/usr/bin/test", + Argv: []string{"test", "-f", file}, + } + if ws, err := cont.executeSync(execArgs); err != nil || ws != 0 { + t.Fatalf("exec: test -f %q, ws: %v, err: %v", file, ws, err) + } + }) } } // Check that --net-raw disables the CAP_NET_RAW capability. func TestNetRaw(t *testing.T) { capNetRaw := strconv.FormatUint(bits.MaskOf64(int(linux.CAP_NET_RAW)), 10) - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -2106,7 +2095,7 @@ func TestTTYField(t *testing.T) { stop := testutil.StartReaper() defer stop() - testApp, err := testutil.FindFile("runsc/container/test_app/test_app") + testApp, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -2140,16 +2129,15 @@ func TestTTYField(t *testing.T) { } spec := testutil.NewSpecWithArgs(cmd...) - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index dc2fb42ce..e3704b453 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -30,15 +30,15 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/boot" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" ) func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { var specs []*specs.Spec var ids []string - rootID := testutil.UniqueContainerID() + rootID := testutil.RandomContainerID() for i, cmd := range cmds { spec := testutil.NewSpecWithArgs(cmd...) @@ -52,7 +52,7 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, specutils.ContainerdSandboxIDAnnotation: rootID, } - ids = append(ids, testutil.UniqueContainerID()) + ids = append(ids, testutil.RandomContainerID()) } specs = append(specs, spec) } @@ -64,23 +64,29 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.") } - var containers []*Container - var bundles []string - cleanup := func() { + var ( + containers []*Container + cleanups []func() + ) + cleanups = append(cleanups, func() { for _, c := range containers { c.Destroy() } - for _, b := range bundles { - os.RemoveAll(b) + }) + cleanupAll := func() { + for _, c := range cleanups { + c() } } + localClean := specutils.MakeCleanup(cleanupAll) + defer localClean.Clean() + for i, spec := range specs { - bundleDir, err := testutil.SetupBundleDir(spec) + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { - cleanup() return nil, nil, fmt.Errorf("error setting up container: %v", err) } - bundles = append(bundles, bundleDir) + cleanups = append(cleanups, cleanup) args := Args{ ID: ids[i], @@ -89,17 +95,17 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*C } cont, err := New(conf, args) if err != nil { - cleanup() return nil, nil, fmt.Errorf("error creating container: %v", err) } containers = append(containers, cont) if err := cont.Start(conf); err != nil { - cleanup() return nil, nil, fmt.Errorf("error starting container: %v", err) } } - return containers, cleanup, nil + + localClean.Release() + return containers, cleanupAll, nil } type execDesc struct { @@ -135,159 +141,159 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { // TestMultiContainerSanity checks that it is possible to run 2 dead-simple // containers in the same sandbox. func TestMultiContainerSanity(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"sleep", "100"} - specs, ids := createSpecs(sleep, sleep) - containers, cleanup, err := startContainers(conf, specs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + // Setup the containers. + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, - } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + expectedPL = []*control.Process{ + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) } } // TestMultiPIDNS checks that it is possible to run 2 dead-simple // containers in the same sandbox with different pidns. func TestMultiPIDNS(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // Setup the containers. - sleep := []string{"sleep", "100"} - testSpecs, ids := createSpecs(sleep, sleep) - testSpecs[1].Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + testSpecs, ids := createSpecs(sleep, sleep) + testSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, }, - }, - } + } - containers, cleanup, err := startContainers(conf, testSpecs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, testSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, - } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) } } // TestMultiPIDNSPath checks the pidns path. func TestMultiPIDNSPath(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // Setup the containers. - sleep := []string{"sleep", "100"} - testSpecs, ids := createSpecs(sleep, sleep, sleep) - testSpecs[0].Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", - Path: "/proc/1/ns/pid", + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + testSpecs, ids := createSpecs(sleep, sleep, sleep) + testSpecs[0].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, }, - }, - } - testSpecs[1].Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", - Path: "/proc/1/ns/pid", + } + testSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, }, - }, - } - testSpecs[2].Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", - Path: "/proc/2/ns/pid", + } + testSpecs[2].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + Path: "/proc/2/ns/pid", + }, }, - }, - } + } - containers, cleanup, err := startContainers(conf, testSpecs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, testSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Check via ps that multiple processes are running. - expectedPL := []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } - if err := waitForProcessList(containers[2], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + // Check via ps that multiple processes are running. + expectedPL := []*control.Process{ + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + if err := waitForProcessList(containers[2], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } - expectedPL = []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, - } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + expectedPL = []*control.Process{ + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } + }) } } func TestMultiContainerWait(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -361,11 +367,11 @@ func TestMultiContainerWait(t *testing.T) { // TestExecWait ensures what we can wait containers and individual processes in the // sandbox that have already exited. func TestExecWait(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -457,11 +463,11 @@ func TestMultiContainerMount(t *testing.T) { }) // Setup the containers. - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -484,174 +490,174 @@ func TestMultiContainerMount(t *testing.T) { // TestMultiContainerSignal checks that it is possible to signal individual // containers without killing the entire sandbox. func TestMultiContainerSignal(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"sleep", "100"} - specs, ids := createSpecs(sleep, sleep) - containers, cleanup, err := startContainers(conf, specs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + // Setup the containers. + sleep := []string{"sleep", "100"} + specs, ids := createSpecs(sleep, sleep) + containers, cleanup, err := startContainers(conf, specs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Check via ps that container 1 process is running. - expectedPL := []*control.Process{ - {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, - } + // Check via ps that container 1 process is running. + expectedPL := []*control.Process{ + {PID: 2, Cmd: "sleep", Threads: []kernel.ThreadID{2}}, + } - if err := waitForProcessList(containers[1], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + if err := waitForProcessList(containers[1], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } - // Kill process 2. - if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil { - t.Errorf("failed to kill process 2: %v", err) - } + // Kill process 2. + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err != nil { + t.Errorf("failed to kill process 2: %v", err) + } - // Make sure process 1 is still running. - expectedPL = []*control.Process{ - {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, - } - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + // Make sure process 1 is still running. + expectedPL = []*control.Process{ + {PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}, + } + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } - // goferPid is reset when container is destroyed. - goferPid := containers[1].GoferPid + // goferPid is reset when container is destroyed. + goferPid := containers[1].GoferPid - // Destroy container and ensure container's gofer process has exited. - if err := containers[1].Destroy(); err != nil { - t.Errorf("failed to destroy container: %v", err) - } - _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) { - cpid, err := syscall.Wait4(goferPid, nil, 0, nil) - return uintptr(cpid), 0, err - }) - if err != syscall.ECHILD { - t.Errorf("error waiting for gofer to exit: %v", err) - } - // Make sure process 1 is still running. - if err := waitForProcessList(containers[0], expectedPL); err != nil { - t.Errorf("failed to wait for sleep to start: %v", err) - } + // Destroy container and ensure container's gofer process has exited. + if err := containers[1].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + _, _, err = specutils.RetryEintr(func() (uintptr, uintptr, error) { + cpid, err := syscall.Wait4(goferPid, nil, 0, nil) + return uintptr(cpid), 0, err + }) + if err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } + // Make sure process 1 is still running. + if err := waitForProcessList(containers[0], expectedPL); err != nil { + t.Errorf("failed to wait for sleep to start: %v", err) + } - // Now that process 2 is gone, ensure we get an error trying to - // signal it again. - if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil { - t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) - } + // Now that process 2 is gone, ensure we get an error trying to + // signal it again. + if err := containers[1].SignalContainer(syscall.SIGKILL, false); err == nil { + t.Errorf("container %q shouldn't exist, but we were able to signal it", containers[1].ID) + } - // Kill process 1. - if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil { - t.Errorf("failed to kill process 1: %v", err) - } + // Kill process 1. + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err != nil { + t.Errorf("failed to kill process 1: %v", err) + } - // Ensure that container's gofer and sandbox process are no more. - err = blockUntilWaitable(containers[0].GoferPid) - if err != nil && err != syscall.ECHILD { - t.Errorf("error waiting for gofer to exit: %v", err) - } + // Ensure that container's gofer and sandbox process are no more. + err = blockUntilWaitable(containers[0].GoferPid) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for gofer to exit: %v", err) + } - err = blockUntilWaitable(containers[0].Sandbox.Pid) - if err != nil && err != syscall.ECHILD { - t.Errorf("error waiting for sandbox to exit: %v", err) - } + err = blockUntilWaitable(containers[0].Sandbox.Pid) + if err != nil && err != syscall.ECHILD { + t.Errorf("error waiting for sandbox to exit: %v", err) + } - // The sentry should be gone, so signaling should yield an error. - if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil { - t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) - } + // The sentry should be gone, so signaling should yield an error. + if err := containers[0].SignalContainer(syscall.SIGKILL, false); err == nil { + t.Errorf("sandbox %q shouldn't exist, but we were able to signal it", containers[0].Sandbox.ID) + } - if err := containers[0].Destroy(); err != nil { - t.Errorf("failed to destroy container: %v", err) - } + if err := containers[0].Destroy(); err != nil { + t.Errorf("failed to destroy container: %v", err) + } + }) } } // TestMultiContainerDestroy checks that container are properly cleaned-up when // they are destroyed. func TestMultiContainerDestroy(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // First container will remain intact while the second container is killed. - podSpecs, ids := createSpecs( - []string{"sleep", "100"}, - []string{app, "fork-bomb"}) - - // Run the fork bomb in a PID namespace to prevent processes to be - // re-parented to PID=1 in the root container. - podSpecs[1].Linux = &specs.Linux{ - Namespaces: []specs.LinuxNamespace{{Type: "pid"}}, - } - containers, cleanup, err := startContainers(conf, podSpecs, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // First container will remain intact while the second container is killed. + podSpecs, ids := createSpecs( + []string{"sleep", "100"}, + []string{app, "fork-bomb"}) + + // Run the fork bomb in a PID namespace to prevent processes to be + // re-parented to PID=1 in the root container. + podSpecs[1].Linux = &specs.Linux{ + Namespaces: []specs.LinuxNamespace{{Type: "pid"}}, + } + containers, cleanup, err := startContainers(conf, podSpecs, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - // Exec more processes to ensure signal all works for exec'd processes too. - args := &control.ExecArgs{ - Filename: app, - Argv: []string{app, "fork-bomb"}, - } - if _, err := containers[1].Execute(args); err != nil { - t.Fatalf("error exec'ing: %v", err) - } + // Exec more processes to ensure signal all works for exec'd processes too. + args := &control.ExecArgs{ + Filename: app, + Argv: []string{app, "fork-bomb"}, + } + if _, err := containers[1].Execute(args); err != nil { + t.Fatalf("error exec'ing: %v", err) + } - // Let it brew... - time.Sleep(500 * time.Millisecond) + // Let it brew... + time.Sleep(500 * time.Millisecond) - if err := containers[1].Destroy(); err != nil { - t.Fatalf("error destroying container: %v", err) - } + if err := containers[1].Destroy(); err != nil { + t.Fatalf("error destroying container: %v", err) + } - // Check that destroy killed all processes belonging to the container and - // waited for them to exit before returning. - pss, err := containers[0].Sandbox.Processes("") - if err != nil { - t.Fatalf("error getting process data from sandbox: %v", err) - } - expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}} - if r, err := procListsEqual(pss, expectedPL); !r { - t.Errorf("container got process list: %s, want: %s: error: %v", - procListToString(pss), procListToString(expectedPL), err) - } + // Check that destroy killed all processes belonging to the container and + // waited for them to exit before returning. + pss, err := containers[0].Sandbox.Processes("") + if err != nil { + t.Fatalf("error getting process data from sandbox: %v", err) + } + expectedPL := []*control.Process{{PID: 1, Cmd: "sleep", Threads: []kernel.ThreadID{1}}} + if r, err := procListsEqual(pss, expectedPL); !r { + t.Errorf("container got process list: %s, want: %s: error: %v", + procListToString(pss), procListToString(expectedPL), err) + } - // Check that cont.Destroy is safe to call multiple times. - if err := containers[1].Destroy(); err != nil { - t.Errorf("error destroying container: %v", err) - } + // Check that cont.Destroy is safe to call multiple times. + if err := containers[1].Destroy(); err != nil { + t.Errorf("error destroying container: %v", err) + } + }) } } func TestMultiContainerProcesses(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -706,11 +712,11 @@ func TestMultiContainerProcesses(t *testing.T) { // TestMultiContainerKillAll checks that all process that belong to a container // are killed when SIGKILL is sent to *all* processes in that container. func TestMultiContainerKillAll(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -721,7 +727,7 @@ func TestMultiContainerKillAll(t *testing.T) { {killContainer: true}, {killContainer: false}, } { - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -739,11 +745,11 @@ func TestMultiContainerKillAll(t *testing.T) { // Wait until all processes are created. rootProcCount := int(math.Pow(2, 3) - 1) if err := waitForProcessCount(containers[0], rootProcCount); err != nil { - t.Fatal(err) + t.Fatalf("error waitting for processes: %v", err) } procCount := int(math.Pow(2, 5) - 1) if err := waitForProcessCount(containers[1], procCount); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } // Exec more processes to ensure signal works for exec'd processes too. @@ -757,7 +763,7 @@ func TestMultiContainerKillAll(t *testing.T) { // Wait for these new processes to start. procCount += int(math.Pow(2, 3) - 1) if err := waitForProcessCount(containers[1], procCount); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } if tc.killContainer { @@ -790,11 +796,11 @@ func TestMultiContainerKillAll(t *testing.T) { // Check that all processes are gone. if err := waitForProcessCount(containers[1], 0); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } // Check that root container was not affected. if err := waitForProcessCount(containers[0], rootProcCount); err != nil { - t.Fatal(err) + t.Fatalf("error waiting for processes: %v", err) } } } @@ -805,17 +811,16 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) { []string{"/bin/sleep", "100"}) conf := testutil.TestConfig(t) - rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(rootBundleDir) + defer cleanup() rootArgs := Args{ ID: ids[0], Spec: specs[0], - BundleDir: rootBundleDir, + BundleDir: bundleDir, } root, err := New(conf, rootArgs) if err != nil { @@ -827,11 +832,11 @@ func TestMultiContainerDestroyNotStarted(t *testing.T) { } // Create and destroy sub-container. - bundleDir, err := testutil.SetupBundleDir(specs[1]) + bundleDir, cleanupSub, err := testutil.SetupBundleDir(specs[1]) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(bundleDir) + defer cleanupSub() args := Args{ ID: ids[1], @@ -859,17 +864,16 @@ func TestMultiContainerDestroyStarting(t *testing.T) { specs, ids := createSpecs(cmds...) conf := testutil.TestConfig(t) - rootDir, rootBundleDir, err := testutil.SetupContainer(specs[0], conf) + rootDir, bundleDir, cleanup, err := testutil.SetupContainer(specs[0], conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(rootBundleDir) + defer cleanup() rootArgs := Args{ ID: ids[0], Spec: specs[0], - BundleDir: rootBundleDir, + BundleDir: bundleDir, } root, err := New(conf, rootArgs) if err != nil { @@ -886,16 +890,16 @@ func TestMultiContainerDestroyStarting(t *testing.T) { continue // skip root container } - bundleDir, err := testutil.SetupBundleDir(specs[i]) + bundleDir, cleanup, err := testutil.SetupBundleDir(specs[i]) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(bundleDir) + defer cleanup() rootArgs := Args{ ID: ids[i], Spec: specs[i], - BundleDir: rootBundleDir, + BundleDir: bundleDir, } cont, err := New(conf, rootArgs) if err != nil { @@ -937,11 +941,11 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) { script := fmt.Sprintf("if [ -f %q ]; then exit 1; else touch %q; fi", filename, filename) cmd := []string{"sh", "-c", script} - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -977,7 +981,7 @@ func TestMultiContainerDifferentFilesystems(t *testing.T) { // TestMultiContainerContainerDestroyStress tests that IO operations continue // to work after containers have been stopped and gofers killed. func TestMultiContainerContainerDestroyStress(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -1007,12 +1011,11 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { childrenIDs := allIDs[1:] conf := testutil.TestConfig(t) - rootDir, bundleDir, err := testutil.SetupContainer(rootSpec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(rootSpec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Start root container. rootArgs := Args{ @@ -1038,11 +1041,11 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { var children []*Container for j, spec := range specs { - bundleDir, err := testutil.SetupBundleDir(spec) + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(bundleDir) + defer cleanup() args := Args{ ID: ids[j], @@ -1080,306 +1083,306 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { // Test that pod shared mounts are properly mounted in 2 containers and that // changes from one container is reflected in the other. func TestMultiContainerSharedMount(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // Setup the containers. - sleep := []string{"sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: nil, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - file0 := path.Join(mnt0.Destination, "abc") - file1 := path.Join(mnt1.Destination, "abc") - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", - }, - { - c: containers[1], - cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", - }, - { - c: containers[1], - cmd: []string{"/bin/mkdir", file1}, - desc: "create directory in container1", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", file0}, - desc: "dir appears in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", file1}, - desc: "dir appears in container1", - }, - { - c: containers[0], - cmd: []string{"/bin/rmdir", file0}, - desc: "create directory in container0", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "!", "-d", file0}, - desc: "dir removed from container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "!", "-d", file1}, - desc: "dir removed from container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + desc: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + desc: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + desc: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file appears in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + desc: "file removed from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + desc: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + desc: "file removed from container1", + }, + { + c: containers[1], + cmd: []string{"/bin/mkdir", file1}, + desc: "create directory in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", file0}, + desc: "dir appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", file1}, + desc: "dir appears in container1", + }, + { + c: containers[0], + cmd: []string{"/bin/rmdir", file0}, + desc: "create directory in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-d", file0}, + desc: "dir removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-d", file1}, + desc: "dir removed from container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + }) } } // Test that pod mounts are mounted as readonly when requested. func TestMultiContainerSharedMountReadonly(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // Setup the containers. - sleep := []string{"sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: []string{"ro"}, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"ro"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - file0 := path.Join(mnt0.Destination, "abc") - file1 := path.Join(mnt1.Destination, "abc") - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - want: 1, - desc: "fails to write to container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/touch", file1}, - want: 1, - desc: "fails to write to container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + desc: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + desc: "directory is mounted in container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + want: 1, + desc: "fails to write to container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/touch", file1}, + want: 1, + desc: "fails to write to container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + }) } } // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { - for _, conf := range configs(t, all...) { - t.Logf("Running test with conf: %+v", conf) - - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - conf.RootDir = rootDir - - // Setup the containers. - sleep := []string{"sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: nil, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + for name, conf := range configs(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir + + // Setup the containers. + sleep := []string{"sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: nil, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - file0 := path.Join(mnt0.Destination, "abc") - file1 := path.Join(mnt1.Destination, "abc") - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + file0 := path.Join(mnt0.Destination, "abc") + file1 := path.Join(mnt1.Destination, "abc") + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/touch", file0}, + desc: "create file in container0", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file appears in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file appears in container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } - containers[1].Destroy() + containers[1].Destroy() - bundleDir, err := testutil.SetupBundleDir(podSpec[1]) - if err != nil { - t.Fatalf("error restarting container: %v", err) - } - defer os.RemoveAll(bundleDir) + bundleDir, cleanup, err := testutil.SetupBundleDir(podSpec[1]) + if err != nil { + t.Fatalf("error restarting container: %v", err) + } + defer cleanup() - args := Args{ - ID: ids[1], - Spec: podSpec[1], - BundleDir: bundleDir, - } - containers[1], err = New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - if err := containers[1].Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + args := Args{ + ID: ids[1], + Spec: podSpec[1], + BundleDir: bundleDir, + } + containers[1], err = New(conf, args) + if err != nil { + t.Fatalf("error creating container: %v", err) + } + if err := containers[1].Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) + } - execs = []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file is still in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file is still in container1", - }, - { - c: containers[1], - cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", - }, - { - c: containers[0], - cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execs = []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-f", file0}, + desc: "file is still in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-f", file1}, + desc: "file is still in container1", + }, + { + c: containers[1], + cmd: []string{"/bin/rm", file1}, + desc: "file removed from container1", + }, + { + c: containers[0], + cmd: []string{"/usr/bin/test", "!", "-f", file0}, + desc: "file removed from container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "!", "-f", file1}, + desc: "file removed from container1", + }, + } + if err := execMany(execs); err != nil { + t.Fatal(err.Error()) + } + }) } } // Test that unsupported pod mounts options are ignored when matching master and // slave mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -1428,7 +1431,7 @@ func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { // Test that one container can send an FD to another container, even though // they have distinct MountNamespaces. func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { - app, err := testutil.FindFile("runsc/container/test_app/test_app") + app, err := testutil.FindFile("test/cmd/test_app/test_app") if err != nil { t.Fatal("error finding test_app:", err) } @@ -1457,11 +1460,11 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { Type: "tmpfs", } - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -1494,11 +1497,11 @@ func TestMultiContainerMultiRootCanHandleFDs(t *testing.T) { // Test that container is destroyed when Gofer is killed. func TestMultiContainerGoferKilled(t *testing.T) { - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -1581,11 +1584,11 @@ func TestMultiContainerLoadSandbox(t *testing.T) { sleep := []string{"sleep", "100"} specs, ids := createSpecs(sleep, sleep, sleep) - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir @@ -1614,7 +1617,7 @@ func TestMultiContainerLoadSandbox(t *testing.T) { } // Create a valid but empty container directory. - randomCID := testutil.UniqueContainerID() + randomCID := testutil.RandomContainerID() dir = filepath.Join(conf.RootDir, randomCID) if err := os.MkdirAll(dir, 0755); err != nil { t.Fatalf("os.MkdirAll(%q)=%v", dir, err) @@ -1681,11 +1684,11 @@ func TestMultiContainerRunNonRoot(t *testing.T) { Type: "bind", }) - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { t.Fatalf("error creating root dir: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() conf := testutil.TestConfig(t) conf.RootDir = rootDir diff --git a/runsc/container/shared_volume_test.go b/runsc/container/shared_volume_test.go index f80852414..bac177a88 100644 --- a/runsc/container/shared_volume_test.go +++ b/runsc/container/shared_volume_test.go @@ -24,8 +24,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/boot" - "gvisor.dev/gvisor/runsc/testutil" ) // TestSharedVolume checks that modifications to a volume mount are propagated @@ -33,7 +33,6 @@ import ( func TestSharedVolume(t *testing.T) { conf := testutil.TestConfig(t) conf.FileAccess = boot.FileAccessShared - t.Logf("Running test with conf: %+v", conf) // Main process just sleeps. We will use "exec" to probe the state of // the filesystem. @@ -44,16 +43,15 @@ func TestSharedVolume(t *testing.T) { t.Fatalf("TempDir failed: %v", err) } - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } @@ -192,7 +190,6 @@ func checkFile(c *Container, filename string, want []byte) error { func TestSharedVolumeFile(t *testing.T) { conf := testutil.TestConfig(t) conf.FileAccess = boot.FileAccessShared - t.Logf("Running test with conf: %+v", conf) // Main process just sleeps. We will use "exec" to probe the state of // the filesystem. @@ -203,16 +200,15 @@ func TestSharedVolumeFile(t *testing.T) { t.Fatalf("TempDir failed: %v", err) } - rootDir, bundleDir, err := testutil.SetupContainer(spec, conf) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) if err != nil { t.Fatalf("error setting up container: %v", err) } - defer os.RemoveAll(rootDir) - defer os.RemoveAll(bundleDir) + defer cleanup() // Create and start the container. args := Args{ - ID: testutil.UniqueContainerID(), + ID: testutil.RandomContainerID(), Spec: spec, BundleDir: bundleDir, } diff --git a/runsc/container/test_app/BUILD b/runsc/container/test_app/BUILD deleted file mode 100644 index 0defbd9fc..000000000 --- a/runsc/container/test_app/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -load("//tools:defs.bzl", "go_binary") - -package(licenses = ["notice"]) - -go_binary( - name = "test_app", - testonly = 1, - srcs = [ - "fds.go", - "test_app.go", - ], - pure = True, - visibility = ["//runsc/container:__pkg__"], - deps = [ - "//pkg/unet", - "//runsc/flag", - "//runsc/testutil", - "@com_github_google_subcommands//:go_default_library", - "@com_github_kr_pty//:go_default_library", - ], -) diff --git a/runsc/container/test_app/fds.go b/runsc/container/test_app/fds.go deleted file mode 100644 index 2a146a2c3..000000000 --- a/runsc/container/test_app/fds.go +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "context" - "io/ioutil" - "log" - "os" - "time" - - "github.com/google/subcommands" - "gvisor.dev/gvisor/pkg/unet" - "gvisor.dev/gvisor/runsc/flag" - "gvisor.dev/gvisor/runsc/testutil" -) - -const fileContents = "foobarbaz" - -// fdSender will open a file and send the FD over a unix domain socket. -type fdSender struct { - socketPath string -} - -// Name implements subcommands.Command.Name. -func (*fdSender) Name() string { - return "fd_sender" -} - -// Synopsis implements subcommands.Command.Synopsys. -func (*fdSender) Synopsis() string { - return "creates a file and sends the FD over the socket" -} - -// Usage implements subcommands.Command.Usage. -func (*fdSender) Usage() string { - return "fd_sender " -} - -// SetFlags implements subcommands.Command.SetFlags. -func (fds *fdSender) SetFlags(f *flag.FlagSet) { - f.StringVar(&fds.socketPath, "socket", "", "path to socket") -} - -// Execute implements subcommands.Command.Execute. -func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if fds.socketPath == "" { - log.Fatalf("socket flag must be set") - } - - dir, err := ioutil.TempDir("", "") - if err != nil { - log.Fatalf("TempDir failed: %v", err) - } - - fileToSend, err := ioutil.TempFile(dir, "") - if err != nil { - log.Fatalf("TempFile failed: %v", err) - } - defer fileToSend.Close() - - if _, err := fileToSend.WriteString(fileContents); err != nil { - log.Fatalf("Write(%q) failed: %v", fileContents, err) - } - - // Receiver may not be started yet, so try connecting in a poll loop. - var s *unet.Socket - if err := testutil.Poll(func() error { - var err error - s, err = unet.Connect(fds.socketPath, true /* SEQPACKET, so we can send empty message with FD */) - return err - }, 10*time.Second); err != nil { - log.Fatalf("Error connecting to socket %q: %v", fds.socketPath, err) - } - defer s.Close() - - w := s.Writer(true) - w.ControlMessage.PackFDs(int(fileToSend.Fd())) - if _, err := w.WriteVec([][]byte{[]byte{'a'}}); err != nil { - log.Fatalf("Error sending FD %q over socket %q: %v", fileToSend.Fd(), fds.socketPath, err) - } - - log.Print("FD SENDER exiting successfully") - return subcommands.ExitSuccess -} - -// fdReceiver receives an FD from a unix domain socket and does things to it. -type fdReceiver struct { - socketPath string -} - -// Name implements subcommands.Command.Name. -func (*fdReceiver) Name() string { - return "fd_receiver" -} - -// Synopsis implements subcommands.Command.Synopsys. -func (*fdReceiver) Synopsis() string { - return "reads an FD from a unix socket, and then does things to it" -} - -// Usage implements subcommands.Command.Usage. -func (*fdReceiver) Usage() string { - return "fd_receiver " -} - -// SetFlags implements subcommands.Command.SetFlags. -func (fdr *fdReceiver) SetFlags(f *flag.FlagSet) { - f.StringVar(&fdr.socketPath, "socket", "", "path to socket") -} - -// Execute implements subcommands.Command.Execute. -func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if fdr.socketPath == "" { - log.Fatalf("Flags cannot be empty, given: socket: %q", fdr.socketPath) - } - - ss, err := unet.BindAndListen(fdr.socketPath, true /* packet */) - if err != nil { - log.Fatalf("BindAndListen(%q) failed: %v", fdr.socketPath, err) - } - defer ss.Close() - - var s *unet.Socket - c := make(chan error, 1) - go func() { - var err error - s, err = ss.Accept() - c <- err - }() - - select { - case err := <-c: - if err != nil { - log.Fatalf("Accept() failed: %v", err) - } - case <-time.After(10 * time.Second): - log.Fatalf("Timeout waiting for accept") - } - - r := s.Reader(true) - r.EnableFDs(1) - b := [][]byte{{'a'}} - if n, err := r.ReadVec(b); n != 1 || err != nil { - log.Fatalf("ReadVec got n=%d err %v (wanted 0, nil)", n, err) - } - - fds, err := r.ExtractFDs() - if err != nil { - log.Fatalf("ExtractFD() got err %v", err) - } - if len(fds) != 1 { - log.Fatalf("ExtractFD() got %d FDs, wanted 1", len(fds)) - } - fd := fds[0] - - file := os.NewFile(uintptr(fd), "received file") - defer file.Close() - if _, err := file.Seek(0, os.SEEK_SET); err != nil { - log.Fatalf("Seek(0, 0) failed: %v", err) - } - - got, err := ioutil.ReadAll(file) - if err != nil { - log.Fatalf("ReadAll failed: %v", err) - } - if string(got) != fileContents { - log.Fatalf("ReadAll got %q want %q", string(got), fileContents) - } - - log.Print("FD RECEIVER exiting successfully") - return subcommands.ExitSuccess -} diff --git a/runsc/container/test_app/test_app.go b/runsc/container/test_app/test_app.go deleted file mode 100644 index 5f1c4b7d6..000000000 --- a/runsc/container/test_app/test_app.go +++ /dev/null @@ -1,394 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary test_app is like a swiss knife for tests that need to run anything -// inside the sandbox. New functionality can be added with new commands. -package main - -import ( - "context" - "fmt" - "io" - "io/ioutil" - "log" - "net" - "os" - "os/exec" - "regexp" - "strconv" - sys "syscall" - "time" - - "github.com/google/subcommands" - "github.com/kr/pty" - "gvisor.dev/gvisor/runsc/flag" - "gvisor.dev/gvisor/runsc/testutil" -) - -func main() { - subcommands.Register(subcommands.HelpCommand(), "") - subcommands.Register(subcommands.FlagsCommand(), "") - subcommands.Register(new(capability), "") - subcommands.Register(new(fdReceiver), "") - subcommands.Register(new(fdSender), "") - subcommands.Register(new(forkBomb), "") - subcommands.Register(new(ptyRunner), "") - subcommands.Register(new(reaper), "") - subcommands.Register(new(syscall), "") - subcommands.Register(new(taskTree), "") - subcommands.Register(new(uds), "") - - flag.Parse() - - exitCode := subcommands.Execute(context.Background()) - os.Exit(int(exitCode)) -} - -type uds struct { - fileName string - socketPath string -} - -// Name implements subcommands.Command.Name. -func (*uds) Name() string { - return "uds" -} - -// Synopsis implements subcommands.Command.Synopsys. -func (*uds) Synopsis() string { - return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file" -} - -// Usage implements subcommands.Command.Usage. -func (*uds) Usage() string { - return "uds " -} - -// SetFlags implements subcommands.Command.SetFlags. -func (c *uds) SetFlags(f *flag.FlagSet) { - f.StringVar(&c.fileName, "file", "", "name of output file") - f.StringVar(&c.socketPath, "socket", "", "path to socket") -} - -// Execute implements subcommands.Command.Execute. -func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if c.fileName == "" || c.socketPath == "" { - log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath) - return subcommands.ExitFailure - } - outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666) - if err != nil { - log.Fatal("error opening output file:", err) - } - - defer os.Remove(c.socketPath) - - listener, err := net.Listen("unix", c.socketPath) - if err != nil { - log.Fatalf("error listening on socket %q: %v", c.socketPath, err) - } - - go server(listener, outputFile) - for i := 0; ; i++ { - conn, err := net.Dial("unix", c.socketPath) - if err != nil { - log.Fatal("error dialing:", err) - } - if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { - log.Fatal("error writing:", err) - } - conn.Close() - time.Sleep(100 * time.Millisecond) - } -} - -func server(listener net.Listener, out *os.File) { - buf := make([]byte, 16) - - for { - c, err := listener.Accept() - if err != nil { - log.Fatal("error accepting connection:", err) - } - nr, err := c.Read(buf) - if err != nil { - log.Fatal("error reading from buf:", err) - } - data := buf[0:nr] - fmt.Fprint(out, string(data)+"\n") - } -} - -type taskTree struct { - depth int - width int - pause bool -} - -// Name implements subcommands.Command. -func (*taskTree) Name() string { - return "task-tree" -} - -// Synopsis implements subcommands.Command. -func (*taskTree) Synopsis() string { - return "creates a tree of tasks" -} - -// Usage implements subcommands.Command. -func (*taskTree) Usage() string { - return "task-tree " -} - -// SetFlags implements subcommands.Command. -func (c *taskTree) SetFlags(f *flag.FlagSet) { - f.IntVar(&c.depth, "depth", 1, "number of levels to create") - f.IntVar(&c.width, "width", 1, "number of tasks at each level") - f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually") -} - -// Execute implements subcommands.Command. -func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - stop := testutil.StartReaper() - defer stop() - - if c.depth == 0 { - log.Printf("Child sleeping, PID: %d\n", os.Getpid()) - select {} - } - log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) - - var cmds []*exec.Cmd - for i := 0; i < c.width; i++ { - cmd := exec.Command( - "/proc/self/exe", c.Name(), - "--depth", strconv.Itoa(c.depth-1), - "--width", strconv.Itoa(c.width), - "--pause", strconv.FormatBool(c.pause)) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - if err := cmd.Start(); err != nil { - log.Fatal("failed to call self:", err) - } - cmds = append(cmds, cmd) - } - - for _, c := range cmds { - c.Wait() - } - - if c.pause { - select {} - } - - return subcommands.ExitSuccess -} - -type forkBomb struct { - delay time.Duration -} - -// Name implements subcommands.Command. -func (*forkBomb) Name() string { - return "fork-bomb" -} - -// Synopsis implements subcommands.Command. -func (*forkBomb) Synopsis() string { - return "creates child process until the end of times" -} - -// Usage implements subcommands.Command. -func (*forkBomb) Usage() string { - return "fork-bomb " -} - -// SetFlags implements subcommands.Command. -func (c *forkBomb) SetFlags(f *flag.FlagSet) { - f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") -} - -// Execute implements subcommands.Command. -func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - time.Sleep(c.delay) - - cmd := exec.Command("/proc/self/exe", c.Name()) - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - if err := cmd.Run(); err != nil { - log.Fatal("failed to call self:", err) - } - return subcommands.ExitSuccess -} - -type reaper struct{} - -// Name implements subcommands.Command. -func (*reaper) Name() string { - return "reaper" -} - -// Synopsis implements subcommands.Command. -func (*reaper) Synopsis() string { - return "reaps all children in a loop" -} - -// Usage implements subcommands.Command. -func (*reaper) Usage() string { - return "reaper " -} - -// SetFlags implements subcommands.Command. -func (*reaper) SetFlags(*flag.FlagSet) {} - -// Execute implements subcommands.Command. -func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - stop := testutil.StartReaper() - defer stop() - select {} -} - -type syscall struct { - sysno uint64 -} - -// Name implements subcommands.Command. -func (*syscall) Name() string { - return "syscall" -} - -// Synopsis implements subcommands.Command. -func (*syscall) Synopsis() string { - return "syscall makes a syscall" -} - -// Usage implements subcommands.Command. -func (*syscall) Usage() string { - return "syscall " -} - -// SetFlags implements subcommands.Command. -func (s *syscall) SetFlags(f *flag.FlagSet) { - f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call") -} - -// Execute implements subcommands.Command. -func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 { - fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno) - } else { - fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno) - } - return subcommands.ExitSuccess -} - -type capability struct { - enabled uint64 - disabled uint64 -} - -// Name implements subcommands.Command. -func (*capability) Name() string { - return "capability" -} - -// Synopsis implements subcommands.Command. -func (*capability) Synopsis() string { - return "checks if effective capabilities are set/unset" -} - -// Usage implements subcommands.Command. -func (*capability) Usage() string { - return "capability [--enabled=number] [--disabled=number]" -} - -// SetFlags implements subcommands.Command. -func (c *capability) SetFlags(f *flag.FlagSet) { - f.Uint64Var(&c.enabled, "enabled", 0, "") - f.Uint64Var(&c.disabled, "disabled", 0, "") -} - -// Execute implements subcommands.Command. -func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { - if c.enabled == 0 && c.disabled == 0 { - fmt.Println("One of the flags must be set") - return subcommands.ExitUsageError - } - - status, err := ioutil.ReadFile("/proc/self/status") - if err != nil { - fmt.Printf("Error reading %q: %v\n", "proc/self/status", err) - return subcommands.ExitFailure - } - re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n") - matches := re.FindStringSubmatch(string(status)) - if matches == nil || len(matches) != 2 { - fmt.Printf("Effective capabilities not found in\n%s\n", status) - return subcommands.ExitFailure - } - caps, err := strconv.ParseUint(matches[1], 16, 64) - if err != nil { - fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err) - return subcommands.ExitFailure - } - - if c.enabled != 0 && (caps&c.enabled) != c.enabled { - fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps) - return subcommands.ExitFailure - } - if c.disabled != 0 && (caps&c.disabled) != 0 { - fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps) - return subcommands.ExitFailure - } - - return subcommands.ExitSuccess -} - -type ptyRunner struct{} - -// Name implements subcommands.Command. -func (*ptyRunner) Name() string { - return "pty-runner" -} - -// Synopsis implements subcommands.Command. -func (*ptyRunner) Synopsis() string { - return "runs the given command with an open pty terminal" -} - -// Usage implements subcommands.Command. -func (*ptyRunner) Usage() string { - return "pty-runner [command]" -} - -// SetFlags implements subcommands.Command.SetFlags. -func (*ptyRunner) SetFlags(f *flag.FlagSet) {} - -// Execute implements subcommands.Command. -func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { - c := exec.Command(fs.Args()[0], fs.Args()[1:]...) - f, err := pty.Start(c) - if err != nil { - fmt.Printf("pty.Start failed: %v", err) - return subcommands.ExitFailure - } - defer f.Close() - - // Copy stdout from the command to keep this process alive until the - // subprocess exits. - io.Copy(os.Stdout, f) - - return subcommands.ExitSuccess -} diff --git a/runsc/criutil/BUILD b/runsc/criutil/BUILD deleted file mode 100644 index 8a571a000..000000000 --- a/runsc/criutil/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "criutil", - testonly = 1, - srcs = ["criutil.go"], - visibility = ["//:sandbox"], - deps = ["//runsc/testutil"], -) diff --git a/runsc/criutil/criutil.go b/runsc/criutil/criutil.go deleted file mode 100644 index 773f5a1c4..000000000 --- a/runsc/criutil/criutil.go +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package criutil contains utility functions for interacting with the -// Container Runtime Interface (CRI), principally via the crictl command line -// tool. This requires critools to be installed on the local system. -package criutil - -import ( - "encoding/json" - "fmt" - "os" - "os/exec" - "strings" - "time" - - "gvisor.dev/gvisor/runsc/testutil" -) - -const endpointPrefix = "unix://" - -// Crictl contains information required to run the crictl utility. -type Crictl struct { - executable string - timeout time.Duration - imageEndpoint string - runtimeEndpoint string -} - -// NewCrictl returns a Crictl configured with a timeout and an endpoint over -// which it will talk to containerd. -func NewCrictl(timeout time.Duration, endpoint string) *Crictl { - // Bazel doesn't pass PATH through, assume the location of crictl - // unless specified by environment variable. - executable := os.Getenv("CRICTL_PATH") - if executable == "" { - executable = "/usr/local/bin/crictl" - } - return &Crictl{ - executable: executable, - timeout: timeout, - imageEndpoint: endpointPrefix + endpoint, - runtimeEndpoint: endpointPrefix + endpoint, - } -} - -// Pull pulls an container image. It corresponds to `crictl pull`. -func (cc *Crictl) Pull(imageName string) error { - _, err := cc.run("pull", imageName) - return err -} - -// RunPod creates a sandbox. It corresponds to `crictl runp`. -func (cc *Crictl) RunPod(sbSpecFile string) (string, error) { - podID, err := cc.run("runp", sbSpecFile) - if err != nil { - return "", fmt.Errorf("runp failed: %v", err) - } - // Strip the trailing newline from crictl output. - return strings.TrimSpace(podID), nil -} - -// Create creates a container within a sandbox. It corresponds to `crictl -// create`. -func (cc *Crictl) Create(podID, contSpecFile, sbSpecFile string) (string, error) { - podID, err := cc.run("create", podID, contSpecFile, sbSpecFile) - if err != nil { - return "", fmt.Errorf("create failed: %v", err) - } - // Strip the trailing newline from crictl output. - return strings.TrimSpace(podID), nil -} - -// Start starts a container. It corresponds to `crictl start`. -func (cc *Crictl) Start(contID string) (string, error) { - output, err := cc.run("start", contID) - if err != nil { - return "", fmt.Errorf("start failed: %v", err) - } - return output, nil -} - -// Stop stops a container. It corresponds to `crictl stop`. -func (cc *Crictl) Stop(contID string) error { - _, err := cc.run("stop", contID) - return err -} - -// Exec execs a program inside a container. It corresponds to `crictl exec`. -func (cc *Crictl) Exec(contID string, args ...string) (string, error) { - a := []string{"exec", contID} - a = append(a, args...) - output, err := cc.run(a...) - if err != nil { - return "", fmt.Errorf("exec failed: %v", err) - } - return output, nil -} - -// Rm removes a container. It corresponds to `crictl rm`. -func (cc *Crictl) Rm(contID string) error { - _, err := cc.run("rm", contID) - return err -} - -// StopPod stops a pod. It corresponds to `crictl stopp`. -func (cc *Crictl) StopPod(podID string) error { - _, err := cc.run("stopp", podID) - return err -} - -// containsConfig is a minimal copy of -// https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/apis/cri/runtime/v1alpha2/api.proto -// It only contains fields needed for testing. -type containerConfig struct { - Status containerStatus -} - -type containerStatus struct { - Network containerNetwork -} - -type containerNetwork struct { - IP string -} - -// PodIP returns a pod's IP address. -func (cc *Crictl) PodIP(podID string) (string, error) { - output, err := cc.run("inspectp", podID) - if err != nil { - return "", err - } - conf := &containerConfig{} - if err := json.Unmarshal([]byte(output), conf); err != nil { - return "", fmt.Errorf("failed to unmarshal JSON: %v, %s", err, output) - } - if conf.Status.Network.IP == "" { - return "", fmt.Errorf("no IP found in config: %s", output) - } - return conf.Status.Network.IP, nil -} - -// RmPod removes a container. It corresponds to `crictl rmp`. -func (cc *Crictl) RmPod(podID string) error { - _, err := cc.run("rmp", podID) - return err -} - -// StartContainer pulls the given image ands starts the container in the -// sandbox with the given podID. -func (cc *Crictl) StartContainer(podID, image, sbSpec, contSpec string) (string, error) { - // Write the specs to files that can be read by crictl. - sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec) - if err != nil { - return "", fmt.Errorf("failed to write sandbox spec: %v", err) - } - contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec) - if err != nil { - return "", fmt.Errorf("failed to write container spec: %v", err) - } - - return cc.startContainer(podID, image, sbSpecFile, contSpecFile) -} - -func (cc *Crictl) startContainer(podID, image, sbSpecFile, contSpecFile string) (string, error) { - if err := cc.Pull(image); err != nil { - return "", fmt.Errorf("failed to pull %s: %v", image, err) - } - - contID, err := cc.Create(podID, contSpecFile, sbSpecFile) - if err != nil { - return "", fmt.Errorf("failed to create container in pod %q: %v", podID, err) - } - - if _, err := cc.Start(contID); err != nil { - return "", fmt.Errorf("failed to start container %q in pod %q: %v", contID, podID, err) - } - - return contID, nil -} - -// StopContainer stops and deletes the container with the given container ID. -func (cc *Crictl) StopContainer(contID string) error { - if err := cc.Stop(contID); err != nil { - return fmt.Errorf("failed to stop container %q: %v", contID, err) - } - - if err := cc.Rm(contID); err != nil { - return fmt.Errorf("failed to remove container %q: %v", contID, err) - } - - return nil -} - -// StartPodAndContainer pulls an image, then starts a sandbox and container in -// that sandbox. It returns the pod ID and container ID. -func (cc *Crictl) StartPodAndContainer(image, sbSpec, contSpec string) (string, string, error) { - // Write the specs to files that can be read by crictl. - sbSpecFile, err := testutil.WriteTmpFile("sbSpec", sbSpec) - if err != nil { - return "", "", fmt.Errorf("failed to write sandbox spec: %v", err) - } - contSpecFile, err := testutil.WriteTmpFile("contSpec", contSpec) - if err != nil { - return "", "", fmt.Errorf("failed to write container spec: %v", err) - } - - podID, err := cc.RunPod(sbSpecFile) - if err != nil { - return "", "", err - } - - contID, err := cc.startContainer(podID, image, sbSpecFile, contSpecFile) - - return podID, contID, err -} - -// StopPodAndContainer stops a container and pod. -func (cc *Crictl) StopPodAndContainer(podID, contID string) error { - if err := cc.StopContainer(contID); err != nil { - return fmt.Errorf("failed to stop container %q in pod %q: %v", contID, podID, err) - } - - if err := cc.StopPod(podID); err != nil { - return fmt.Errorf("failed to stop pod %q: %v", podID, err) - } - - if err := cc.RmPod(podID); err != nil { - return fmt.Errorf("failed to remove pod %q: %v", podID, err) - } - - return nil -} - -// run runs crictl with the given args and returns an error if it takes longer -// than cc.Timeout to run. -func (cc *Crictl) run(args ...string) (string, error) { - defaultArgs := []string{ - "--image-endpoint", cc.imageEndpoint, - "--runtime-endpoint", cc.runtimeEndpoint, - } - cmd := exec.Command(cc.executable, append(defaultArgs, args...)...) - - // Run the command with a timeout. - done := make(chan string) - errCh := make(chan error) - go func() { - output, err := cmd.CombinedOutput() - if err != nil { - errCh <- fmt.Errorf("error: \"%v\", output: %s", err, string(output)) - return - } - done <- string(output) - }() - select { - case output := <-done: - return output, nil - case err := <-errCh: - return "", err - case <-time.After(cc.timeout): - if err := testutil.KillCommand(cmd); err != nil { - return "", fmt.Errorf("timed out, then couldn't kill process %+v: %v", cmd, err) - } - return "", fmt.Errorf("timed out: %+v", cmd) - } -} diff --git a/runsc/dockerutil/BUILD b/runsc/dockerutil/BUILD deleted file mode 100644 index 8621af901..000000000 --- a/runsc/dockerutil/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "dockerutil", - testonly = 1, - srcs = ["dockerutil.go"], - visibility = ["//:sandbox"], - deps = [ - "//runsc/testutil", - "@com_github_kr_pty//:go_default_library", - ], -) diff --git a/runsc/dockerutil/dockerutil.go b/runsc/dockerutil/dockerutil.go deleted file mode 100644 index f009486bc..000000000 --- a/runsc/dockerutil/dockerutil.go +++ /dev/null @@ -1,486 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package dockerutil is a collection of utility functions, primarily for -// testing. -package dockerutil - -import ( - "encoding/json" - "flag" - "fmt" - "io/ioutil" - "log" - "os" - "os/exec" - "path" - "regexp" - "strconv" - "strings" - "syscall" - "time" - - "github.com/kr/pty" - "gvisor.dev/gvisor/runsc/testutil" -) - -var ( - // runtime is the runtime to use for tests. This will be applied to all - // containers. Note that the default here ("runsc") corresponds to the - // default used by the installations. This is important, because the - // default installer for vm_tests (in tools/installers:head, invoked - // via tools/vm:defs.bzl) will install with this name. So without - // changing anything, tests should have a runsc runtime available to - // them. Otherwise installers should update the existing runtime - // instead of installing a new one. - runtime = flag.String("runtime", "runsc", "specify which runtime to use") - - // config is the default Docker daemon configuration path. - config = flag.String("config_path", "/etc/docker/daemon.json", "configuration file for reading paths") -) - -// EnsureSupportedDockerVersion checks if correct docker is installed. -func EnsureSupportedDockerVersion() { - cmd := exec.Command("docker", "version") - out, err := cmd.CombinedOutput() - if err != nil { - log.Fatalf("Error running %q: %v", "docker version", err) - } - re := regexp.MustCompile(`Version:\s+(\d+)\.(\d+)\.\d.*`) - matches := re.FindStringSubmatch(string(out)) - if len(matches) != 3 { - log.Fatalf("Invalid docker output: %s", out) - } - major, _ := strconv.Atoi(matches[1]) - minor, _ := strconv.Atoi(matches[2]) - if major < 17 || (major == 17 && minor < 9) { - log.Fatalf("Docker version 17.09.0 or greater is required, found: %02d.%02d", major, minor) - } -} - -// RuntimePath returns the binary path for the current runtime. -func RuntimePath() (string, error) { - // Read the configuration data; the file must exist. - configBytes, err := ioutil.ReadFile(*config) - if err != nil { - return "", err - } - - // Unmarshal the configuration. - c := make(map[string]interface{}) - if err := json.Unmarshal(configBytes, &c); err != nil { - return "", err - } - - // Decode the expected configuration. - r, ok := c["runtimes"] - if !ok { - return "", fmt.Errorf("no runtimes declared: %v", c) - } - rs, ok := r.(map[string]interface{}) - if !ok { - // The runtimes are not a map. - return "", fmt.Errorf("unexpected format: %v", c) - } - r, ok = rs[*runtime] - if !ok { - // The expected runtime is not declared. - return "", fmt.Errorf("runtime %q not found: %v", *runtime, c) - } - rs, ok = r.(map[string]interface{}) - if !ok { - // The runtime is not a map. - return "", fmt.Errorf("unexpected format: %v", c) - } - p, ok := rs["path"].(string) - if !ok { - // The runtime does not declare a path. - return "", fmt.Errorf("unexpected format: %v", c) - } - return p, nil -} - -// MountMode describes if the mount should be ro or rw. -type MountMode int - -const ( - // ReadOnly is what the name says. - ReadOnly MountMode = iota - // ReadWrite is what the name says. - ReadWrite -) - -// String returns the mount mode argument for this MountMode. -func (m MountMode) String() string { - switch m { - case ReadOnly: - return "ro" - case ReadWrite: - return "rw" - } - panic(fmt.Sprintf("invalid mode: %d", m)) -} - -// MountArg formats the volume argument to mount in the container. -func MountArg(source, target string, mode MountMode) string { - return fmt.Sprintf("-v=%s:%s:%v", source, target, mode) -} - -// LinkArg formats the link argument. -func LinkArg(source *Docker, target string) string { - return fmt.Sprintf("--link=%s:%s", source.Name, target) -} - -// PrepareFiles creates temp directory to copy files there. The sandbox doesn't -// have access to files in the test dir. -func PrepareFiles(names ...string) (string, error) { - dir, err := ioutil.TempDir("", "image-test") - if err != nil { - return "", fmt.Errorf("ioutil.TempDir failed: %v", err) - } - if err := os.Chmod(dir, 0777); err != nil { - return "", fmt.Errorf("os.Chmod(%q, 0777) failed: %v", dir, err) - } - for _, name := range names { - src, err := testutil.FindFile(name) - if err != nil { - return "", fmt.Errorf("testutil.Preparefiles(%q) failed: %v", name, err) - } - dst := path.Join(dir, path.Base(name)) - if err := testutil.Copy(src, dst); err != nil { - return "", fmt.Errorf("testutil.Copy(%q, %q) failed: %v", src, dst, err) - } - } - return dir, nil -} - -// do executes docker command. -func do(args ...string) (string, error) { - log.Printf("Running: docker %s\n", args) - cmd := exec.Command("docker", args...) - out, err := cmd.CombinedOutput() - if err != nil { - return "", fmt.Errorf("error executing docker %s: %v\nout: %s", args, err, out) - } - return string(out), nil -} - -// doWithPty executes docker command with stdio attached to a pty. -func doWithPty(args ...string) (*exec.Cmd, *os.File, error) { - log.Printf("Running with pty: docker %s\n", args) - cmd := exec.Command("docker", args...) - ptmx, err := pty.Start(cmd) - if err != nil { - return nil, nil, fmt.Errorf("error executing docker %s with a pty: %v", args, err) - } - return cmd, ptmx, nil -} - -// Pull pulls a docker image. This is used in tests to isolate the -// time to pull the image off the network from the time to actually -// start the container, to avoid timeouts over slow networks. -func Pull(image string) error { - _, err := do("pull", image) - return err -} - -// Docker contains the name and the runtime of a docker container. -type Docker struct { - Runtime string - Name string -} - -// MakeDocker sets up the struct for a Docker container. -// Names of containers will be unique. -func MakeDocker(namePrefix string) Docker { - return Docker{ - Name: testutil.RandomName(namePrefix), - Runtime: *runtime, - } -} - -// logDockerID logs a container id, which is needed to find container runsc logs. -func (d *Docker) logDockerID() { - id, err := d.ID() - if err != nil { - log.Printf("%v\n", err) - } - log.Printf("Name: %s ID: %v\n", d.Name, id) -} - -// Create calls 'docker create' with the arguments provided. -func (d *Docker) Create(args ...string) error { - a := []string{"create", "--runtime", d.Runtime, "--name", d.Name} - a = append(a, args...) - _, err := do(a...) - if err == nil { - d.logDockerID() - } - return err -} - -// Start calls 'docker start'. -func (d *Docker) Start() error { - if _, err := do("start", d.Name); err != nil { - return fmt.Errorf("error starting container %q: %v", d.Name, err) - } - return nil -} - -// Stop calls 'docker stop'. -func (d *Docker) Stop() error { - if _, err := do("stop", d.Name); err != nil { - return fmt.Errorf("error stopping container %q: %v", d.Name, err) - } - return nil -} - -// Run calls 'docker run' with the arguments provided. The container starts -// running in the background and the call returns immediately. -func (d *Docker) Run(args ...string) error { - a := d.runArgs("-d") - a = append(a, args...) - _, err := do(a...) - if err == nil { - d.logDockerID() - } - return err -} - -// RunWithPty is like Run but with an attached pty. -func (d *Docker) RunWithPty(args ...string) (*exec.Cmd, *os.File, error) { - a := d.runArgs("-it") - a = append(a, args...) - return doWithPty(a...) -} - -// RunFg calls 'docker run' with the arguments provided in the foreground. It -// blocks until the container exits and returns the output. -func (d *Docker) RunFg(args ...string) (string, error) { - a := d.runArgs(args...) - out, err := do(a...) - if err == nil { - d.logDockerID() - } - return string(out), err -} - -func (d *Docker) runArgs(args ...string) []string { - // Environment variable RUNSC_TEST_NAME is picked up by the runtime and added - // to the log name, so one can easily identify the corresponding logs for - // this test. - rv := []string{"run", "--runtime", d.Runtime, "--name", d.Name, "-e", "RUNSC_TEST_NAME=" + d.Name} - return append(rv, args...) -} - -// Logs calls 'docker logs'. -func (d *Docker) Logs() (string, error) { - return do("logs", d.Name) -} - -// Exec calls 'docker exec' with the arguments provided. -func (d *Docker) Exec(args ...string) (string, error) { - return d.ExecWithFlags(nil, args...) -} - -// ExecWithFlags calls 'docker exec name '. -func (d *Docker) ExecWithFlags(flags []string, args ...string) (string, error) { - a := []string{"exec"} - a = append(a, flags...) - a = append(a, d.Name) - a = append(a, args...) - return do(a...) -} - -// ExecAsUser calls 'docker exec' as the given user with the arguments -// provided. -func (d *Docker) ExecAsUser(user string, args ...string) (string, error) { - a := []string{"exec", "--user", user, d.Name} - a = append(a, args...) - return do(a...) -} - -// ExecWithTerminal calls 'docker exec -it' with the arguments provided and -// attaches a pty to stdio. -func (d *Docker) ExecWithTerminal(args ...string) (*exec.Cmd, *os.File, error) { - a := []string{"exec", "-it", d.Name} - a = append(a, args...) - return doWithPty(a...) -} - -// Pause calls 'docker pause'. -func (d *Docker) Pause() error { - if _, err := do("pause", d.Name); err != nil { - return fmt.Errorf("error pausing container %q: %v", d.Name, err) - } - return nil -} - -// Unpause calls 'docker pause'. -func (d *Docker) Unpause() error { - if _, err := do("unpause", d.Name); err != nil { - return fmt.Errorf("error unpausing container %q: %v", d.Name, err) - } - return nil -} - -// Checkpoint calls 'docker checkpoint'. -func (d *Docker) Checkpoint(name string) error { - if _, err := do("checkpoint", "create", d.Name, name); err != nil { - return fmt.Errorf("error pausing container %q: %v", d.Name, err) - } - return nil -} - -// Restore calls 'docker start --checkname [name]'. -func (d *Docker) Restore(name string) error { - if _, err := do("start", "--checkpoint", name, d.Name); err != nil { - return fmt.Errorf("error starting container %q: %v", d.Name, err) - } - return nil -} - -// Remove calls 'docker rm'. -func (d *Docker) Remove() error { - if _, err := do("rm", d.Name); err != nil { - return fmt.Errorf("error deleting container %q: %v", d.Name, err) - } - return nil -} - -// CleanUp kills and deletes the container (best effort). -func (d *Docker) CleanUp() { - d.logDockerID() - if _, err := do("kill", d.Name); err != nil { - if strings.Contains(err.Error(), "is not running") { - // Nothing to kill. Don't log the error in this case. - } else { - log.Printf("error killing container %q: %v", d.Name, err) - } - } - if err := d.Remove(); err != nil { - log.Print(err) - } -} - -// FindPort returns the host port that is mapped to 'sandboxPort'. This calls -// docker to allocate a free port in the host and prevent conflicts. -func (d *Docker) FindPort(sandboxPort int) (int, error) { - format := fmt.Sprintf(`{{ (index (index .NetworkSettings.Ports "%d/tcp") 0).HostPort }}`, sandboxPort) - out, err := do("inspect", "-f", format, d.Name) - if err != nil { - return -1, fmt.Errorf("error retrieving port: %v", err) - } - port, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) - if err != nil { - return -1, fmt.Errorf("error parsing port %q: %v", out, err) - } - return port, nil -} - -// FindIP returns the IP address of the container as a string. -func (d *Docker) FindIP() (string, error) { - const format = `{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}` - out, err := do("inspect", "-f", format, d.Name) - if err != nil { - return "", fmt.Errorf("error retrieving IP: %v", err) - } - return strings.TrimSpace(out), nil -} - -// SandboxPid returns the PID to the sandbox process. -func (d *Docker) SandboxPid() (int, error) { - out, err := do("inspect", "-f={{.State.Pid}}", d.Name) - if err != nil { - return -1, fmt.Errorf("error retrieving pid: %v", err) - } - pid, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) - if err != nil { - return -1, fmt.Errorf("error parsing pid %q: %v", out, err) - } - return pid, nil -} - -// ID returns the container ID. -func (d *Docker) ID() (string, error) { - out, err := do("inspect", "-f={{.Id}}", d.Name) - if err != nil { - return "", fmt.Errorf("error retrieving ID: %v", err) - } - return strings.TrimSpace(string(out)), nil -} - -// Wait waits for container to exit, up to the given timeout. Returns error if -// wait fails or timeout is hit. Returns the application return code otherwise. -// Note that the application may have failed even if err == nil, always check -// the exit code. -func (d *Docker) Wait(timeout time.Duration) (syscall.WaitStatus, error) { - timeoutChan := time.After(timeout) - waitChan := make(chan (syscall.WaitStatus)) - errChan := make(chan (error)) - - go func() { - out, err := do("wait", d.Name) - if err != nil { - errChan <- fmt.Errorf("error waiting for container %q: %v", d.Name, err) - } - exit, err := strconv.Atoi(strings.TrimSuffix(string(out), "\n")) - if err != nil { - errChan <- fmt.Errorf("error parsing exit code %q: %v", out, err) - } - waitChan <- syscall.WaitStatus(uint32(exit)) - }() - - select { - case ws := <-waitChan: - return ws, nil - case err := <-errChan: - return syscall.WaitStatus(1), err - case <-timeoutChan: - return syscall.WaitStatus(1), fmt.Errorf("timeout waiting for container %q", d.Name) - } -} - -// WaitForOutput calls 'docker logs' to retrieve containers output and searches -// for the given pattern. -func (d *Docker) WaitForOutput(pattern string, timeout time.Duration) (string, error) { - matches, err := d.WaitForOutputSubmatch(pattern, timeout) - if err != nil { - return "", err - } - if len(matches) == 0 { - return "", nil - } - return matches[0], nil -} - -// WaitForOutputSubmatch calls 'docker logs' to retrieve containers output and -// searches for the given pattern. It returns any regexp submatches as well. -func (d *Docker) WaitForOutputSubmatch(pattern string, timeout time.Duration) ([]string, error) { - re := regexp.MustCompile(pattern) - var out string - for exp := time.Now().Add(timeout); time.Now().Before(exp); { - var err error - out, err = d.Logs() - if err != nil { - return nil, err - } - if matches := re.FindStringSubmatch(out); matches != nil { - // Success! - return matches, nil - } - time.Sleep(100 * time.Millisecond) - } - return nil, fmt.Errorf("timeout waiting for output %q: %s", re.String(), out) -} diff --git a/runsc/testutil/BUILD b/runsc/testutil/BUILD deleted file mode 100644 index 945405303..000000000 --- a/runsc/testutil/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "testutil", - testonly = 1, - srcs = [ - "testutil.go", - "testutil_runfiles.go", - ], - visibility = ["//:sandbox"], - deps = [ - "//pkg/log", - "//pkg/sync", - "//runsc/boot", - "//runsc/specutils", - "@com_github_cenkalti_backoff//:go_default_library", - "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", - ], -) diff --git a/runsc/testutil/testutil.go b/runsc/testutil/testutil.go deleted file mode 100644 index 5e09f8f16..000000000 --- a/runsc/testutil/testutil.go +++ /dev/null @@ -1,433 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package testutil contains utility functions for runsc tests. -package testutil - -import ( - "bufio" - "context" - "debug/elf" - "encoding/base32" - "encoding/json" - "flag" - "fmt" - "io" - "io/ioutil" - "math" - "math/rand" - "net/http" - "os" - "os/exec" - "os/signal" - "path" - "path/filepath" - "strconv" - "strings" - "sync/atomic" - "syscall" - "testing" - "time" - - "github.com/cenkalti/backoff" - specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/sync" - "gvisor.dev/gvisor/runsc/boot" - "gvisor.dev/gvisor/runsc/specutils" -) - -var ( - checkpoint = flag.Bool("checkpoint", true, "control checkpoint/restore support") -) - -func init() { - rand.Seed(time.Now().UnixNano()) -} - -// IsCheckpointSupported returns the relevant command line flag. -func IsCheckpointSupported() bool { - return *checkpoint -} - -// TmpDir returns the absolute path to a writable directory that can be used as -// scratch by the test. -func TmpDir() string { - dir := os.Getenv("TEST_TMPDIR") - if dir == "" { - dir = "/tmp" - } - return dir -} - -// ConfigureExePath configures the executable for runsc in the test environment. -func ConfigureExePath() error { - path, err := FindFile("runsc/runsc") - if err != nil { - return err - } - specutils.ExePath = path - return nil -} - -// TestConfig returns the default configuration to use in tests. Note that -// 'RootDir' must be set by caller if required. -func TestConfig(t *testing.T) *boot.Config { - logDir := "" - if dir, ok := os.LookupEnv("TEST_UNDECLARED_OUTPUTS_DIR"); ok { - logDir = dir + "/" - } - return &boot.Config{ - Debug: true, - DebugLog: path.Join(logDir, "runsc.log."+t.Name()+".%TIMESTAMP%.%COMMAND%"), - LogFormat: "text", - DebugLogFormat: "text", - LogPackets: true, - Network: boot.NetworkNone, - Strace: true, - Platform: "ptrace", - FileAccess: boot.FileAccessExclusive, - NumNetworkChannels: 1, - - TestOnlyAllowRunAsCurrentUserWithoutChroot: true, - } -} - -// NewSpecWithArgs creates a simple spec with the given args suitable for use -// in tests. -func NewSpecWithArgs(args ...string) *specs.Spec { - return &specs.Spec{ - // The host filesystem root is the container root. - Root: &specs.Root{ - Path: "/", - Readonly: true, - }, - Process: &specs.Process{ - Args: args, - Env: []string{ - "PATH=" + os.Getenv("PATH"), - }, - Capabilities: specutils.AllCapabilities(), - }, - Mounts: []specs.Mount{ - // Hide the host /etc to avoid any side-effects. - // For example, bash reads /etc/passwd and if it is - // very big, tests can fail by timeout. - { - Type: "tmpfs", - Destination: "/etc", - }, - // Root is readonly, but many tests want to write to tmpdir. - // This creates a writable mount inside the root. Also, when tmpdir points - // to "/tmp", it makes the the actual /tmp to be mounted and not a tmpfs - // inside the sentry. - { - Type: "bind", - Destination: TmpDir(), - Source: TmpDir(), - }, - }, - Hostname: "runsc-test-hostname", - } -} - -// SetupRootDir creates a root directory for containers. -func SetupRootDir() (string, error) { - rootDir, err := ioutil.TempDir(TmpDir(), "containers") - if err != nil { - return "", fmt.Errorf("error creating root dir: %v", err) - } - return rootDir, nil -} - -// SetupContainer creates a bundle and root dir for the container, generates a -// test config, and writes the spec to config.json in the bundle dir. -func SetupContainer(spec *specs.Spec, conf *boot.Config) (rootDir, bundleDir string, err error) { - rootDir, err = SetupRootDir() - if err != nil { - return "", "", err - } - conf.RootDir = rootDir - bundleDir, err = SetupBundleDir(spec) - return rootDir, bundleDir, err -} - -// SetupBundleDir creates a bundle dir and writes the spec to config.json. -func SetupBundleDir(spec *specs.Spec) (bundleDir string, err error) { - bundleDir, err = ioutil.TempDir(TmpDir(), "bundle") - if err != nil { - return "", fmt.Errorf("error creating bundle dir: %v", err) - } - - if err = writeSpec(bundleDir, spec); err != nil { - return "", fmt.Errorf("error writing spec: %v", err) - } - return bundleDir, nil -} - -// writeSpec writes the spec to disk in the given directory. -func writeSpec(dir string, spec *specs.Spec) error { - b, err := json.Marshal(spec) - if err != nil { - return err - } - return ioutil.WriteFile(filepath.Join(dir, "config.json"), b, 0755) -} - -// UniqueContainerID generates a unique container id for each test. -// -// The container id is used to create an abstract unix domain socket, which must -// be unique. While the container forbids creating two containers with the same -// name, sometimes between test runs the socket does not get cleaned up quickly -// enough, causing container creation to fail. -func UniqueContainerID() string { - // Read 20 random bytes. - b := make([]byte, 20) - // "[Read] always returns len(p) and a nil error." --godoc - if _, err := rand.Read(b); err != nil { - panic("rand.Read failed: " + err.Error()) - } - // base32 encode the random bytes, so that the name is a valid - // container id and can be used as a socket name in the filesystem. - return fmt.Sprintf("test-container-%s", base32.StdEncoding.EncodeToString(b)) -} - -// Copy copies file from src to dst. -func Copy(src, dst string) error { - in, err := os.Open(src) - if err != nil { - return err - } - defer in.Close() - - out, err := os.Create(dst) - if err != nil { - return err - } - defer out.Close() - - _, err = io.Copy(out, in) - return err -} - -// Poll is a shorthand function to poll for something with given timeout. -func Poll(cb func() error, timeout time.Duration) error { - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() - b := backoff.WithContext(backoff.NewConstantBackOff(100*time.Millisecond), ctx) - return backoff.Retry(cb, b) -} - -// WaitForHTTP tries GET requests on a port until the call succeeds or timeout. -func WaitForHTTP(port int, timeout time.Duration) error { - cb := func() error { - c := &http.Client{ - // Calculate timeout to be able to do minimum 5 attempts. - Timeout: timeout / 5, - } - url := fmt.Sprintf("http://localhost:%d/", port) - resp, err := c.Get(url) - if err != nil { - log.Infof("Waiting %s: %v", url, err) - return err - } - resp.Body.Close() - return nil - } - return Poll(cb, timeout) -} - -// Reaper reaps child processes. -type Reaper struct { - // mu protects ch, which will be nil if the reaper is not running. - mu sync.Mutex - ch chan os.Signal -} - -// Start starts reaping child processes. -func (r *Reaper) Start() { - r.mu.Lock() - defer r.mu.Unlock() - - if r.ch != nil { - panic("reaper.Start called on a running reaper") - } - - r.ch = make(chan os.Signal, 1) - signal.Notify(r.ch, syscall.SIGCHLD) - - go func() { - for { - r.mu.Lock() - ch := r.ch - r.mu.Unlock() - if ch == nil { - return - } - - _, ok := <-ch - if !ok { - // Channel closed. - return - } - for { - cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil) - if cpid < 1 { - break - } - } - } - }() -} - -// Stop stops reaping child processes. -func (r *Reaper) Stop() { - r.mu.Lock() - defer r.mu.Unlock() - - if r.ch == nil { - panic("reaper.Stop called on a stopped reaper") - } - - signal.Stop(r.ch) - close(r.ch) - r.ch = nil -} - -// StartReaper is a helper that starts a new Reaper and returns a function to -// stop it. -func StartReaper() func() { - r := &Reaper{} - r.Start() - return r.Stop -} - -// WaitUntilRead reads from the given reader until the wanted string is found -// or until timeout. -func WaitUntilRead(r io.Reader, want string, split bufio.SplitFunc, timeout time.Duration) error { - sc := bufio.NewScanner(r) - if split != nil { - sc.Split(split) - } - // done must be accessed atomically. A value greater than 0 indicates - // that the read loop can exit. - var done uint32 - doneCh := make(chan struct{}) - go func() { - for sc.Scan() { - t := sc.Text() - if strings.Contains(t, want) { - atomic.StoreUint32(&done, 1) - close(doneCh) - break - } - if atomic.LoadUint32(&done) > 0 { - break - } - } - }() - select { - case <-time.After(timeout): - atomic.StoreUint32(&done, 1) - return fmt.Errorf("timeout waiting to read %q", want) - case <-doneCh: - return nil - } -} - -// KillCommand kills the process running cmd unless it hasn't been started. It -// returns an error if it cannot kill the process unless the reason is that the -// process has already exited. -func KillCommand(cmd *exec.Cmd) error { - if cmd.Process == nil { - return nil - } - if err := cmd.Process.Kill(); err != nil { - if !strings.Contains(err.Error(), "process already finished") { - return fmt.Errorf("failed to kill process %v: %v", cmd, err) - } - } - return nil -} - -// WriteTmpFile writes text to a temporary file, closes the file, and returns -// the name of the file. -func WriteTmpFile(pattern, text string) (string, error) { - file, err := ioutil.TempFile(TmpDir(), pattern) - if err != nil { - return "", err - } - defer file.Close() - if _, err := file.Write([]byte(text)); err != nil { - return "", err - } - return file.Name(), nil -} - -// RandomName create a name with a 6 digit random number appended to it. -func RandomName(prefix string) string { - return fmt.Sprintf("%s-%06d", prefix, rand.Int31n(1000000)) -} - -// IsStatic returns true iff the given file is a static binary. -func IsStatic(filename string) (bool, error) { - f, err := elf.Open(filename) - if err != nil { - return false, err - } - for _, prog := range f.Progs { - if prog.Type == elf.PT_INTERP { - return false, nil // Has interpreter. - } - } - return true, nil -} - -// TestIndicesForShard returns indices for this test shard based on the -// TEST_SHARD_INDEX and TEST_TOTAL_SHARDS environment vars. -// -// If either of the env vars are not present, then the function will return all -// tests. If there are more shards than there are tests, then the returned list -// may be empty. -func TestIndicesForShard(numTests int) ([]int, error) { - var ( - shardIndex = 0 - shardTotal = 1 - ) - - indexStr, totalStr := os.Getenv("TEST_SHARD_INDEX"), os.Getenv("TEST_TOTAL_SHARDS") - if indexStr != "" && totalStr != "" { - // Parse index and total to ints. - var err error - shardIndex, err = strconv.Atoi(indexStr) - if err != nil { - return nil, fmt.Errorf("invalid TEST_SHARD_INDEX %q: %v", indexStr, err) - } - shardTotal, err = strconv.Atoi(totalStr) - if err != nil { - return nil, fmt.Errorf("invalid TEST_TOTAL_SHARDS %q: %v", totalStr, err) - } - } - - // Calculate! - var indices []int - numBlocks := int(math.Ceil(float64(numTests) / float64(shardTotal))) - for i := 0; i < numBlocks; i++ { - pick := i*shardTotal + shardIndex - if pick < numTests { - indices = append(indices, pick) - } - } - return indices, nil -} diff --git a/runsc/testutil/testutil_runfiles.go b/runsc/testutil/testutil_runfiles.go deleted file mode 100644 index ece9ea9a1..000000000 --- a/runsc/testutil/testutil_runfiles.go +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testutil - -import ( - "fmt" - "os" - "path/filepath" -) - -// FindFile searchs for a file inside the test run environment. It returns the -// full path to the file. It fails if none or more than one file is found. -func FindFile(path string) (string, error) { - wd, err := os.Getwd() - if err != nil { - return "", err - } - - // The test root is demarcated by a path element called "__main__". Search for - // it backwards from the working directory. - root := wd - for { - dir, name := filepath.Split(root) - if name == "__main__" { - break - } - if len(dir) == 0 { - return "", fmt.Errorf("directory __main__ not found in %q", wd) - } - // Remove ending slash to loop around. - root = dir[:len(dir)-1] - } - - // Annoyingly, bazel adds the build type to the directory path for go - // binaries, but not for c++ binaries. We use two different patterns to - // to find our file. - patterns := []string{ - // Try the obvious path first. - filepath.Join(root, path), - // If it was a go binary, use a wildcard to match the build - // type. The pattern is: /test-path/__main__/directories/*/file. - filepath.Join(root, filepath.Dir(path), "*", filepath.Base(path)), - } - - for _, p := range patterns { - matches, err := filepath.Glob(p) - if err != nil { - // "The only possible returned error is ErrBadPattern, - // when pattern is malformed." -godoc - return "", fmt.Errorf("error globbing %q: %v", p, err) - } - switch len(matches) { - case 0: - // Try the next pattern. - case 1: - // We found it. - return matches[0], nil - default: - return "", fmt.Errorf("more than one match found for %q: %s", path, matches) - } - } - return "", fmt.Errorf("file %q not found", path) -} diff --git a/scripts/iptables_tests.sh b/scripts/iptables_tests.sh index 0f46909ac..c8da1f32d 100755 --- a/scripts/iptables_tests.sh +++ b/scripts/iptables_tests.sh @@ -17,14 +17,5 @@ source $(dirname $0)/common.sh install_runsc_for_test iptables --net-raw - -# Build the docker image for the test. -run //test/iptables/runner:runner-image --norun - -test //test/iptables:iptables_test \ - "--test_arg=--runtime=runc" \ - "--test_arg=--image=bazel/test/iptables/runner:runner-image" - -test //test/iptables:iptables_test \ - "--test_arg=--runtime=${RUNTIME}" \ - "--test_arg=--image=bazel/test/iptables/runner:runner-image" +test //test/iptables:iptables_test --test_arg=--runtime=runc +test //test/iptables:iptables_test --test_arg=--runtime=${RUNTIME} diff --git a/test/cmd/test_app/BUILD b/test/cmd/test_app/BUILD new file mode 100644 index 000000000..98ba5a3d9 --- /dev/null +++ b/test/cmd/test_app/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_binary") + +package(licenses = ["notice"]) + +go_binary( + name = "test_app", + testonly = 1, + srcs = [ + "fds.go", + "test_app.go", + ], + pure = True, + visibility = ["//runsc/container:__pkg__"], + deps = [ + "//pkg/test/testutil", + "//pkg/unet", + "//runsc/flag", + "@com_github_google_subcommands//:go_default_library", + "@com_github_kr_pty//:go_default_library", + ], +) diff --git a/test/cmd/test_app/fds.go b/test/cmd/test_app/fds.go new file mode 100644 index 000000000..a7658eefd --- /dev/null +++ b/test/cmd/test_app/fds.go @@ -0,0 +1,185 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "io/ioutil" + "log" + "os" + "time" + + "github.com/google/subcommands" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/runsc/flag" +) + +const fileContents = "foobarbaz" + +// fdSender will open a file and send the FD over a unix domain socket. +type fdSender struct { + socketPath string +} + +// Name implements subcommands.Command.Name. +func (*fdSender) Name() string { + return "fd_sender" +} + +// Synopsis implements subcommands.Command.Synopsys. +func (*fdSender) Synopsis() string { + return "creates a file and sends the FD over the socket" +} + +// Usage implements subcommands.Command.Usage. +func (*fdSender) Usage() string { + return "fd_sender " +} + +// SetFlags implements subcommands.Command.SetFlags. +func (fds *fdSender) SetFlags(f *flag.FlagSet) { + f.StringVar(&fds.socketPath, "socket", "", "path to socket") +} + +// Execute implements subcommands.Command.Execute. +func (fds *fdSender) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if fds.socketPath == "" { + log.Fatalf("socket flag must be set") + } + + dir, err := ioutil.TempDir("", "") + if err != nil { + log.Fatalf("TempDir failed: %v", err) + } + + fileToSend, err := ioutil.TempFile(dir, "") + if err != nil { + log.Fatalf("TempFile failed: %v", err) + } + defer fileToSend.Close() + + if _, err := fileToSend.WriteString(fileContents); err != nil { + log.Fatalf("Write(%q) failed: %v", fileContents, err) + } + + // Receiver may not be started yet, so try connecting in a poll loop. + var s *unet.Socket + if err := testutil.Poll(func() error { + var err error + s, err = unet.Connect(fds.socketPath, true /* SEQPACKET, so we can send empty message with FD */) + return err + }, 10*time.Second); err != nil { + log.Fatalf("Error connecting to socket %q: %v", fds.socketPath, err) + } + defer s.Close() + + w := s.Writer(true) + w.ControlMessage.PackFDs(int(fileToSend.Fd())) + if _, err := w.WriteVec([][]byte{[]byte{'a'}}); err != nil { + log.Fatalf("Error sending FD %q over socket %q: %v", fileToSend.Fd(), fds.socketPath, err) + } + + log.Print("FD SENDER exiting successfully") + return subcommands.ExitSuccess +} + +// fdReceiver receives an FD from a unix domain socket and does things to it. +type fdReceiver struct { + socketPath string +} + +// Name implements subcommands.Command.Name. +func (*fdReceiver) Name() string { + return "fd_receiver" +} + +// Synopsis implements subcommands.Command.Synopsys. +func (*fdReceiver) Synopsis() string { + return "reads an FD from a unix socket, and then does things to it" +} + +// Usage implements subcommands.Command.Usage. +func (*fdReceiver) Usage() string { + return "fd_receiver " +} + +// SetFlags implements subcommands.Command.SetFlags. +func (fdr *fdReceiver) SetFlags(f *flag.FlagSet) { + f.StringVar(&fdr.socketPath, "socket", "", "path to socket") +} + +// Execute implements subcommands.Command.Execute. +func (fdr *fdReceiver) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if fdr.socketPath == "" { + log.Fatalf("Flags cannot be empty, given: socket: %q", fdr.socketPath) + } + + ss, err := unet.BindAndListen(fdr.socketPath, true /* packet */) + if err != nil { + log.Fatalf("BindAndListen(%q) failed: %v", fdr.socketPath, err) + } + defer ss.Close() + + var s *unet.Socket + c := make(chan error, 1) + go func() { + var err error + s, err = ss.Accept() + c <- err + }() + + select { + case err := <-c: + if err != nil { + log.Fatalf("Accept() failed: %v", err) + } + case <-time.After(10 * time.Second): + log.Fatalf("Timeout waiting for accept") + } + + r := s.Reader(true) + r.EnableFDs(1) + b := [][]byte{{'a'}} + if n, err := r.ReadVec(b); n != 1 || err != nil { + log.Fatalf("ReadVec got n=%d err %v (wanted 0, nil)", n, err) + } + + fds, err := r.ExtractFDs() + if err != nil { + log.Fatalf("ExtractFD() got err %v", err) + } + if len(fds) != 1 { + log.Fatalf("ExtractFD() got %d FDs, wanted 1", len(fds)) + } + fd := fds[0] + + file := os.NewFile(uintptr(fd), "received file") + defer file.Close() + if _, err := file.Seek(0, os.SEEK_SET); err != nil { + log.Fatalf("Seek(0, 0) failed: %v", err) + } + + got, err := ioutil.ReadAll(file) + if err != nil { + log.Fatalf("ReadAll failed: %v", err) + } + if string(got) != fileContents { + log.Fatalf("ReadAll got %q want %q", string(got), fileContents) + } + + log.Print("FD RECEIVER exiting successfully") + return subcommands.ExitSuccess +} diff --git a/test/cmd/test_app/test_app.go b/test/cmd/test_app/test_app.go new file mode 100644 index 000000000..3ba4f38f8 --- /dev/null +++ b/test/cmd/test_app/test_app.go @@ -0,0 +1,394 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary test_app is like a swiss knife for tests that need to run anything +// inside the sandbox. New functionality can be added with new commands. +package main + +import ( + "context" + "fmt" + "io" + "io/ioutil" + "log" + "net" + "os" + "os/exec" + "regexp" + "strconv" + sys "syscall" + "time" + + "github.com/google/subcommands" + "github.com/kr/pty" + "gvisor.dev/gvisor/pkg/test/testutil" + "gvisor.dev/gvisor/runsc/flag" +) + +func main() { + subcommands.Register(subcommands.HelpCommand(), "") + subcommands.Register(subcommands.FlagsCommand(), "") + subcommands.Register(new(capability), "") + subcommands.Register(new(fdReceiver), "") + subcommands.Register(new(fdSender), "") + subcommands.Register(new(forkBomb), "") + subcommands.Register(new(ptyRunner), "") + subcommands.Register(new(reaper), "") + subcommands.Register(new(syscall), "") + subcommands.Register(new(taskTree), "") + subcommands.Register(new(uds), "") + + flag.Parse() + + exitCode := subcommands.Execute(context.Background()) + os.Exit(int(exitCode)) +} + +type uds struct { + fileName string + socketPath string +} + +// Name implements subcommands.Command.Name. +func (*uds) Name() string { + return "uds" +} + +// Synopsis implements subcommands.Command.Synopsys. +func (*uds) Synopsis() string { + return "creates unix domain socket client and server. Client sends a contant flow of sequential numbers. Server prints them to --file" +} + +// Usage implements subcommands.Command.Usage. +func (*uds) Usage() string { + return "uds " +} + +// SetFlags implements subcommands.Command.SetFlags. +func (c *uds) SetFlags(f *flag.FlagSet) { + f.StringVar(&c.fileName, "file", "", "name of output file") + f.StringVar(&c.socketPath, "socket", "", "path to socket") +} + +// Execute implements subcommands.Command.Execute. +func (c *uds) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if c.fileName == "" || c.socketPath == "" { + log.Fatalf("Flags cannot be empty, given: fileName: %q, socketPath: %q", c.fileName, c.socketPath) + return subcommands.ExitFailure + } + outputFile, err := os.OpenFile(c.fileName, os.O_WRONLY|os.O_CREATE, 0666) + if err != nil { + log.Fatal("error opening output file:", err) + } + + defer os.Remove(c.socketPath) + + listener, err := net.Listen("unix", c.socketPath) + if err != nil { + log.Fatalf("error listening on socket %q: %v", c.socketPath, err) + } + + go server(listener, outputFile) + for i := 0; ; i++ { + conn, err := net.Dial("unix", c.socketPath) + if err != nil { + log.Fatal("error dialing:", err) + } + if _, err := conn.Write([]byte(strconv.Itoa(i))); err != nil { + log.Fatal("error writing:", err) + } + conn.Close() + time.Sleep(100 * time.Millisecond) + } +} + +func server(listener net.Listener, out *os.File) { + buf := make([]byte, 16) + + for { + c, err := listener.Accept() + if err != nil { + log.Fatal("error accepting connection:", err) + } + nr, err := c.Read(buf) + if err != nil { + log.Fatal("error reading from buf:", err) + } + data := buf[0:nr] + fmt.Fprint(out, string(data)+"\n") + } +} + +type taskTree struct { + depth int + width int + pause bool +} + +// Name implements subcommands.Command. +func (*taskTree) Name() string { + return "task-tree" +} + +// Synopsis implements subcommands.Command. +func (*taskTree) Synopsis() string { + return "creates a tree of tasks" +} + +// Usage implements subcommands.Command. +func (*taskTree) Usage() string { + return "task-tree " +} + +// SetFlags implements subcommands.Command. +func (c *taskTree) SetFlags(f *flag.FlagSet) { + f.IntVar(&c.depth, "depth", 1, "number of levels to create") + f.IntVar(&c.width, "width", 1, "number of tasks at each level") + f.BoolVar(&c.pause, "pause", false, "whether the tasks should pause perpetually") +} + +// Execute implements subcommands.Command. +func (c *taskTree) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + + if c.depth == 0 { + log.Printf("Child sleeping, PID: %d\n", os.Getpid()) + select {} + } + log.Printf("Parent %d sleeping, PID: %d\n", c.depth, os.Getpid()) + + var cmds []*exec.Cmd + for i := 0; i < c.width; i++ { + cmd := exec.Command( + "/proc/self/exe", c.Name(), + "--depth", strconv.Itoa(c.depth-1), + "--width", strconv.Itoa(c.width), + "--pause", strconv.FormatBool(c.pause)) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + if err := cmd.Start(); err != nil { + log.Fatal("failed to call self:", err) + } + cmds = append(cmds, cmd) + } + + for _, c := range cmds { + c.Wait() + } + + if c.pause { + select {} + } + + return subcommands.ExitSuccess +} + +type forkBomb struct { + delay time.Duration +} + +// Name implements subcommands.Command. +func (*forkBomb) Name() string { + return "fork-bomb" +} + +// Synopsis implements subcommands.Command. +func (*forkBomb) Synopsis() string { + return "creates child process until the end of times" +} + +// Usage implements subcommands.Command. +func (*forkBomb) Usage() string { + return "fork-bomb " +} + +// SetFlags implements subcommands.Command. +func (c *forkBomb) SetFlags(f *flag.FlagSet) { + f.DurationVar(&c.delay, "delay", 100*time.Millisecond, "amount of time to delay creation of child") +} + +// Execute implements subcommands.Command. +func (c *forkBomb) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + time.Sleep(c.delay) + + cmd := exec.Command("/proc/self/exe", c.Name()) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + log.Fatal("failed to call self:", err) + } + return subcommands.ExitSuccess +} + +type reaper struct{} + +// Name implements subcommands.Command. +func (*reaper) Name() string { + return "reaper" +} + +// Synopsis implements subcommands.Command. +func (*reaper) Synopsis() string { + return "reaps all children in a loop" +} + +// Usage implements subcommands.Command. +func (*reaper) Usage() string { + return "reaper " +} + +// SetFlags implements subcommands.Command. +func (*reaper) SetFlags(*flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (c *reaper) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + stop := testutil.StartReaper() + defer stop() + select {} +} + +type syscall struct { + sysno uint64 +} + +// Name implements subcommands.Command. +func (*syscall) Name() string { + return "syscall" +} + +// Synopsis implements subcommands.Command. +func (*syscall) Synopsis() string { + return "syscall makes a syscall" +} + +// Usage implements subcommands.Command. +func (*syscall) Usage() string { + return "syscall " +} + +// SetFlags implements subcommands.Command. +func (s *syscall) SetFlags(f *flag.FlagSet) { + f.Uint64Var(&s.sysno, "syscall", 0, "syscall to call") +} + +// Execute implements subcommands.Command. +func (s *syscall) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if _, _, errno := sys.Syscall(uintptr(s.sysno), 0, 0, 0); errno != 0 { + fmt.Printf("syscall(%d, 0, 0...) failed: %v\n", s.sysno, errno) + } else { + fmt.Printf("syscall(%d, 0, 0...) success\n", s.sysno) + } + return subcommands.ExitSuccess +} + +type capability struct { + enabled uint64 + disabled uint64 +} + +// Name implements subcommands.Command. +func (*capability) Name() string { + return "capability" +} + +// Synopsis implements subcommands.Command. +func (*capability) Synopsis() string { + return "checks if effective capabilities are set/unset" +} + +// Usage implements subcommands.Command. +func (*capability) Usage() string { + return "capability [--enabled=number] [--disabled=number]" +} + +// SetFlags implements subcommands.Command. +func (c *capability) SetFlags(f *flag.FlagSet) { + f.Uint64Var(&c.enabled, "enabled", 0, "") + f.Uint64Var(&c.disabled, "disabled", 0, "") +} + +// Execute implements subcommands.Command. +func (c *capability) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus { + if c.enabled == 0 && c.disabled == 0 { + fmt.Println("One of the flags must be set") + return subcommands.ExitUsageError + } + + status, err := ioutil.ReadFile("/proc/self/status") + if err != nil { + fmt.Printf("Error reading %q: %v\n", "proc/self/status", err) + return subcommands.ExitFailure + } + re := regexp.MustCompile("CapEff:\t([0-9a-f]+)\n") + matches := re.FindStringSubmatch(string(status)) + if matches == nil || len(matches) != 2 { + fmt.Printf("Effective capabilities not found in\n%s\n", status) + return subcommands.ExitFailure + } + caps, err := strconv.ParseUint(matches[1], 16, 64) + if err != nil { + fmt.Printf("failed to convert capabilities %q: %v\n", matches[1], err) + return subcommands.ExitFailure + } + + if c.enabled != 0 && (caps&c.enabled) != c.enabled { + fmt.Printf("Missing capabilities, want: %#x: got: %#x\n", c.enabled, caps) + return subcommands.ExitFailure + } + if c.disabled != 0 && (caps&c.disabled) != 0 { + fmt.Printf("Extra capabilities found, dont_want: %#x: got: %#x\n", c.disabled, caps) + return subcommands.ExitFailure + } + + return subcommands.ExitSuccess +} + +type ptyRunner struct{} + +// Name implements subcommands.Command. +func (*ptyRunner) Name() string { + return "pty-runner" +} + +// Synopsis implements subcommands.Command. +func (*ptyRunner) Synopsis() string { + return "runs the given command with an open pty terminal" +} + +// Usage implements subcommands.Command. +func (*ptyRunner) Usage() string { + return "pty-runner [command]" +} + +// SetFlags implements subcommands.Command.SetFlags. +func (*ptyRunner) SetFlags(f *flag.FlagSet) {} + +// Execute implements subcommands.Command. +func (*ptyRunner) Execute(_ context.Context, fs *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus { + c := exec.Command(fs.Args()[0], fs.Args()[1:]...) + f, err := pty.Start(c) + if err != nil { + fmt.Printf("pty.Start failed: %v", err) + return subcommands.ExitFailure + } + defer f.Close() + + // Copy stdout from the command to keep this process alive until the + // subprocess exits. + io.Copy(os.Stdout, f) + + return subcommands.ExitSuccess +} diff --git a/test/e2e/BUILD b/test/e2e/BUILD index 76e04f878..44cce0e3b 100644 --- a/test/e2e/BUILD +++ b/test/e2e/BUILD @@ -20,9 +20,9 @@ go_test( deps = [ "//pkg/abi/linux", "//pkg/bits", - "//runsc/dockerutil", + "//pkg/test/dockerutil", + "//pkg/test/testutil", "//runsc/specutils", - "//runsc/testutil", ], ) diff --git a/test/e2e/exec_test.go b/test/e2e/exec_test.go index 594c8e752..6a63b1232 100644 --- a/test/e2e/exec_test.go +++ b/test/e2e/exec_test.go @@ -23,6 +23,8 @@ package integration import ( "fmt" + "os" + "os/exec" "strconv" "strings" "syscall" @@ -31,23 +33,23 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" - "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/runsc/specutils" ) // Test that exec uses the exact same capability set as the container. func TestExecCapabilities(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-capabilities-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container. - if err := d.Run("alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + // Check that capability. matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second) if err != nil { t.Fatalf("WaitForOutputSubmatch() timeout: %v", err) @@ -59,7 +61,7 @@ func TestExecCapabilities(t *testing.T) { t.Log("Root capabilities:", want) // Now check that exec'd process capabilities match the root. - got, err := d.Exec("grep", "CapEff:", "/proc/self/status") + got, err := d.Exec(dockerutil.RunOpts{}, "grep", "CapEff:", "/proc/self/status") if err != nil { t.Fatalf("docker exec failed: %v", err) } @@ -72,16 +74,16 @@ func TestExecCapabilities(t *testing.T) { // Test that 'exec --privileged' adds all capabilities, except for CAP_NET_RAW // which is removed from the container when --net-raw=false. func TestExecPrivileged(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-privileged-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container with all capabilities dropped. - if err := d.Run("--cap-drop=all", "alpine", "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + CapDrop: []string{"all"}, + }, "sh", "-c", "cat /proc/self/status; sleep 100"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Check that all capabilities where dropped from container. matches, err := d.WaitForOutputSubmatch("CapEff:\t([0-9a-f]+)\n", 5*time.Second) @@ -100,9 +102,11 @@ func TestExecPrivileged(t *testing.T) { t.Fatalf("Container should have no capabilities: %x", containerCaps) } - // Check that 'exec --privileged' adds all capabilities, except - // for CAP_NET_RAW. - got, err := d.ExecWithFlags([]string{"--privileged"}, "grep", "CapEff:", "/proc/self/status") + // Check that 'exec --privileged' adds all capabilities, except for + // CAP_NET_RAW. + got, err := d.Exec(dockerutil.RunOpts{ + Privileged: true, + }, "grep", "CapEff:", "/proc/self/status") if err != nil { t.Fatalf("docker exec failed: %v", err) } @@ -114,97 +118,99 @@ func TestExecPrivileged(t *testing.T) { } func TestExecJobControl(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-job-control-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container. - if err := d.Run("alpine", "sleep", "1000"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sleep", "1000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Exec 'sh' with an attached pty. - cmd, ptmx, err := d.ExecWithTerminal("sh") - if err != nil { + if _, err := d.Exec(dockerutil.RunOpts{ + Pty: func(cmd *exec.Cmd, ptmx *os.File) { + // Call "sleep 100 | cat" in the shell. We pipe to cat + // so that there will be two processes in the + // foreground process group. + if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep and + // cat, but not the shell. \x03 is ASCII "end of + // text", which is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep + // should have exited with code 2+128=130. We'll exit + // with 10 plus that number, so that we can be sure + // that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Exec process should exit with code 10+130=140. + ps, err := cmd.Process.Wait() + if err != nil { + t.Fatalf("error waiting for exec process: %v", err) + } + ws := ps.Sys().(syscall.WaitStatus) + if !ws.Exited() { + t.Errorf("ws.Exited got false, want true") + } + if got, want := ws.ExitStatus(), 140; got != want { + t.Errorf("ws.ExitedStatus got %d, want %d", got, want) + } + }, + }, "sh"); err != nil { t.Fatalf("docker exec failed: %v", err) } - defer ptmx.Close() - - // Call "sleep 100 | cat" in the shell. We pipe to cat so that there - // will be two processes in the foreground process group. - if _, err := ptmx.Write([]byte("sleep 100 | cat\n")); err != nil { - t.Fatalf("error writing to pty: %v", err) - } - - // Give shell a few seconds to start executing the sleep. - time.Sleep(2 * time.Second) - - // Send a ^C to the pty, which should kill sleep and cat, but not the - // shell. \x03 is ASCII "end of text", which is the same as ^C. - if _, err := ptmx.Write([]byte{'\x03'}); err != nil { - t.Fatalf("error writing to pty: %v", err) - } - - // The shell should still be alive at this point. Sleep should have - // exited with code 2+128=130. We'll exit with 10 plus that number, so - // that we can be sure that the shell did not get signalled. - if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { - t.Fatalf("error writing to pty: %v", err) - } - - // Exec process should exit with code 10+130=140. - ps, err := cmd.Process.Wait() - if err != nil { - t.Fatalf("error waiting for exec process: %v", err) - } - ws := ps.Sys().(syscall.WaitStatus) - if !ws.Exited() { - t.Errorf("ws.Exited got false, want true") - } - if got, want := ws.ExitStatus(), 140; got != want { - t.Errorf("ws.ExitedStatus got %d, want %d", got, want) - } } // Test that failure to exec returns proper error message. func TestExecError(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-error-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container. - if err := d.Run("alpine", "sleep", "1000"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sleep", "1000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() - _, err := d.Exec("no_can_find") + // Attempt to exec a binary that doesn't exist. + out, err := d.Exec(dockerutil.RunOpts{}, "no_can_find") if err == nil { t.Fatalf("docker exec didn't fail") } - if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(err.Error(), want) { - t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", err.Error(), want) + if want := `error finding executable "no_can_find" in PATH`; !strings.Contains(out, want) { + t.Fatalf("docker exec wrong error, got: %s, want: .*%s.*", out, want) } } // Test that exec inherits environment from run. func TestExecEnv(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-env-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container with env FOO=BAR. - if err := d.Run("-e", "FOO=BAR", "alpine", "sleep", "1000"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + Env: []string{"FOO=BAR"}, + }, "sleep", "1000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Exec "echo $FOO". - got, err := d.Exec("/bin/sh", "-c", "echo $FOO") + got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $FOO") if err != nil { t.Fatalf("docker exec failed: %v", err) } @@ -216,17 +222,19 @@ func TestExecEnv(t *testing.T) { // TestRunEnvHasHome tests that run always has HOME environment set. func TestRunEnvHasHome(t *testing.T) { // Base alpine image does not have any environment variables set. - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("run-env-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Exec "echo $HOME". The 'bin' user's home dir is '/bin'. - got, err := d.RunFg("--user", "bin", "alpine", "/bin/sh", "-c", "echo $HOME") + got, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + User: "bin", + }, "/bin/sh", "-c", "echo $HOME") if err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + + // Check that the directory matches. if got, want := strings.TrimSpace(got), "/bin"; got != want { t.Errorf("bad output from 'docker run'. Got %q; Want %q.", got, want) } @@ -235,18 +243,17 @@ func TestRunEnvHasHome(t *testing.T) { // Test that exec always has HOME environment set, even when not set in run. func TestExecEnvHasHome(t *testing.T) { // Base alpine image does not have any environment variables set. - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("exec-env-home-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() - if err := d.Run("alpine", "sleep", "1000"); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sleep", "1000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Exec "echo $HOME", and expect to see "/root". - got, err := d.Exec("/bin/sh", "-c", "echo $HOME") + got, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "echo $HOME") if err != nil { t.Fatalf("docker exec failed: %v", err) } @@ -258,12 +265,14 @@ func TestExecEnvHasHome(t *testing.T) { newUID := 1234 newHome := "/foo/bar" cmd := fmt.Sprintf("mkdir -p -m 777 %q && adduser foo -D -u %d -h %q", newHome, newUID, newHome) - if _, err := d.Exec("/bin/sh", "-c", cmd); err != nil { + if _, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", cmd); err != nil { t.Fatalf("docker exec failed: %v", err) } // Execute the same as the new user and expect newHome. - got, err = d.ExecAsUser(strconv.Itoa(newUID), "/bin/sh", "-c", "echo $HOME") + got, err = d.Exec(dockerutil.RunOpts{ + User: strconv.Itoa(newUID), + }, "/bin/sh", "-c", "echo $HOME") if err != nil { t.Fatalf("docker exec failed: %v", err) } diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index cc4fbbaed..404e37689 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -27,14 +27,15 @@ import ( "net" "net/http" "os" + "os/exec" "strconv" "strings" "syscall" "testing" "time" - "gvisor.dev/gvisor/runsc/dockerutil" - "gvisor.dev/gvisor/runsc/testutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) // httpRequestSucceeds sends a request to a given url and checks that the status is OK. @@ -53,65 +54,66 @@ func httpRequestSucceeds(client http.Client, server string, port int) error { // TestLifeCycle tests a basic Create/Start/Stop docker container life cycle. func TestLifeCycle(t *testing.T) { - if err := dockerutil.Pull("nginx"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("lifecycle-test") - if err := d.Create("-p", "80", "nginx"); err != nil { - t.Fatal("docker create failed:", err) + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Start the container. + if err := d.Create(dockerutil.RunOpts{ + Image: "basic/nginx", + Ports: []int{80}, + }); err != nil { + t.Fatalf("docker create failed: %v", err) } if err := d.Start(); err != nil { - d.CleanUp() - t.Fatal("docker start failed:", err) + t.Fatalf("docker start failed: %v", err) } - // Test that container is working + // Test that container is working. port, err := d.FindPort(80) if err != nil { - t.Fatal("docker.FindPort(80) failed: ", err) + t.Fatalf("docker.FindPort(80) failed: %v", err) } if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { - t.Fatal("WaitForHTTP() timeout:", err) + t.Fatalf("WaitForHTTP() timeout: %v", err) } client := http.Client{Timeout: time.Duration(2 * time.Second)} if err := httpRequestSucceeds(client, "localhost", port); err != nil { - t.Error("http request failed:", err) + t.Errorf("http request failed: %v", err) } if err := d.Stop(); err != nil { - d.CleanUp() - t.Fatal("docker stop failed:", err) + t.Fatalf("docker stop failed: %v", err) } if err := d.Remove(); err != nil { - t.Fatal("docker rm failed:", err) + t.Fatalf("docker rm failed: %v", err) } } func TestPauseResume(t *testing.T) { - const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsCheckpointSupported() { - t.Log("Checkpoint is not supported, skipping test.") - return + t.Skip("Checkpoint is not supported.") } - if err := dockerutil.Pull(img); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("pause-resume-test") - if err := d.Run("-p", "8080", img); err != nil { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Start the container. + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/python", + Ports: []int{8080}, // See Dockerfile. + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Find where port 8080 is mapped to. port, err := d.FindPort(8080) if err != nil { - t.Fatal("docker.FindPort(8080) failed:", err) + t.Fatalf("docker.FindPort(8080) failed: %v", err) } // Wait until it's up and running. if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { - t.Fatal("WaitForHTTP() timeout:", err) + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check that container is working. @@ -121,7 +123,7 @@ func TestPauseResume(t *testing.T) { } if err := d.Pause(); err != nil { - t.Fatal("docker pause failed:", err) + t.Fatalf("docker pause failed: %v", err) } // Check if container is paused. @@ -137,12 +139,12 @@ func TestPauseResume(t *testing.T) { } if err := d.Unpause(); err != nil { - t.Fatal("docker unpause failed:", err) + t.Fatalf("docker unpause failed: %v", err) } // Wait until it's up and running. if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { - t.Fatal("WaitForHTTP() timeout:", err) + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. @@ -152,43 +154,43 @@ func TestPauseResume(t *testing.T) { } func TestCheckpointRestore(t *testing.T) { - const img = "gcr.io/gvisor-presubmit/python-hello" if !testutil.IsCheckpointSupported() { - t.Log("Pause/resume is not supported, skipping test.") - return + t.Skip("Pause/resume is not supported.") } - if err := dockerutil.Pull(img); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("save-restore-test") - if err := d.Run("-p", "8080", img); err != nil { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Start the container. + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/python", + Ports: []int{8080}, // See Dockerfile. + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + // Create a snapshot. if err := d.Checkpoint("test"); err != nil { - t.Fatal("docker checkpoint failed:", err) + t.Fatalf("docker checkpoint failed: %v", err) } - if _, err := d.Wait(30 * time.Second); err != nil { - t.Fatal(err) + t.Fatalf("wait failed: %v", err) } // TODO(b/143498576): Remove Poll after github.com/moby/moby/issues/38963 is fixed. if err := testutil.Poll(func() error { return d.Restore("test") }, 15*time.Second); err != nil { - t.Fatal("docker restore failed:", err) + t.Fatalf("docker restore failed: %v", err) } // Find where port 8080 is mapped to. port, err := d.FindPort(8080) if err != nil { - t.Fatal("docker.FindPort(8080) failed:", err) + t.Fatalf("docker.FindPort(8080) failed: %v", err) } // Wait until it's up and running. if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { - t.Fatal("WaitForHTTP() timeout:", err) + t.Fatalf("WaitForHTTP() timeout: %v", err) } // Check if container is working again. @@ -200,26 +202,28 @@ func TestCheckpointRestore(t *testing.T) { // Create client and server that talk to each other using the local IP. func TestConnectToSelf(t *testing.T) { - d := dockerutil.MakeDocker("connect-to-self-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Creates server that replies "server" and exists. Sleeps at the end because // 'docker exec' gets killed if the init process exists before it can finish. - if err := d.Run("ubuntu:trusty", "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil { - t.Fatal("docker run failed:", err) + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/ubuntu", + }, "/bin/sh", "-c", "echo server | nc -l -p 8080 && sleep 1"); err != nil { + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Finds IP address for host. - ip, err := d.Exec("/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'") + ip, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", "cat /etc/hosts | grep ${HOSTNAME} | awk '{print $1}'") if err != nil { - t.Fatal("docker exec failed:", err) + t.Fatalf("docker exec failed: %v", err) } ip = strings.TrimRight(ip, "\n") // Runs client that sends "client" to the server and exits. - reply, err := d.Exec("/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip)) + reply, err := d.Exec(dockerutil.RunOpts{}, "/bin/sh", "-c", fmt.Sprintf("echo client | nc %s 8080", ip)) if err != nil { - t.Fatal("docker exec failed:", err) + t.Fatalf("docker exec failed: %v", err) } // Ensure both client and server got the message from each other. @@ -227,21 +231,22 @@ func TestConnectToSelf(t *testing.T) { t.Errorf("Error on server, want: %q, got: %q", want, reply) } if _, err := d.WaitForOutput("^client\n$", 1*time.Second); err != nil { - t.Fatal("docker.WaitForOutput(client) timeout:", err) + t.Fatalf("docker.WaitForOutput(client) timeout: %v", err) } } func TestMemLimit(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("cgroup-test") - cmd := "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'" - out, err := d.RunFg("--memory=500MB", "alpine", "sh", "-c", cmd) + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + allocMemory := 500 * 1024 + out, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + Memory: allocMemory, // In kB. + }, "sh", "-c", "cat /proc/meminfo | grep MemTotal: | awk '{print $2}'") if err != nil { - t.Fatal("docker run failed:", err) + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Remove warning message that swap isn't present. if strings.HasPrefix(out, "WARNING") { @@ -252,27 +257,30 @@ func TestMemLimit(t *testing.T) { out = lines[1] } + // Ensure the memory matches what we want. got, err := strconv.ParseUint(strings.TrimSpace(out), 10, 64) if err != nil { t.Fatalf("failed to parse %q: %v", out, err) } - if want := uint64(500 * 1024); got != want { + if want := uint64(allocMemory); got != want { t.Errorf("MemTotal got: %d, want: %d", got, want) } } func TestNumCPU(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("cgroup-test") - cmd := "cat /proc/cpuinfo | grep 'processor.*:' | wc -l" - out, err := d.RunFg("--cpuset-cpus=0", "alpine", "sh", "-c", cmd) + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Read how many cores are in the container. + out, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + Extra: []string{"--cpuset-cpus=0"}, + }, "sh", "-c", "cat /proc/cpuinfo | grep 'processor.*:' | wc -l") if err != nil { - t.Fatal("docker run failed:", err) + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + // Ensure it matches what we want. got, err := strconv.Atoi(strings.TrimSpace(out)) if err != nil { t.Fatalf("failed to parse %q: %v", out, err) @@ -284,39 +292,39 @@ func TestNumCPU(t *testing.T) { // TestJobControl tests that job control characters are handled properly. func TestJobControl(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("job-control-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container with an attached PTY. - _, ptmx, err := d.RunWithPty("alpine", "sh") - if err != nil { + if _, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + Pty: func(_ *exec.Cmd, ptmx *os.File) { + // Call "sleep 100" in the shell. + if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // Give shell a few seconds to start executing the sleep. + time.Sleep(2 * time.Second) + + // Send a ^C to the pty, which should kill sleep, but + // not the shell. \x03 is ASCII "end of text", which + // is the same as ^C. + if _, err := ptmx.Write([]byte{'\x03'}); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + + // The shell should still be alive at this point. Sleep + // should have exited with code 2+128=130. We'll exit + // with 10 plus that number, so that we can be sure + // that the shell did not get signalled. + if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { + t.Fatalf("error writing to pty: %v", err) + } + }, + }, "sh"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer ptmx.Close() - defer d.CleanUp() - - // Call "sleep 100" in the shell. - if _, err := ptmx.Write([]byte("sleep 100\n")); err != nil { - t.Fatalf("error writing to pty: %v", err) - } - - // Give shell a few seconds to start executing the sleep. - time.Sleep(2 * time.Second) - - // Send a ^C to the pty, which should kill sleep, but not the shell. - // \x03 is ASCII "end of text", which is the same as ^C. - if _, err := ptmx.Write([]byte{'\x03'}); err != nil { - t.Fatalf("error writing to pty: %v", err) - } - - // The shell should still be alive at this point. Sleep should have - // exited with code 2+128=130. We'll exit with 10 plus that number, so - // that we can be sure that the shell did not get signalled. - if _, err := ptmx.Write([]byte("exit $(expr $? + 10)\n")); err != nil { - t.Fatalf("error writing to pty: %v", err) - } // Wait for the container to exit. got, err := d.Wait(5 * time.Second) @@ -332,14 +340,25 @@ func TestJobControl(t *testing.T) { // TestTmpFile checks that files inside '/tmp' are not overridden. In addition, // it checks that working dir is created if it doesn't exit. func TestTmpFile(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Should work without ReadOnly + if _, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + WorkDir: "/tmp/foo/bar", + }, "touch", "/tmp/foo/bar/file"); err != nil { + t.Fatalf("docker run failed: %v", err) } - d := dockerutil.MakeDocker("tmp-file-test") - if err := d.Run("-w=/tmp/foo/bar", "--read-only", "alpine", "touch", "/tmp/foo/bar/file"); err != nil { - t.Fatal("docker run failed:", err) + + // Expect failure. + if _, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + WorkDir: "/tmp/foo/bar", + ReadOnly: true, + }, "touch", "/tmp/foo/bar/file"); err == nil { + t.Fatalf("docker run expected failure, but succeeded") } - defer d.CleanUp() } func TestMain(m *testing.M) { diff --git a/test/e2e/regression_test.go b/test/e2e/regression_test.go index 2488be383..327a2174c 100644 --- a/test/e2e/regression_test.go +++ b/test/e2e/regression_test.go @@ -18,7 +18,7 @@ import ( "strings" "testing" - "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" ) // Test that UDS can be created using overlay when parent directory is in lower @@ -27,19 +27,19 @@ import ( // Prerequisite: the directory where the socket file is created must not have // been open for write before bind(2) is called. func TestBindOverlay(t *testing.T) { - if err := dockerutil.Pull("ubuntu:trusty"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("bind-overlay-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() - cmd := "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p" - got, err := d.RunFg("ubuntu:trusty", "bash", "-c", cmd) + // Run the container. + got, err := d.Run(dockerutil.RunOpts{ + Image: "basic/ubuntu", + }, "bash", "-c", "nc -l -U /var/run/sock & p=$! && sleep 1 && echo foobar-asdf | nc -U /var/run/sock && wait $p") if err != nil { - t.Fatal("docker run failed:", err) + t.Fatalf("docker run failed: %v", err) } + // Check the output contains what we want. if want := "foobar-asdf"; !strings.Contains(got, want) { t.Fatalf("docker run output is missing %q: %s", want, got) } - defer d.CleanUp() } diff --git a/test/image/BUILD b/test/image/BUILD index 7392ac54e..e749e47d4 100644 --- a/test/image/BUILD +++ b/test/image/BUILD @@ -22,8 +22,8 @@ go_test( ], visibility = ["//:sandbox"], deps = [ - "//runsc/dockerutil", - "//runsc/testutil", + "//pkg/test/dockerutil", + "//pkg/test/testutil", ], ) diff --git a/test/image/image_test.go b/test/image/image_test.go index 0a1e19d6f..2e3543109 100644 --- a/test/image/image_test.go +++ b/test/image/image_test.go @@ -28,24 +28,29 @@ import ( "log" "net/http" "os" - "path/filepath" "strings" "testing" "time" - "gvisor.dev/gvisor/runsc/dockerutil" - "gvisor.dev/gvisor/runsc/testutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) func TestHelloWorld(t *testing.T) { - d := dockerutil.MakeDocker("hello-test") - if err := d.Run("hello-world"); err != nil { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + // Run the basic container. + out, err := d.Run(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "echo", "Hello world!") + if err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() - if _, err := d.WaitForOutput("Hello from Docker!", 5*time.Second); err != nil { - t.Fatalf("docker didn't say hello: %v", err) + // Check the output. + if !strings.Contains(out, "Hello world!") { + t.Fatalf("docker didn't say hello: got %s", out) } } @@ -102,27 +107,22 @@ func testHTTPServer(t *testing.T, port int) { } func TestHttpd(t *testing.T) { - if err := dockerutil.Pull("httpd"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("http-test") - - dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") - if err != nil { - t.Fatalf("PrepareFiles() failed: %v", err) - } + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container. - mountArg := dockerutil.MountArg(dir, "/usr/local/apache2/htdocs", dockerutil.ReadOnly) - if err := d.Run("-p", "80", mountArg, "httpd"); err != nil { + d.CopyFiles("/usr/local/apache2/htdocs", "test/image/latin10k.txt") + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/httpd", + Ports: []int{80}, + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Find where port 80 is mapped to. port, err := d.FindPort(80) if err != nil { - t.Fatalf("docker.FindPort(80) failed: %v", err) + t.Fatalf("FindPort(80) failed: %v", err) } // Wait until it's up and running. @@ -134,27 +134,22 @@ func TestHttpd(t *testing.T) { } func TestNginx(t *testing.T) { - if err := dockerutil.Pull("nginx"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("net-test") - - dir, err := dockerutil.PrepareFiles("test/image/latin10k.txt") - if err != nil { - t.Fatalf("PrepareFiles() failed: %v", err) - } + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start the container. - mountArg := dockerutil.MountArg(dir, "/usr/share/nginx/html", dockerutil.ReadOnly) - if err := d.Run("-p", "80", mountArg, "nginx"); err != nil { + d.CopyFiles("/usr/share/nginx/html", "test/image/latin10k.txt") + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/nginx", + Ports: []int{80}, + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Find where port 80 is mapped to. port, err := d.FindPort(80) if err != nil { - t.Fatalf("docker.FindPort(80) failed: %v", err) + t.Fatalf("FindPort(80) failed: %v", err) } // Wait until it's up and running. @@ -166,99 +161,58 @@ func TestNginx(t *testing.T) { } func TestMysql(t *testing.T) { - if err := dockerutil.Pull("mysql"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("mysql-test") + server := dockerutil.MakeDocker(t) + defer server.CleanUp() // Start the container. - if err := d.Run("-e", "MYSQL_ROOT_PASSWORD=foobar123", "mysql"); err != nil { + if err := server.Spawn(dockerutil.RunOpts{ + Image: "basic/mysql", + Env: []string{"MYSQL_ROOT_PASSWORD=foobar123"}, + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Wait until it's up and running. - if _, err := d.WaitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { - t.Fatalf("docker.WaitForOutput() timeout: %v", err) + if _, err := server.WaitForOutput("port: 3306 MySQL Community Server", 3*time.Minute); err != nil { + t.Fatalf("WaitForOutput() timeout: %v", err) } - client := dockerutil.MakeDocker("mysql-client-test") - dir, err := dockerutil.PrepareFiles("test/image/mysql.sql") - if err != nil { - t.Fatalf("PrepareFiles() failed: %v", err) - } + // Generate the client and copy in the SQL payload. + client := dockerutil.MakeDocker(t) + defer client.CleanUp() - // Tell mysql client to connect to the server and execute the file in verbose - // mode to verify the output. - args := []string{ - dockerutil.LinkArg(&d, "mysql"), - dockerutil.MountArg(dir, "/sql", dockerutil.ReadWrite), - "mysql", - "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql", - } - if err := client.Run(args...); err != nil { + // Tell mysql client to connect to the server and execute the file in + // verbose mode to verify the output. + client.CopyFiles("/sql", "test/image/mysql.sql") + client.Link("mysql", server) + if _, err := client.Run(dockerutil.RunOpts{ + Image: "basic/mysql", + }, "mysql", "-hmysql", "-uroot", "-pfoobar123", "-v", "-e", "source /sql/mysql.sql"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer client.CleanUp() // Ensure file executed to the end and shutdown mysql. - if _, err := client.WaitForOutput("--------------\nshutdown\n--------------", 15*time.Second); err != nil { - t.Fatalf("docker.WaitForOutput() timeout: %v", err) - } - if _, err := d.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { - t.Fatalf("docker.WaitForOutput() timeout: %v", err) + if _, err := server.WaitForOutput("mysqld: Shutdown complete", 30*time.Second); err != nil { + t.Fatalf("WaitForOutput() timeout: %v", err) } } -func TestPythonHello(t *testing.T) { - // TODO(b/136503277): Once we have more complete python runtime tests, - // we can drop this one. - const img = "gcr.io/gvisor-presubmit/python-hello" - if err := dockerutil.Pull(img); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("python-hello-test") - if err := d.Run("-p", "8080", img); err != nil { - t.Fatalf("docker run failed: %v", err) - } +func TestTomcat(t *testing.T) { + d := dockerutil.MakeDocker(t) defer d.CleanUp() - // Find where port 8080 is mapped to. - port, err := d.FindPort(8080) - if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) - } - - // Wait until it's up and running. - if err := testutil.WaitForHTTP(port, 30*time.Second); err != nil { - t.Fatalf("WaitForHTTP() timeout: %v", err) - } - - // Ensure that content is being served. - url := fmt.Sprintf("http://localhost:%d", port) - resp, err := http.Get(url) - if err != nil { - t.Errorf("Error reaching http server: %v", err) - } - if want := http.StatusOK; resp.StatusCode != want { - t.Errorf("Wrong response code, got: %d, want: %d", resp.StatusCode, want) - } -} - -func TestTomcat(t *testing.T) { - if err := dockerutil.Pull("tomcat:8.0"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("tomcat-test") - if err := d.Run("-p", "8080", "tomcat:8.0"); err != nil { + // Start the server. + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/tomcat", + Ports: []int{8080}, + }); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Find where port 8080 is mapped to. port, err := d.FindPort(8080) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("FindPort(8080) failed: %v", err) } // Wait until it's up and running. @@ -278,28 +232,22 @@ func TestTomcat(t *testing.T) { } func TestRuby(t *testing.T) { - if err := dockerutil.Pull("ruby"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("ruby-test") - - dir, err := dockerutil.PrepareFiles("test/image/ruby.rb", "test/image/ruby.sh") - if err != nil { - t.Fatalf("PrepareFiles() failed: %v", err) - } - if err := os.Chmod(filepath.Join(dir, "ruby.sh"), 0333); err != nil { - t.Fatalf("os.Chmod(%q, 0333) failed: %v", dir, err) - } + d := dockerutil.MakeDocker(t) + defer d.CleanUp() - if err := d.Run("-p", "8080", dockerutil.MountArg(dir, "/src", dockerutil.ReadOnly), "ruby", "/src/ruby.sh"); err != nil { + // Execute the ruby workload. + d.CopyFiles("/src", "test/image/ruby.rb", "test/image/ruby.sh") + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/ruby", + Ports: []int{8080}, + }, "/src/ruby.sh"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // Find where port 8080 is mapped to. port, err := d.FindPort(8080) if err != nil { - t.Fatalf("docker.FindPort(8080) failed: %v", err) + t.Fatalf("FindPort(8080) failed: %v", err) } // Wait until it's up and running, 'gem install' can take some time. @@ -326,18 +274,17 @@ func TestRuby(t *testing.T) { } func TestStdio(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatalf("docker pull failed: %v", err) - } - d := dockerutil.MakeDocker("stdio-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() wantStdout := "hello stdout" wantStderr := "bonjour stderr" cmd := fmt.Sprintf("echo %q; echo %q 1>&2;", wantStdout, wantStderr) - if err := d.Run("alpine", "/bin/sh", "-c", cmd); err != nil { + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "/bin/sh", "-c", cmd); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() for _, want := range []string{wantStdout, wantStderr} { if _, err := d.WaitForOutput(want, 5*time.Second); err != nil { diff --git a/test/image/ruby.sh b/test/image/ruby.sh old mode 100644 new mode 100755 diff --git a/test/iptables/BUILD b/test/iptables/BUILD index 6bb3b82b5..3e29ca90d 100644 --- a/test/iptables/BUILD +++ b/test/iptables/BUILD @@ -14,7 +14,7 @@ go_library( ], visibility = ["//test/iptables:__subpackages__"], deps = [ - "//runsc/testutil", + "//pkg/test/testutil", ], ) @@ -23,14 +23,14 @@ go_test( srcs = [ "iptables_test.go", ], + data = ["//test/iptables/runner"], library = ":iptables", tags = [ "local", "manual", ], deps = [ - "//pkg/log", - "//runsc/dockerutil", - "//runsc/testutil", + "//pkg/test/dockerutil", + "//pkg/test/testutil", ], ) diff --git a/test/iptables/README.md b/test/iptables/README.md index cc8a2fcac..b9f44bd40 100644 --- a/test/iptables/README.md +++ b/test/iptables/README.md @@ -38,7 +38,7 @@ Build the testing Docker container. Re-run this when you modify the test code in this directory: ```bash -$ bazel run //test/iptables/runner:runner-image -- --norun +$ make load-iptables ``` Run an individual test via: diff --git a/test/iptables/iptables.go b/test/iptables/iptables.go index 2e565d988..16cb4f4da 100644 --- a/test/iptables/iptables.go +++ b/test/iptables/iptables.go @@ -18,12 +18,19 @@ package iptables import ( "fmt" "net" + "time" ) // IPExchangePort is the port the container listens on to receive the IP // address of the local process. const IPExchangePort = 2349 +// TerminalStatement is the last statement in the test runner. +const TerminalStatement = "Finished!" + +// TestTimeout is the timeout used for all tests. +const TestTimeout = 10 * time.Minute + // A TestCase contains one action to run in the container and one to run // locally. The actions run concurrently and each must succeed for the test // pass. diff --git a/test/iptables/iptables_test.go b/test/iptables/iptables_test.go index 493d69052..334d8e676 100644 --- a/test/iptables/iptables_test.go +++ b/test/iptables/iptables_test.go @@ -15,28 +15,14 @@ package iptables import ( - "flag" "fmt" "net" - "os" - "path" "testing" - "time" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/runsc/dockerutil" - "gvisor.dev/gvisor/runsc/testutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) -const timeout = 18 * time.Second - -var image = flag.String("image", "bazel/test/iptables/runner:runner-image", "image to run tests in") - -type result struct { - output string - err error -} - // singleTest runs a TestCase. Each test follows a pattern: // - Create a container. // - Get the container's IP. @@ -46,77 +32,45 @@ type result struct { // // Container output is logged to $TEST_UNDECLARED_OUTPUTS_DIR if it exists, or // to stderr. -func singleTest(test TestCase) error { +func singleTest(t *testing.T, test TestCase) { if _, ok := Tests[test.Name()]; !ok { - return fmt.Errorf("no test found with name %q. Has it been registered?", test.Name()) + t.Fatalf("no test found with name %q. Has it been registered?", test.Name()) } + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + // Create and start the container. - cont := dockerutil.MakeDocker("gvisor-iptables") - defer cont.CleanUp() - resultChan := make(chan *result) - go func() { - output, err := cont.RunFg("--cap-add=NET_ADMIN", *image, "-name", test.Name()) - logContainer(output, err) - resultChan <- &result{output, err} - }() + d.CopyFiles("/runner", "test/iptables/runner/runner") + if err := d.Spawn(dockerutil.RunOpts{ + Image: "iptables", + CapAdd: []string{"NET_ADMIN"}, + }, "/runner/runner", "-name", test.Name()); err != nil { + t.Fatalf("docker run failed: %v", err) + } // Get the container IP. - ip, err := getIP(cont) + ip, err := d.FindIP() if err != nil { - return fmt.Errorf("failed to get container IP: %v", err) + t.Fatalf("failed to get container IP: %v", err) } // Give the container our IP. if err := sendIP(ip); err != nil { - return fmt.Errorf("failed to send IP to container: %v", err) + t.Fatalf("failed to send IP to container: %v", err) } // Run our side of the test. - errChan := make(chan error) - go func() { - errChan <- test.LocalAction(ip) - }() - - // Wait for both the container and local tests to finish. - var res *result - to := time.After(timeout) - for localDone := false; res == nil || !localDone; { - select { - case res = <-resultChan: - log.Infof("Container finished.") - case err, localDone = <-errChan: - log.Infof("Local finished.") - if err != nil { - return fmt.Errorf("local test failed: %v", err) - } - case <-to: - return fmt.Errorf("timed out after %f seconds", timeout.Seconds()) - } + if err := test.LocalAction(ip); err != nil { + t.Fatalf("LocalAction failed: %v", err) } - return res.err -} - -func getIP(cont dockerutil.Docker) (net.IP, error) { - // The container might not have started yet, so retry a few times. - var ipStr string - to := time.After(timeout) - for ipStr == "" { - ipStr, _ = cont.FindIP() - select { - case <-to: - return net.IP{}, fmt.Errorf("timed out getting IP after %f seconds", timeout.Seconds()) - default: - time.Sleep(250 * time.Millisecond) - } - } - ip := net.ParseIP(ipStr) - if ip == nil { - return net.IP{}, fmt.Errorf("invalid IP: %q", ipStr) + // Wait for the final statement. This structure has the side effect + // that all container logs will appear within the individual test + // context. + if _, err := d.WaitForOutput(TerminalStatement, TestTimeout); err != nil { + t.Fatalf("test failed: %v", err) } - log.Infof("Container has IP of %s", ipStr) - return ip, nil } func sendIP(ip net.IP) error { @@ -132,7 +86,7 @@ func sendIP(ip net.IP) error { conn = c return err } - if err := testutil.Poll(cb, timeout); err != nil { + if err := testutil.Poll(cb, TestTimeout); err != nil { return fmt.Errorf("timed out waiting to send IP, most recent error: %v", err) } if _, err := conn.Write([]byte{0}); err != nil { @@ -141,281 +95,184 @@ func sendIP(ip net.IP) error { return nil } -func logContainer(output string, err error) { - msg := fmt.Sprintf("Container error: %v\nContainer output:\n%v", err, output) - if artifactsDir := os.Getenv("TEST_UNDECLARED_OUTPUTS_DIR"); artifactsDir != "" { - fpath := path.Join(artifactsDir, "container.log") - if file, err := os.OpenFile(fpath, os.O_WRONLY|os.O_CREATE, 0644); err != nil { - log.Warningf("Failed to open log file %q: %v", fpath, err) - } else { - defer file.Close() - if _, err := file.Write([]byte(msg)); err == nil { - return - } - log.Warningf("Failed to write to log file %s: %v", fpath, err) - } - } - - // We couldn't write to the output directory -- just log to stderr. - log.Infof(msg) -} - func TestFilterInputDropUDP(t *testing.T) { - if err := singleTest(FilterInputDropUDP{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropUDP{}) } func TestFilterInputDropUDPPort(t *testing.T) { - if err := singleTest(FilterInputDropUDPPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropUDPPort{}) } func TestFilterInputDropDifferentUDPPort(t *testing.T) { - if err := singleTest(FilterInputDropDifferentUDPPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropDifferentUDPPort{}) } func TestFilterInputDropAll(t *testing.T) { - if err := singleTest(FilterInputDropAll{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropAll{}) } func TestFilterInputDropOnlyUDP(t *testing.T) { - if err := singleTest(FilterInputDropOnlyUDP{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropOnlyUDP{}) } func TestNATRedirectUDPPort(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATRedirectUDPPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATRedirectUDPPort{}) } func TestNATRedirectTCPPort(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATRedirectTCPPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATRedirectTCPPort{}) } func TestNATDropUDP(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATDropUDP{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATDropUDP{}) } func TestNATAcceptAll(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATAcceptAll{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATAcceptAll{}) } func TestFilterInputDropTCPDestPort(t *testing.T) { - if err := singleTest(FilterInputDropTCPDestPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropTCPDestPort{}) } func TestFilterInputDropTCPSrcPort(t *testing.T) { - if err := singleTest(FilterInputDropTCPSrcPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDropTCPSrcPort{}) } func TestFilterInputCreateUserChain(t *testing.T) { - if err := singleTest(FilterInputCreateUserChain{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputCreateUserChain{}) } func TestFilterInputDefaultPolicyAccept(t *testing.T) { - if err := singleTest(FilterInputDefaultPolicyAccept{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDefaultPolicyAccept{}) } func TestFilterInputDefaultPolicyDrop(t *testing.T) { - if err := singleTest(FilterInputDefaultPolicyDrop{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDefaultPolicyDrop{}) } func TestFilterInputReturnUnderflow(t *testing.T) { - if err := singleTest(FilterInputReturnUnderflow{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputReturnUnderflow{}) } func TestFilterOutputDropTCPDestPort(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(FilterOutputDropTCPDestPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputDropTCPDestPort{}) } func TestFilterOutputDropTCPSrcPort(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("filter OUTPUT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(FilterOutputDropTCPSrcPort{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputDropTCPSrcPort{}) } func TestFilterOutputAcceptTCPOwner(t *testing.T) { - if err := singleTest(FilterOutputAcceptTCPOwner{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputAcceptTCPOwner{}) } func TestFilterOutputDropTCPOwner(t *testing.T) { - if err := singleTest(FilterOutputDropTCPOwner{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputDropTCPOwner{}) } func TestFilterOutputAcceptUDPOwner(t *testing.T) { - if err := singleTest(FilterOutputAcceptUDPOwner{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputAcceptUDPOwner{}) } func TestFilterOutputDropUDPOwner(t *testing.T) { - if err := singleTest(FilterOutputDropUDPOwner{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputDropUDPOwner{}) } func TestFilterOutputOwnerFail(t *testing.T) { - if err := singleTest(FilterOutputOwnerFail{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputOwnerFail{}) } func TestJumpSerialize(t *testing.T) { - if err := singleTest(FilterInputSerializeJump{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputSerializeJump{}) } func TestJumpBasic(t *testing.T) { - if err := singleTest(FilterInputJumpBasic{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputJumpBasic{}) } func TestJumpReturn(t *testing.T) { - if err := singleTest(FilterInputJumpReturn{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputJumpReturn{}) } func TestJumpReturnDrop(t *testing.T) { - if err := singleTest(FilterInputJumpReturnDrop{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputJumpReturnDrop{}) } func TestJumpBuiltin(t *testing.T) { - if err := singleTest(FilterInputJumpBuiltin{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputJumpBuiltin{}) } func TestJumpTwice(t *testing.T) { - if err := singleTest(FilterInputJumpTwice{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputJumpTwice{}) } func TestInputDestination(t *testing.T) { - if err := singleTest(FilterInputDestination{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputDestination{}) } func TestInputInvertDestination(t *testing.T) { - if err := singleTest(FilterInputInvertDestination{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterInputInvertDestination{}) } func TestOutputDestination(t *testing.T) { - if err := singleTest(FilterOutputDestination{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputDestination{}) } func TestOutputInvertDestination(t *testing.T) { - if err := singleTest(FilterOutputInvertDestination{}); err != nil { - t.Fatal(err) - } + singleTest(t, FilterOutputInvertDestination{}) } func TestNATOutRedirectIP(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATOutRedirectIP{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATOutRedirectIP{}) } func TestNATOutDontRedirectIP(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATOutDontRedirectIP{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATOutDontRedirectIP{}) } func TestNATOutRedirectInvert(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATOutRedirectInvert{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATOutRedirectInvert{}) } func TestNATPreRedirectIP(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATPreRedirectIP{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATPreRedirectIP{}) } func TestNATPreDontRedirectIP(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATPreDontRedirectIP{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATPreDontRedirectIP{}) } func TestNATPreRedirectInvert(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATPreRedirectInvert{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATPreRedirectInvert{}) } func TestNATRedirectRequiresProtocol(t *testing.T) { // TODO(gvisor.dev/issue/170): Enable when supported. t.Skip("NAT isn't supported yet (gvisor.dev/issue/170).") - if err := singleTest(NATRedirectRequiresProtocol{}); err != nil { - t.Fatal(err) - } + singleTest(t, NATRedirectRequiresProtocol{}) } diff --git a/test/iptables/iptables_util.go b/test/iptables/iptables_util.go index 134391e8d..2a00677be 100644 --- a/test/iptables/iptables_util.go +++ b/test/iptables/iptables_util.go @@ -20,7 +20,7 @@ import ( "os/exec" "time" - "gvisor.dev/gvisor/runsc/testutil" + "gvisor.dev/gvisor/pkg/test/testutil" ) const iptablesBinary = "iptables" diff --git a/test/iptables/runner/BUILD b/test/iptables/runner/BUILD index b9199387a..24504a1b9 100644 --- a/test/iptables/runner/BUILD +++ b/test/iptables/runner/BUILD @@ -1,4 +1,4 @@ -load("//tools:defs.bzl", "container_image", "go_binary", "go_image") +load("//tools:defs.bzl", "go_binary") package(licenses = ["notice"]) @@ -6,18 +6,7 @@ go_binary( name = "runner", testonly = 1, srcs = ["main.go"], - deps = ["//test/iptables"], -) - -container_image( - name = "iptables-base", - base = "@iptables-test//image", -) - -go_image( - name = "runner-image", - testonly = 1, - srcs = ["main.go"], - base = ":iptables-base", + pure = True, + visibility = ["//test/iptables:__subpackages__"], deps = ["//test/iptables"], ) diff --git a/test/iptables/runner/main.go b/test/iptables/runner/main.go index 3c794114e..6f77c0684 100644 --- a/test/iptables/runner/main.go +++ b/test/iptables/runner/main.go @@ -46,6 +46,9 @@ func main() { if err := test.ContainerAction(ip); err != nil { log.Fatalf("Failed running test %q: %v", *name, err) } + + // Emit the final line. + log.Printf("%s", iptables.TerminalStatement) } // getIP listens for a connection from the local process and returns the source diff --git a/test/packetdrill/packetdrill_test.sh b/test/packetdrill/packetdrill_test.sh index c8268170f..922547d65 100755 --- a/test/packetdrill/packetdrill_test.sh +++ b/test/packetdrill/packetdrill_test.sh @@ -85,23 +85,26 @@ if [[ ! -x "${INIT_SCRIPT-}" ]]; then exit 2 fi +function new_net_prefix() { + # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. + echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)" +} + # Variables specific to the control network and interface start with CTRL_. # Variables specific to the test network and interface start with TEST_. # Variables specific to the DUT start with DUT_. # Variables specific to the test runner start with TEST_RUNNER_. declare -r PACKETDRILL="/packetdrill/gtests/net/packetdrill/packetdrill" # Use random numbers so that test networks don't collide. -declare -r CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)" -declare -r TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)" +declare CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)" +declare CTRL_NET_PREFIX=$(new_net_prefix) +declare TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)" +declare TEST_NET_PREFIX=$(new_net_prefix) declare -r tolerance_usecs=100000 # On both DUT and test runner, testing packets are on the eth2 interface. declare -r TEST_DEVICE="eth2" # Number of bits in the *_NET_PREFIX variables. declare -r NET_MASK="24" -function new_net_prefix() { - # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. - echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)" -} # Last bits of the DUT's IP address. declare -r DUT_NET_SUFFIX=".10" # Control port. @@ -137,23 +140,21 @@ function finish { trap finish EXIT # Subnet for control packets between test runner and DUT. -declare CTRL_NET_PREFIX=$(new_net_prefix) while ! docker network create \ "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do sleep 0.1 - declare CTRL_NET_PREFIX=$(new_net_prefix) + CTRL_NET_PREFIX=$(new_net_prefix) + CTRL_NET="ctrl_net-$(shuf -i 0-99999999 -n 1)" done # Subnet for the packets that are part of the test. -declare TEST_NET_PREFIX=$(new_net_prefix) while ! docker network create \ "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do sleep 0.1 - declare TEST_NET_PREFIX=$(new_net_prefix) + TEST_NET_PREFIX=$(new_net_prefix) + TEST_NET="test_net-$(shuf -i 0-99999999 -n 1)" done -docker pull "${IMAGE_TAG}" - # Create the DUT container and connect to network. DUT=$(docker create ${RUNTIME_ARG} --privileged --rm \ --stop-timeout ${TIMEOUT} -it ${IMAGE_TAG}) diff --git a/test/packetimpact/testbench/dut.go b/test/packetimpact/testbench/dut.go index 9335909c0..3f340c6bc 100644 --- a/test/packetimpact/testbench/dut.go +++ b/test/packetimpact/testbench/dut.go @@ -132,7 +132,7 @@ func (dut *DUT) CreateBoundSocket(typ, proto int32, addr net.IP) (int32, uint16) copy(sa.Addr[:], addr.To16()) dut.Bind(fd, &sa) } else { - dut.t.Fatal("unknown ip addr type for remoteIP") + dut.t.Fatalf("unknown ip addr type for remoteIP") } sa := dut.GetSockName(fd) var port int diff --git a/test/packetimpact/tests/test_runner.sh b/test/packetimpact/tests/test_runner.sh index 2be3c17c3..46d63d5e5 100755 --- a/test/packetimpact/tests/test_runner.sh +++ b/test/packetimpact/tests/test_runner.sh @@ -107,21 +107,24 @@ if [[ ! -f "${TESTBENCH_BINARY-}" ]]; then exit 2 fi +function new_net_prefix() { + # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. + echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)" +} + # Variables specific to the control network and interface start with CTRL_. # Variables specific to the test network and interface start with TEST_. # Variables specific to the DUT start with DUT_. # Variables specific to the test bench start with TESTBENCH_. # Use random numbers so that test networks don't collide. -declare -r CTRL_NET="ctrl_net-${RANDOM}${RANDOM}" -declare -r TEST_NET="test_net-${RANDOM}${RANDOM}" +declare CTRL_NET="ctrl_net-${RANDOM}${RANDOM}" +declare CTRL_NET_PREFIX=$(new_net_prefix) +declare TEST_NET="test_net-${RANDOM}${RANDOM}" +declare TEST_NET_PREFIX=$(new_net_prefix) # On both DUT and test bench, testing packets are on the eth2 interface. declare -r TEST_DEVICE="eth2" # Number of bits in the *_NET_PREFIX variables. declare -r NET_MASK="24" -function new_net_prefix() { - # Class C, 192.0.0.0 to 223.255.255.255, transitionally has mask 24. - echo "$(shuf -i 192-223 -n 1).$(shuf -i 0-255 -n 1).$(shuf -i 0-255 -n 1)" -} # Last bits of the DUT's IP address. declare -r DUT_NET_SUFFIX=".10" # Control port. @@ -130,6 +133,7 @@ declare -r CTRL_PORT="40000" declare -r TESTBENCH_NET_SUFFIX=".20" declare -r TIMEOUT="60" declare -r IMAGE_TAG="gcr.io/gvisor-presubmit/packetimpact" + # Make sure that docker is installed. docker --version @@ -169,19 +173,19 @@ function finish { trap finish EXIT # Subnet for control packets between test bench and DUT. -declare CTRL_NET_PREFIX=$(new_net_prefix) while ! docker network create \ "--subnet=${CTRL_NET_PREFIX}.0/${NET_MASK}" "${CTRL_NET}"; do sleep 0.1 - declare CTRL_NET_PREFIX=$(new_net_prefix) + CTRL_NET_PREFIX=$(new_net_prefix) + CTRL_NET="ctrl_net-${RANDOM}${RANDOM}" done # Subnet for the packets that are part of the test. -declare TEST_NET_PREFIX=$(new_net_prefix) while ! docker network create \ "--subnet=${TEST_NET_PREFIX}.0/${NET_MASK}" "${TEST_NET}"; do sleep 0.1 - declare TEST_NET_PREFIX=$(new_net_prefix) + TEST_NET_PREFIX=$(new_net_prefix) + TEST_NET="test_net-${RANDOM}${RANDOM}" done docker pull "${IMAGE_TAG}" diff --git a/test/root/BUILD b/test/root/BUILD index 05166673a..17e51e66e 100644 --- a/test/root/BUILD +++ b/test/root/BUILD @@ -33,14 +33,12 @@ go_test( ], visibility = ["//:sandbox"], deps = [ - "//runsc/boot", + "//pkg/test/criutil", + "//pkg/test/dockerutil", + "//pkg/test/testutil", "//runsc/cgroup", "//runsc/container", - "//runsc/criutil", - "//runsc/dockerutil", "//runsc/specutils", - "//runsc/testutil", - "//test/root/testdata", "@com_github_cenkalti_backoff//:go_default_library", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", "@com_github_syndtr_gocapability//capability:go_default_library", diff --git a/test/root/cgroup_test.go b/test/root/cgroup_test.go index 679342def..8876d0d61 100644 --- a/test/root/cgroup_test.go +++ b/test/root/cgroup_test.go @@ -26,9 +26,9 @@ import ( "testing" "time" + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/cgroup" - "gvisor.dev/gvisor/runsc/dockerutil" - "gvisor.dev/gvisor/runsc/testutil" ) func verifyPid(pid int, path string) error { @@ -56,54 +56,70 @@ func verifyPid(pid int, path string) error { return fmt.Errorf("got: %v, want: %d", gots, pid) } -// TestCgroup sets cgroup options and checks that cgroup was properly configured. func TestMemCGroup(t *testing.T) { - allocMemSize := 128 << 20 - if err := dockerutil.Pull("python"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("memusage-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // Start a new container and allocate the specified about of memory. - args := []string{ - "--memory=256MB", - "python", - "python", - "-c", - fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize), - } - if err := d.Run(args...); err != nil { - t.Fatal("docker create failed:", err) + allocMemSize := 128 << 20 + allocMemLimit := 2 * allocMemSize + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/python", + Memory: allocMemLimit / 1024, // Must be in Kb. + }, "python", "-c", fmt.Sprintf("import time; s = 'a' * %d; time.sleep(100)", allocMemSize)); err != nil { + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + // Extract the ID to lookup the cgroup. gid, err := d.ID() if err != nil { t.Fatalf("Docker.ID() failed: %v", err) } t.Logf("cgroup ID: %s", gid) - path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.usage_in_bytes") - memUsage := 0 - // Wait when the container will allocate memory. + memUsage := 0 start := time.Now() - for time.Now().Sub(start) < 30*time.Second { + for time.Since(start) < 30*time.Second { + // Sleep for a brief period of time after spawning the + // container (so that Docker can create the cgroup etc. + // or after looping below (so the application can start). + time.Sleep(100 * time.Millisecond) + + // Read the cgroup memory limit. + path := filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.limit_in_bytes") outRaw, err := ioutil.ReadFile(path) if err != nil { - t.Fatalf("failed to read %q: %v", path, err) + // It's possible that the container does not exist yet. + continue } out := strings.TrimSpace(string(outRaw)) + memLimit, err := strconv.Atoi(out) + if err != nil { + t.Fatalf("Atoi(%v): %v", out, err) + } + if memLimit != allocMemLimit { + // The group may not have had the correct limit set yet. + continue + } + + // Read the cgroup memory usage. + path = filepath.Join("/sys/fs/cgroup/memory/docker", gid, "memory.max_usage_in_bytes") + outRaw, err = ioutil.ReadFile(path) + if err != nil { + t.Fatalf("error reading usage: %v", err) + } + out = strings.TrimSpace(string(outRaw)) memUsage, err = strconv.Atoi(out) if err != nil { t.Fatalf("Atoi(%v): %v", out, err) } + t.Logf("read usage: %v, wanted: %v", memUsage, allocMemSize) - if memUsage > allocMemSize { + // Are we done? + if memUsage >= allocMemSize { return } - - time.Sleep(100 * time.Millisecond) } t.Fatalf("%vMB is less than %vMB", memUsage>>20, allocMemSize>>20) @@ -111,10 +127,8 @@ func TestMemCGroup(t *testing.T) { // TestCgroup sets cgroup options and checks that cgroup was properly configured. func TestCgroup(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("cgroup-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() // This is not a comprehensive list of attributes. // @@ -179,10 +193,11 @@ func TestCgroup(t *testing.T) { want: "5", }, { - arg: "--blkio-weight=750", - ctrl: "blkio", - file: "blkio.weight", - want: "750", + arg: "--blkio-weight=750", + ctrl: "blkio", + file: "blkio.weight", + want: "750", + skipIfNotFound: true, // blkio groups may not be available. }, } @@ -191,12 +206,15 @@ func TestCgroup(t *testing.T) { args = append(args, attr.arg) } - args = append(args, "alpine", "sleep", "10000") - if err := d.Run(args...); err != nil { - t.Fatal("docker create failed:", err) + // Start the container. + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + Extra: args, // Cgroup arguments. + }, "sleep", "10000"); err != nil { + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + // Lookup the relevant cgroup ID. gid, err := d.ID() if err != nil { t.Fatalf("Docker.ID() failed: %v", err) @@ -245,17 +263,21 @@ func TestCgroup(t *testing.T) { } } +// TestCgroup sets cgroup options and checks that cgroup was properly configured. func TestCgroupParent(t *testing.T) { - if err := dockerutil.Pull("alpine"); err != nil { - t.Fatal("docker pull failed:", err) - } - d := dockerutil.MakeDocker("cgroup-test") + d := dockerutil.MakeDocker(t) + defer d.CleanUp() - parent := testutil.RandomName("runsc") - if err := d.Run("--cgroup-parent", parent, "alpine", "sleep", "10000"); err != nil { - t.Fatal("docker create failed:", err) + // Construct a known cgroup name. + parent := testutil.RandomID("runsc-") + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + Extra: []string{fmt.Sprintf("--cgroup-parent=%s", parent)}, + }, "sleep", "10000"); err != nil { + t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() + + // Extract the ID to look up the cgroup. gid, err := d.ID() if err != nil { t.Fatalf("Docker.ID() failed: %v", err) diff --git a/test/root/chroot_test.go b/test/root/chroot_test.go index be0f63d18..a306132a4 100644 --- a/test/root/chroot_test.go +++ b/test/root/chroot_test.go @@ -24,17 +24,20 @@ import ( "strings" "testing" - "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" ) // TestChroot verifies that the sandbox is chroot'd and that mounts are cleaned // up after the sandbox is destroyed. func TestChroot(t *testing.T) { - d := dockerutil.MakeDocker("chroot-test") - if err := d.Run("alpine", "sleep", "10000"); err != nil { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sleep", "10000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() pid, err := d.SandboxPid() if err != nil { @@ -76,11 +79,14 @@ func TestChroot(t *testing.T) { } func TestChrootGofer(t *testing.T) { - d := dockerutil.MakeDocker("chroot-test") - if err := d.Run("alpine", "sleep", "10000"); err != nil { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + if err := d.Spawn(dockerutil.RunOpts{ + Image: "basic/alpine", + }, "sleep", "10000"); err != nil { t.Fatalf("docker run failed: %v", err) } - defer d.CleanUp() // It's tricky to find gofers. Get sandbox PID first, then find parent. From // parent get all immediate children, remove the sandbox, and everything else diff --git a/test/root/crictl_test.go b/test/root/crictl_test.go index 3f90c4c6a..85007dcce 100644 --- a/test/root/crictl_test.go +++ b/test/root/crictl_test.go @@ -16,6 +16,7 @@ package root import ( "bytes" + "encoding/json" "fmt" "io" "io/ioutil" @@ -29,16 +30,58 @@ import ( "testing" "time" - "gvisor.dev/gvisor/runsc/criutil" - "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/pkg/test/criutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" - "gvisor.dev/gvisor/test/root/testdata" ) // Tests for crictl have to be run as root (rather than in a user namespace) // because crictl creates named network namespaces in /var/run/netns/. +// SimpleSpec returns a JSON config for a simple container that runs the +// specified command in the specified image. +func SimpleSpec(name, image string, cmd []string, extra map[string]interface{}) string { + s := map[string]interface{}{ + "metadata": map[string]string{ + "name": name, + }, + "image": map[string]string{ + "image": testutil.ImageByName(image), + }, + "log_path": fmt.Sprintf("%s.log", name), + } + if len(cmd) > 0 { // Omit if empty. + s["command"] = cmd + } + for k, v := range extra { + s[k] = v // Extra settings. + } + v, err := json.Marshal(s) + if err != nil { + // This shouldn't happen. + panic(err) + } + return string(v) +} + +// Sandbox is a default JSON config for a sandbox. +var Sandbox = `{ + "metadata": { + "name": "default-sandbox", + "namespace": "default", + "attempt": 1, + "uid": "hdishd83djaidwnduwk28bcsb" + }, + "linux": { + }, + "log_directory": "/tmp" +} +` + +// Httpd is a JSON config for an httpd container. +var Httpd = SimpleSpec("httpd", "basic/httpd", nil, nil) + // TestCrictlSanity refers to b/112433158. func TestCrictlSanity(t *testing.T) { // Setup containerd and crictl. @@ -47,9 +90,9 @@ func TestCrictlSanity(t *testing.T) { t.Fatalf("failed to setup crictl: %v", err) } defer cleanup() - podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.Httpd) + podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, Httpd) if err != nil { - t.Fatal(err) + t.Fatalf("start failed: %v", err) } // Look for the httpd page. @@ -59,10 +102,38 @@ func TestCrictlSanity(t *testing.T) { // Stop everything. if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatal(err) + t.Fatalf("stop failed: %v", err) } } +// HttpdMountPaths is a JSON config for an httpd container with additional +// mounts. +var HttpdMountPaths = SimpleSpec("httpd", "basic/httpd", nil, map[string]interface{}{ + "mounts": []map[string]interface{}{ + map[string]interface{}{ + "container_path": "/var/run/secrets/kubernetes.io/serviceaccount", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx", + "readonly": true, + }, + map[string]interface{}{ + "container_path": "/etc/hosts", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts", + "readonly": false, + }, + map[string]interface{}{ + "container_path": "/dev/termination-log", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580", + "readonly": false, + }, + map[string]interface{}{ + "container_path": "/usr/local/apache2/htdocs/test", + "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064", + "readonly": true, + }, + }, + "linux": map[string]interface{}{}, +}) + // TestMountPaths refers to b/117635704. func TestMountPaths(t *testing.T) { // Setup containerd and crictl. @@ -71,9 +142,9 @@ func TestMountPaths(t *testing.T) { t.Fatalf("failed to setup crictl: %v", err) } defer cleanup() - podID, contID, err := crictl.StartPodAndContainer("httpd", testdata.Sandbox, testdata.HttpdMountPaths) + podID, contID, err := crictl.StartPodAndContainer("basic/httpd", Sandbox, HttpdMountPaths) if err != nil { - t.Fatal(err) + t.Fatalf("start failed: %v", err) } // Look for the directory available at /test. @@ -83,7 +154,7 @@ func TestMountPaths(t *testing.T) { // Stop everything. if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatal(err) + t.Fatalf("stop failed: %v", err) } } @@ -95,14 +166,16 @@ func TestMountOverSymlinks(t *testing.T) { t.Fatalf("failed to setup crictl: %v", err) } defer cleanup() - podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, testdata.MountOverSymlink) + + spec := SimpleSpec("busybox", "basic/resolv", []string{"sleep", "1000"}, nil) + podID, contID, err := crictl.StartPodAndContainer("basic/resolv", Sandbox, spec) if err != nil { - t.Fatal(err) + t.Fatalf("start failed: %v", err) } out, err := crictl.Exec(contID, "readlink", "/etc/resolv.conf") if err != nil { - t.Fatal(err) + t.Fatalf("readlink failed: %v, out: %s", err, out) } if want := "/tmp/resolv.conf"; !strings.Contains(string(out), want) { t.Fatalf("/etc/resolv.conf is not pointing to %q: %q", want, string(out)) @@ -110,11 +183,11 @@ func TestMountOverSymlinks(t *testing.T) { etc, err := crictl.Exec(contID, "cat", "/etc/resolv.conf") if err != nil { - t.Fatal(err) + t.Fatalf("cat failed: %v, out: %s", err, etc) } tmp, err := crictl.Exec(contID, "cat", "/tmp/resolv.conf") if err != nil { - t.Fatal(err) + t.Fatalf("cat failed: %v, out: %s", err, out) } if tmp != etc { t.Fatalf("file content doesn't match:\n\t/etc/resolv.conf: %s\n\t/tmp/resolv.conf: %s", string(etc), string(tmp)) @@ -122,7 +195,7 @@ func TestMountOverSymlinks(t *testing.T) { // Stop everything. if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatal(err) + t.Fatalf("stop failed: %v", err) } } @@ -135,16 +208,16 @@ func TestHomeDir(t *testing.T) { t.Fatalf("failed to setup crictl: %v", err) } defer cleanup() - contSpec := testdata.SimpleSpec("root", "k8s.gcr.io/busybox", []string{"sleep", "1000"}) - podID, contID, err := crictl.StartPodAndContainer("k8s.gcr.io/busybox", testdata.Sandbox, contSpec) + contSpec := SimpleSpec("root", "basic/busybox", []string{"sleep", "1000"}, nil) + podID, contID, err := crictl.StartPodAndContainer("basic/busybox", Sandbox, contSpec) if err != nil { - t.Fatal(err) + t.Fatalf("start failed: %v", err) } t.Run("root container", func(t *testing.T) { out, err := crictl.Exec(contID, "sh", "-c", "echo $HOME") if err != nil { - t.Fatal(err) + t.Fatalf("exec failed: %v, out: %s", err, out) } if got, want := strings.TrimSpace(string(out)), "/root"; got != want { t.Fatalf("Home directory invalid. Got %q, Want : %q", got, want) @@ -153,32 +226,47 @@ func TestHomeDir(t *testing.T) { t.Run("sub-container", func(t *testing.T) { // Create a sub container in the same pod. - subContSpec := testdata.SimpleSpec("subcontainer", "k8s.gcr.io/busybox", []string{"sleep", "1000"}) - subContID, err := crictl.StartContainer(podID, "k8s.gcr.io/busybox", testdata.Sandbox, subContSpec) + subContSpec := SimpleSpec("subcontainer", "basic/busybox", []string{"sleep", "1000"}, nil) + subContID, err := crictl.StartContainer(podID, "basic/busybox", Sandbox, subContSpec) if err != nil { - t.Fatal(err) + t.Fatalf("start failed: %v", err) } out, err := crictl.Exec(subContID, "sh", "-c", "echo $HOME") if err != nil { - t.Fatal(err) + t.Fatalf("exec failed: %v, out: %s", err, out) } if got, want := strings.TrimSpace(string(out)), "/root"; got != want { t.Fatalf("Home directory invalid. Got %q, Want: %q", got, want) } if err := crictl.StopContainer(subContID); err != nil { - t.Fatal(err) + t.Fatalf("stop failed: %v", err) } }) // Stop everything. if err := crictl.StopPodAndContainer(podID, contID); err != nil { - t.Fatal(err) + t.Fatalf("stop failed: %v", err) } } +// containerdConfigTemplate is a .toml config for containerd. It contains a +// formatting verb so the runtime field can be set via fmt.Sprintf. +const containerdConfigTemplate = ` +disabled_plugins = ["restart"] +[plugins.linux] + runtime = "%s" + runtime_root = "/tmp/test-containerd/runsc" + shim = "/usr/local/bin/gvisor-containerd-shim" + shim_debug = true + +[plugins.cri.containerd.runtimes.runsc] + runtime_type = "io.containerd.runtime.v1.linux" + runtime_engine = "%s" +` + // setup sets up before a test. Specifically it: // * Creates directories and a socket for containerd to utilize. // * Runs containerd and waits for it to reach a "ready" state for testing. @@ -213,50 +301,52 @@ func setup(t *testing.T) (*criutil.Crictl, func(), error) { if err != nil { t.Fatalf("error discovering runtime path: %v", err) } - config, err := testutil.WriteTmpFile("containerd-config", testdata.ContainerdConfig(runtime)) + config, configCleanup, err := testutil.WriteTmpFile("containerd-config", fmt.Sprintf(containerdConfigTemplate, runtime, runtime)) if err != nil { t.Fatalf("failed to write containerd config") } - cleanups = append(cleanups, func() { os.RemoveAll(config) }) + cleanups = append(cleanups, configCleanup) // Start containerd. - containerd := exec.Command(getContainerd(), + cmd := exec.Command(getContainerd(), "--config", config, "--log-level", "debug", "--root", containerdRoot, "--state", containerdState, "--address", sockAddr) + startupR, startupW := io.Pipe() + defer startupR.Close() + defer startupW.Close() + stderr := &bytes.Buffer{} + stdout := &bytes.Buffer{} + cmd.Stderr = io.MultiWriter(startupW, stderr) + cmd.Stdout = io.MultiWriter(startupW, stdout) cleanups = append(cleanups, func() { - if err := testutil.KillCommand(containerd); err != nil { - log.Printf("error killing containerd: %v", err) - } + t.Logf("containerd stdout: %s", stdout.String()) + t.Logf("containerd stderr: %s", stderr.String()) }) - containerdStderr, err := containerd.StderrPipe() - if err != nil { - t.Fatalf("failed to get containerd stderr: %v", err) - } - containerdStdout, err := containerd.StdoutPipe() - if err != nil { - t.Fatalf("failed to get containerd stdout: %v", err) - } - if err := containerd.Start(); err != nil { + + // Start the process. + if err := cmd.Start(); err != nil { t.Fatalf("failed running containerd: %v", err) } - // Wait for containerd to boot. Then put all containerd output into a - // buffer to be logged at the end of the test. - testutil.WaitUntilRead(containerdStderr, "Start streaming server", nil, 10*time.Second) - stdoutBuf := &bytes.Buffer{} - stderrBuf := &bytes.Buffer{} - go func() { io.Copy(stdoutBuf, containerdStdout) }() - go func() { io.Copy(stderrBuf, containerdStderr) }() + // Wait for containerd to boot. + if err := testutil.WaitUntilRead(startupR, "Start streaming server", nil, 10*time.Second); err != nil { + t.Fatalf("failed to start containerd: %v", err) + } + + // Kill must be the last cleanup (as it will be executed first). + cc := criutil.NewCrictl(t, sockAddr) cleanups = append(cleanups, func() { - t.Logf("containerd stdout: %s", string(stdoutBuf.Bytes())) - t.Logf("containerd stderr: %s", string(stderrBuf.Bytes())) + cc.CleanUp() // Remove tmp files, etc. + if err := testutil.KillCommand(cmd); err != nil { + log.Printf("error killing containerd: %v", err) + } }) cleanup.Release() - return criutil.NewCrictl(20*time.Second, sockAddr), cleanupFunc, nil + return cc, cleanupFunc, nil } // httpGet GETs the contents of a file served from a pod on port 80. diff --git a/test/root/main_test.go b/test/root/main_test.go index d74dec85f..9fb17e0dd 100644 --- a/test/root/main_test.go +++ b/test/root/main_test.go @@ -21,7 +21,7 @@ import ( "testing" "github.com/syndtr/gocapability/capability" - "gvisor.dev/gvisor/runsc/dockerutil" + "gvisor.dev/gvisor/pkg/test/dockerutil" "gvisor.dev/gvisor/runsc/specutils" ) diff --git a/test/root/oom_score_adj_test.go b/test/root/oom_score_adj_test.go index 22488b05d..9a3cecd97 100644 --- a/test/root/oom_score_adj_test.go +++ b/test/root/oom_score_adj_test.go @@ -20,10 +20,9 @@ import ( "testing" specs "github.com/opencontainers/runtime-spec/specs-go" - "gvisor.dev/gvisor/runsc/boot" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/container" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" ) var ( @@ -40,15 +39,6 @@ var ( // TestOOMScoreAdjSingle tests that oom_score_adj is set properly in a // single container sandbox. func TestOOMScoreAdjSingle(t *testing.T) { - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir - ppid, err := specutils.GetParentPid(os.Getpid()) if err != nil { t.Fatalf("getting parent pid: %v", err) @@ -89,11 +79,11 @@ func TestOOMScoreAdjSingle(t *testing.T) { for _, testCase := range testCases { t.Run(testCase.Name, func(t *testing.T) { - id := testutil.UniqueContainerID() + id := testutil.RandomContainerID() s := testutil.NewSpecWithArgs("sleep", "1000") s.Process.OOMScoreAdj = testCase.OOMScoreAdj - containers, cleanup, err := startContainers(conf, []*specs.Spec{s}, []string{id}) + containers, cleanup, err := startContainers(t, []*specs.Spec{s}, []string{id}) if err != nil { t.Fatalf("error starting containers: %v", err) } @@ -131,15 +121,6 @@ func TestOOMScoreAdjSingle(t *testing.T) { // TestOOMScoreAdjMulti tests that oom_score_adj is set properly in a // multi-container sandbox. func TestOOMScoreAdjMulti(t *testing.T) { - rootDir, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer os.RemoveAll(rootDir) - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir - ppid, err := specutils.GetParentPid(os.Getpid()) if err != nil { t.Fatalf("getting parent pid: %v", err) @@ -257,7 +238,7 @@ func TestOOMScoreAdjMulti(t *testing.T) { } } - containers, cleanup, err := startContainers(conf, specs, ids) + containers, cleanup, err := startContainers(t, specs, ids) if err != nil { t.Fatalf("error starting containers: %v", err) } @@ -321,7 +302,7 @@ func TestOOMScoreAdjMulti(t *testing.T) { func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { var specs []*specs.Spec var ids []string - rootID := testutil.UniqueContainerID() + rootID := testutil.RandomContainerID() for i, cmd := range cmds { spec := testutil.NewSpecWithArgs(cmd...) @@ -335,35 +316,48 @@ func createSpecs(cmds ...[]string) ([]*specs.Spec, []string) { specutils.ContainerdContainerTypeAnnotation: specutils.ContainerdContainerTypeContainer, specutils.ContainerdSandboxIDAnnotation: rootID, } - ids = append(ids, testutil.UniqueContainerID()) + ids = append(ids, testutil.RandomContainerID()) } specs = append(specs, spec) } return specs, ids } -func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) { - if len(conf.RootDir) == 0 { - panic("conf.RootDir not set. Call testutil.SetupRootDir() to set.") - } - - var containers []*container.Container - var bundles []string - cleanup := func() { +func startContainers(t *testing.T, specs []*specs.Spec, ids []string) ([]*container.Container, func(), error) { + var ( + containers []*container.Container + cleanups []func() + ) + cleanups = append(cleanups, func() { for _, c := range containers { c.Destroy() } - for _, b := range bundles { - os.RemoveAll(b) + }) + cleanupAll := func() { + for _, c := range cleanups { + c() } } + localClean := specutils.MakeCleanup(cleanupAll) + defer localClean.Clean() + + // All containers must share the same root. + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + cleanups = append(cleanups, cleanup) + + // Point this to from the configuration. + conf := testutil.TestConfig(t) + conf.RootDir = rootDir + for i, spec := range specs { - bundleDir, err := testutil.SetupBundleDir(spec) + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { - cleanup() - return nil, nil, fmt.Errorf("error setting up container: %v", err) + return nil, nil, fmt.Errorf("error setting up bundle: %v", err) } - bundles = append(bundles, bundleDir) + cleanups = append(cleanups, cleanup) args := container.Args{ ID: ids[i], @@ -372,15 +366,15 @@ func startContainers(conf *boot.Config, specs []*specs.Spec, ids []string) ([]*c } cont, err := container.New(conf, args) if err != nil { - cleanup() return nil, nil, fmt.Errorf("error creating container: %v", err) } containers = append(containers, cont) if err := cont.Start(conf); err != nil { - cleanup() return nil, nil, fmt.Errorf("error starting container: %v", err) } } - return containers, cleanup, nil + + localClean.Release() + return containers, cleanupAll, nil } diff --git a/test/root/runsc_test.go b/test/root/runsc_test.go index 90373e2db..25204bebb 100644 --- a/test/root/runsc_test.go +++ b/test/root/runsc_test.go @@ -28,8 +28,8 @@ import ( "github.com/cenkalti/backoff" "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" ) // TestDoKill checks that when "runsc do..." is killed, the sandbox process is diff --git a/test/root/testdata/BUILD b/test/root/testdata/BUILD deleted file mode 100644 index 6859541ad..000000000 --- a/test/root/testdata/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "testdata", - srcs = [ - "busybox.go", - "containerd_config.go", - "httpd.go", - "httpd_mount_paths.go", - "sandbox.go", - "simple.go", - ], - visibility = [ - "//:sandbox", - ], -) diff --git a/test/root/testdata/busybox.go b/test/root/testdata/busybox.go deleted file mode 100644 index e4dbd2843..000000000 --- a/test/root/testdata/busybox.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testdata - -// MountOverSymlink is a JSON config for a container that /etc/resolv.conf is a -// symlink to /tmp/resolv.conf. -var MountOverSymlink = ` -{ - "metadata": { - "name": "busybox" - }, - "image": { - "image": "k8s.gcr.io/busybox" - }, - "command": [ - "sleep", - "1000" - ] -} -` diff --git a/test/root/testdata/containerd_config.go b/test/root/testdata/containerd_config.go deleted file mode 100644 index e12f1ec88..000000000 --- a/test/root/testdata/containerd_config.go +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package testdata contains data required for root tests. -package testdata - -import "fmt" - -// containerdConfigTemplate is a .toml config for containerd. It contains a -// formatting verb so the runtime field can be set via fmt.Sprintf. -const containerdConfigTemplate = ` -disabled_plugins = ["restart"] -[plugins.linux] - runtime = "%s" - runtime_root = "/tmp/test-containerd/runsc" - shim = "/usr/local/bin/gvisor-containerd-shim" - shim_debug = true - -[plugins.cri.containerd.runtimes.runsc] - runtime_type = "io.containerd.runtime.v1.linux" - runtime_engine = "%s" -` - -// ContainerdConfig returns a containerd config file with the specified -// runtime. -func ContainerdConfig(runtime string) string { - return fmt.Sprintf(containerdConfigTemplate, runtime, runtime) -} diff --git a/test/root/testdata/httpd.go b/test/root/testdata/httpd.go deleted file mode 100644 index 45d5e33d4..000000000 --- a/test/root/testdata/httpd.go +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testdata - -// Httpd is a JSON config for an httpd container. -const Httpd = ` -{ - "metadata": { - "name": "httpd" - }, - "image":{ - "image": "httpd" - }, - "mounts": [ - ], - "linux": { - }, - "log_path": "httpd.log" -} -` diff --git a/test/root/testdata/httpd_mount_paths.go b/test/root/testdata/httpd_mount_paths.go deleted file mode 100644 index ac3f4446a..000000000 --- a/test/root/testdata/httpd_mount_paths.go +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testdata - -// HttpdMountPaths is a JSON config for an httpd container with additional -// mounts. -const HttpdMountPaths = ` -{ - "metadata": { - "name": "httpd" - }, - "image":{ - "image": "httpd" - }, - "mounts": [ - { - "container_path": "/var/run/secrets/kubernetes.io/serviceaccount", - "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/volumes/kubernetes.io~secret/default-token-2rpfx", - "readonly": true - }, - { - "container_path": "/etc/hosts", - "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/etc-hosts", - "readonly": false - }, - { - "container_path": "/dev/termination-log", - "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064/containers/httpd/d1709580", - "readonly": false - }, - { - "container_path": "/usr/local/apache2/htdocs/test", - "host_path": "/var/lib/kubelet/pods/82bae206-cdf5-11e8-b245-8cdcd43ac064", - "readonly": true - } - ], - "linux": { - }, - "log_path": "httpd.log" -} -` diff --git a/test/root/testdata/sandbox.go b/test/root/testdata/sandbox.go deleted file mode 100644 index 0db210370..000000000 --- a/test/root/testdata/sandbox.go +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testdata - -// Sandbox is a default JSON config for a sandbox. -const Sandbox = ` -{ - "metadata": { - "name": "default-sandbox", - "namespace": "default", - "attempt": 1, - "uid": "hdishd83djaidwnduwk28bcsb" - }, - "linux": { - }, - "log_directory": "/tmp" -} -` diff --git a/test/root/testdata/simple.go b/test/root/testdata/simple.go deleted file mode 100644 index 1cca53f0c..000000000 --- a/test/root/testdata/simple.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package testdata - -import ( - "encoding/json" - "fmt" -) - -// SimpleSpec returns a JSON config for a simple container that runs the -// specified command in the specified image. -func SimpleSpec(name, image string, cmd []string) string { - cmds, err := json.Marshal(cmd) - if err != nil { - // This shouldn't happen. - panic(err) - } - return fmt.Sprintf(` -{ - "metadata": { - "name": %q - }, - "image": { - "image": %q - }, - "command": %s - } -`, name, image, cmds) -} diff --git a/test/runner/BUILD b/test/runner/BUILD index 9959ef9b0..6833c9986 100644 --- a/test/runner/BUILD +++ b/test/runner/BUILD @@ -12,8 +12,8 @@ go_binary( visibility = ["//:sandbox"], deps = [ "//pkg/log", + "//pkg/test/testutil", "//runsc/specutils", - "//runsc/testutil", "//test/runner/gtest", "//test/uds", "@com_github_opencontainers_runtime-spec//specs-go:go_default_library", diff --git a/test/runner/runner.go b/test/runner/runner.go index 0d3742f71..14c9cbc47 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -32,8 +32,8 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/test/testutil" "gvisor.dev/gvisor/runsc/specutils" - "gvisor.dev/gvisor/runsc/testutil" "gvisor.dev/gvisor/test/runner/gtest" "gvisor.dev/gvisor/test/uds" ) @@ -115,20 +115,20 @@ func runTestCaseNative(testBin string, tc gtest.TestCase, t *testing.T) { // // Returns an error if the sandboxed application exits non-zero. func runRunsc(tc gtest.TestCase, spec *specs.Spec) error { - bundleDir, err := testutil.SetupBundleDir(spec) + bundleDir, cleanup, err := testutil.SetupBundleDir(spec) if err != nil { return fmt.Errorf("SetupBundleDir failed: %v", err) } - defer os.RemoveAll(bundleDir) + defer cleanup() - rootDir, err := testutil.SetupRootDir() + rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { return fmt.Errorf("SetupRootDir failed: %v", err) } - defer os.RemoveAll(rootDir) + defer cleanup() name := tc.FullName() - id := testutil.UniqueContainerID() + id := testutil.RandomContainerID() log.Infof("Running test %q in container %q", name, id) specutils.LogSpec(spec) diff --git a/test/runtimes/BUILD b/test/runtimes/BUILD index 2c472bf8d..4cd627222 100644 --- a/test/runtimes/BUILD +++ b/test/runtimes/BUILD @@ -1,20 +1,7 @@ -# These packages are used to run language runtime tests inside gVisor sandboxes. - -load("//tools:defs.bzl", "go_binary", "go_test") -load("//test/runtimes:build_defs.bzl", "runtime_test") +load("//test/runtimes:defs.bzl", "runtime_test") package(licenses = ["notice"]) -go_binary( - name = "runner", - testonly = 1, - srcs = ["runner.go"], - deps = [ - "//runsc/dockerutil", - "//runsc/testutil", - ], -) - runtime_test( name = "go1.12", blacklist_file = "blacklist_go1.12.csv", @@ -44,10 +31,3 @@ runtime_test( blacklist_file = "blacklist_python3.7.3.csv", lang = "python", ) - -go_test( - name = "blacklist_test", - size = "small", - srcs = ["blacklist_test.go"], - library = ":runner", -) diff --git a/test/runtimes/README.md b/test/runtimes/README.md deleted file mode 100644 index 42d722553..000000000 --- a/test/runtimes/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Runtimes Tests Dockerfiles - -The Dockerfiles defined under this path are configured to host the execution of -the runtimes language tests. Each Dockerfile can support the language indicated -by its directory. - -The following runtimes are currently supported: - -- Go 1.12 -- Java 11 -- Node.js 12 -- PHP 7.3 -- Python 3.7 - -### Building and pushing the images: - -The canonical source of images is the -[gvisor-presubmit container registry](https://gcr.io/gvisor-presubmit/). You can -build new images with the following command: - -```bash -$ cd images -$ docker build -f Dockerfile_$LANG [-t $NAME] . -``` - -To push them to our container registry, set the tag in the command above to -`gcr.io/gvisor-presubmit/$LANG`, then push them. (Note that you will need -appropriate permissions to the `gvisor-presubmit` GCP project.) - -```bash -gcloud docker -- push gcr.io/gvisor-presubmit/$LANG -``` - -#### Running in Docker locally: - -1) [Install and configure Docker](https://docs.docker.com/install/) - -2) Pull the image you want to run: - -```bash -$ docker pull gcr.io/gvisor-presubmit/$LANG -``` - -3) Run docker with the image. - -```bash -$ docker run [--runtime=runsc] --rm -it $NAME [FLAG] -``` - -Running the command with no flags will cause all the available tests to execute. - -Flags can be added for additional functionality: - -- --list: Print a list of all available tests -- --test <name>: Run a single test from the list of available tests -- --v: Print the language version diff --git a/test/runtimes/blacklist_test.go b/test/runtimes/blacklist_test.go deleted file mode 100644 index 0ff69ab18..000000000 --- a/test/runtimes/blacklist_test.go +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "flag" - "os" - "testing" -) - -func TestMain(m *testing.M) { - flag.Parse() - os.Exit(m.Run()) -} - -// Test that the blacklist parses without error. -func TestBlacklists(t *testing.T) { - bl, err := getBlacklist() - if err != nil { - t.Fatalf("error parsing blacklist: %v", err) - } - if *blacklistFile != "" && len(bl) == 0 { - t.Errorf("got empty blacklist for file %q", *blacklistFile) - } -} diff --git a/test/runtimes/build_defs.bzl b/test/runtimes/build_defs.bzl deleted file mode 100644 index 92e275a76..000000000 --- a/test/runtimes/build_defs.bzl +++ /dev/null @@ -1,75 +0,0 @@ -"""Defines a rule for runtime test targets.""" - -load("//tools:defs.bzl", "go_test", "loopback") - -def runtime_test( - name, - lang, - image_repo = "gcr.io/gvisor-presubmit", - image_name = None, - blacklist_file = None, - shard_count = 50, - size = "enormous"): - """Generates sh_test and blacklist test targets for a given runtime. - - Args: - name: The name of the runtime being tested. Typically, the lang + version. - This is used in the names of the generated test targets. - lang: The language being tested. - image_repo: The docker repository containing the proctor image to run. - i.e., the prefix to the fully qualified docker image id. - image_name: The name of the image in the image_repo. - Defaults to the test name. - blacklist_file: A test blacklist to pass to the runtime test's runner. - shard_count: See Bazel common test attributes. - size: See Bazel common test attributes. - """ - if image_name == None: - image_name = name - args = [ - "--lang", - lang, - "--image", - "/".join([image_repo, image_name]), - ] - data = [ - ":runner", - loopback, - ] - if blacklist_file: - args += ["--blacklist_file", "test/runtimes/" + blacklist_file] - data += [blacklist_file] - - # Add a test that the blacklist parses correctly. - blacklist_test(name, blacklist_file) - - sh_test( - name = name + "_test", - srcs = ["runner.sh"], - args = args, - data = data, - size = size, - shard_count = shard_count, - tags = [ - # Requires docker and runsc to be configured before the test runs. - "local", - # Don't include test target in wildcard target patterns. - "manual", - ], - ) - -def blacklist_test(name, blacklist_file): - """Test that a blacklist parses correctly.""" - go_test( - name = name + "_blacklist_test", - library = ":runner", - srcs = ["blacklist_test.go"], - args = ["--blacklist_file", "test/runtimes/" + blacklist_file], - data = [blacklist_file], - ) - -def sh_test(**kwargs): - """Wraps the standard sh_test.""" - native.sh_test( - **kwargs - ) diff --git a/test/runtimes/defs.bzl b/test/runtimes/defs.bzl new file mode 100644 index 000000000..f836dd952 --- /dev/null +++ b/test/runtimes/defs.bzl @@ -0,0 +1,79 @@ +"""Defines a rule for runtime test targets.""" + +load("//tools:defs.bzl", "go_test") + +def _runtime_test_impl(ctx): + # Construct arguments. + args = [ + "--lang", + ctx.attr.lang, + "--image", + ctx.attr.image, + ] + if ctx.attr.blacklist_file: + args += [ + "--blacklist_file", + ctx.files.blacklist_file[0].short_path, + ] + + # Build a runner. + runner = ctx.actions.declare_file("%s-executer" % ctx.label.name) + runner_content = "\n".join([ + "#!/bin/bash", + "%s %s\n" % (ctx.files._runner[0].short_path, " ".join(args)), + ]) + ctx.actions.write(runner, runner_content, is_executable = True) + + # Return the runner. + return [DefaultInfo( + executable = runner, + runfiles = ctx.runfiles( + files = ctx.files._runner + ctx.files.blacklist_file + ctx.files._proctor, + collect_default = True, + collect_data = True, + ), + )] + +_runtime_test = rule( + implementation = _runtime_test_impl, + attrs = { + "image": attr.string( + mandatory = False, + ), + "lang": attr.string( + mandatory = True, + ), + "blacklist_file": attr.label( + mandatory = False, + allow_single_file = True, + ), + "_runner": attr.label( + default = "//test/runtimes/runner:runner", + ), + "_proctor": attr.label( + default = "//test/runtimes/proctor:proctor", + ), + }, + test = True, +) + +def runtime_test(name, **kwargs): + _runtime_test( + name = name, + image = name, # Resolved as images/runtimes/%s. + tags = [ + "local", + "manual", + ], + **kwargs + ) + +def blacklist_test(name, blacklist_file): + """Test that a blacklist parses correctly.""" + go_test( + name = name + "_blacklist_test", + library = ":runner", + srcs = ["blacklist_test.go"], + args = ["--blacklist_file", "test/runtimes/" + blacklist_file], + data = [blacklist_file], + ) diff --git a/test/runtimes/images/proctor/BUILD b/test/runtimes/images/proctor/BUILD deleted file mode 100644 index 85e004c45..000000000 --- a/test/runtimes/images/proctor/BUILD +++ /dev/null @@ -1,26 +0,0 @@ -load("//tools:defs.bzl", "go_binary", "go_test") - -package(licenses = ["notice"]) - -go_binary( - name = "proctor", - srcs = [ - "go.go", - "java.go", - "nodejs.go", - "php.go", - "proctor.go", - "python.go", - ], - visibility = ["//test/runtimes/images:__subpackages__"], -) - -go_test( - name = "proctor_test", - size = "small", - srcs = ["proctor_test.go"], - library = ":proctor", - deps = [ - "//runsc/testutil", - ], -) diff --git a/test/runtimes/images/proctor/go.go b/test/runtimes/images/proctor/go.go deleted file mode 100644 index 3e2d5d8db..000000000 --- a/test/runtimes/images/proctor/go.go +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "fmt" - "os" - "os/exec" - "regexp" - "strings" -) - -var ( - goTestRegEx = regexp.MustCompile(`^.+\.go$`) - - // Directories with .dir contain helper files for tests. - // Exclude benchmarks and stress tests. - goDirFilter = regexp.MustCompile(`^(bench|stress)\/.+$|^.+\.dir.+$`) -) - -// Location of Go tests on disk. -const goTestDir = "/usr/local/go/test" - -// goRunner implements TestRunner for Go. -// -// There are two types of Go tests: "Go tool tests" and "Go tests on disk". -// "Go tool tests" are found and executed using `go tool dist test`. "Go tests -// on disk" are found in the /usr/local/go/test directory and are executed -// using `go run run.go`. -type goRunner struct{} - -var _ TestRunner = goRunner{} - -// ListTests implements TestRunner.ListTests. -func (goRunner) ListTests() ([]string, error) { - // Go tool dist test tests. - args := []string{"tool", "dist", "test", "-list"} - cmd := exec.Command("go", args...) - cmd.Stderr = os.Stderr - out, err := cmd.Output() - if err != nil { - return nil, fmt.Errorf("failed to list: %v", err) - } - var toolSlice []string - for _, test := range strings.Split(string(out), "\n") { - toolSlice = append(toolSlice, test) - } - - // Go tests on disk. - diskSlice, err := search(goTestDir, goTestRegEx) - if err != nil { - return nil, err - } - // Remove items from /bench/, /stress/ and .dir files - diskFiltered := diskSlice[:0] - for _, file := range diskSlice { - if !goDirFilter.MatchString(file) { - diskFiltered = append(diskFiltered, file) - } - } - - return append(toolSlice, diskFiltered...), nil -} - -// TestCmd implements TestRunner.TestCmd. -func (goRunner) TestCmd(test string) *exec.Cmd { - // Check if test exists on disk by searching for file of the same name. - // This will determine whether or not it is a Go test on disk. - if strings.HasSuffix(test, ".go") { - // Test has suffix ".go" which indicates a disk test, run it as such. - cmd := exec.Command("go", "run", "run.go", "-v", "--", test) - cmd.Dir = goTestDir - return cmd - } - - // No ".go" suffix, run as a tool test. - return exec.Command("go", "tool", "dist", "test", "-run", test) -} diff --git a/test/runtimes/images/proctor/java.go b/test/runtimes/images/proctor/java.go deleted file mode 100644 index 8b362029d..000000000 --- a/test/runtimes/images/proctor/java.go +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "fmt" - "os" - "os/exec" - "regexp" - "strings" -) - -// Directories to exclude from tests. -var javaExclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`) - -// Location of java tests. -const javaTestDir = "/root/test/jdk" - -// javaRunner implements TestRunner for Java. -type javaRunner struct{} - -var _ TestRunner = javaRunner{} - -// ListTests implements TestRunner.ListTests. -func (javaRunner) ListTests() ([]string, error) { - args := []string{ - "-dir:" + javaTestDir, - "-ignore:quiet", - "-a", - "-listtests", - ":jdk_core", - ":jdk_svc", - ":jdk_sound", - ":jdk_imageio", - } - cmd := exec.Command("jtreg", args...) - cmd.Stderr = os.Stderr - out, err := cmd.Output() - if err != nil { - return nil, fmt.Errorf("jtreg -listtests : %v", err) - } - var testSlice []string - for _, test := range strings.Split(string(out), "\n") { - if !javaExclDirs.MatchString(test) { - testSlice = append(testSlice, test) - } - } - return testSlice, nil -} - -// TestCmd implements TestRunner.TestCmd. -func (javaRunner) TestCmd(test string) *exec.Cmd { - args := []string{ - "-noreport", - "-dir:" + javaTestDir, - test, - } - return exec.Command("jtreg", args...) -} diff --git a/test/runtimes/images/proctor/nodejs.go b/test/runtimes/images/proctor/nodejs.go deleted file mode 100644 index bd57db444..000000000 --- a/test/runtimes/images/proctor/nodejs.go +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "os/exec" - "path/filepath" - "regexp" -) - -var nodejsTestRegEx = regexp.MustCompile(`^test-[^-].+\.js$`) - -// Location of nodejs tests relative to working dir. -const nodejsTestDir = "test" - -// nodejsRunner implements TestRunner for NodeJS. -type nodejsRunner struct{} - -var _ TestRunner = nodejsRunner{} - -// ListTests implements TestRunner.ListTests. -func (nodejsRunner) ListTests() ([]string, error) { - testSlice, err := search(nodejsTestDir, nodejsTestRegEx) - if err != nil { - return nil, err - } - return testSlice, nil -} - -// TestCmd implements TestRunner.TestCmd. -func (nodejsRunner) TestCmd(test string) *exec.Cmd { - args := []string{filepath.Join("tools", "test.py"), test} - return exec.Command("/usr/bin/python", args...) -} diff --git a/test/runtimes/images/proctor/php.go b/test/runtimes/images/proctor/php.go deleted file mode 100644 index 9115040e1..000000000 --- a/test/runtimes/images/proctor/php.go +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "os/exec" - "regexp" -) - -var phpTestRegEx = regexp.MustCompile(`^.+\.phpt$`) - -// phpRunner implements TestRunner for PHP. -type phpRunner struct{} - -var _ TestRunner = phpRunner{} - -// ListTests implements TestRunner.ListTests. -func (phpRunner) ListTests() ([]string, error) { - testSlice, err := search(".", phpTestRegEx) - if err != nil { - return nil, err - } - return testSlice, nil -} - -// TestCmd implements TestRunner.TestCmd. -func (phpRunner) TestCmd(test string) *exec.Cmd { - args := []string{"test", "TESTS=" + test} - return exec.Command("make", args...) -} diff --git a/test/runtimes/images/proctor/proctor.go b/test/runtimes/images/proctor/proctor.go deleted file mode 100644 index b54abe434..000000000 --- a/test/runtimes/images/proctor/proctor.go +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary proctor runs the test for a particular runtime. It is meant to be -// included in Docker images for all runtime tests. -package main - -import ( - "flag" - "fmt" - "log" - "os" - "os/exec" - "os/signal" - "path/filepath" - "regexp" - "syscall" -) - -// TestRunner is an interface that must be implemented for each runtime -// integrated with proctor. -type TestRunner interface { - // ListTests returns a string slice of tests available to run. - ListTests() ([]string, error) - - // TestCmd returns an *exec.Cmd that will run the given test. - TestCmd(test string) *exec.Cmd -} - -var ( - runtime = flag.String("runtime", "", "name of runtime") - list = flag.Bool("list", false, "list all available tests") - testName = flag.String("test", "", "run a single test from the list of available tests") - pause = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children") -) - -func main() { - flag.Parse() - - if *pause { - pauseAndReap() - panic("pauseAndReap should never return") - } - - if *runtime == "" { - log.Fatalf("runtime flag must be provided") - } - - tr, err := testRunnerForRuntime(*runtime) - if err != nil { - log.Fatalf("%v", err) - } - - // List tests. - if *list { - tests, err := tr.ListTests() - if err != nil { - log.Fatalf("failed to list tests: %v", err) - } - for _, test := range tests { - fmt.Println(test) - } - return - } - - var tests []string - if *testName == "" { - // Run every test. - tests, err = tr.ListTests() - if err != nil { - log.Fatalf("failed to get all tests: %v", err) - } - } else { - // Run a single test. - tests = []string{*testName} - } - for _, test := range tests { - cmd := tr.TestCmd(test) - cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr - if err := cmd.Run(); err != nil { - log.Fatalf("FAIL: %v", err) - } - } -} - -// testRunnerForRuntime returns a new TestRunner for the given runtime. -func testRunnerForRuntime(runtime string) (TestRunner, error) { - switch runtime { - case "go": - return goRunner{}, nil - case "java": - return javaRunner{}, nil - case "nodejs": - return nodejsRunner{}, nil - case "php": - return phpRunner{}, nil - case "python": - return pythonRunner{}, nil - } - return nil, fmt.Errorf("invalid runtime %q", runtime) -} - -// pauseAndReap is like init. It runs forever and reaps any children. -func pauseAndReap() { - // Get notified of any new children. - ch := make(chan os.Signal, 1) - signal.Notify(ch, syscall.SIGCHLD) - - for { - if _, ok := <-ch; !ok { - // Channel closed. This should not happen. - panic("signal channel closed") - } - - // Reap the child. - for { - if cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); cpid < 1 { - break - } - } - } -} - -// search is a helper function to find tests in the given directory that match -// the regex. -func search(root string, testFilter *regexp.Regexp) ([]string, error) { - var testSlice []string - - err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error { - if err != nil { - return err - } - - name := filepath.Base(path) - - if info.IsDir() || !testFilter.MatchString(name) { - return nil - } - - relPath, err := filepath.Rel(root, path) - if err != nil { - return err - } - testSlice = append(testSlice, relPath) - return nil - }) - if err != nil { - return nil, fmt.Errorf("walking %q: %v", root, err) - } - - return testSlice, nil -} diff --git a/test/runtimes/images/proctor/proctor_test.go b/test/runtimes/images/proctor/proctor_test.go deleted file mode 100644 index 6bb61d142..000000000 --- a/test/runtimes/images/proctor/proctor_test.go +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "io/ioutil" - "os" - "path/filepath" - "reflect" - "regexp" - "strings" - "testing" - - "gvisor.dev/gvisor/runsc/testutil" -) - -func touch(t *testing.T, name string) { - t.Helper() - f, err := os.Create(name) - if err != nil { - t.Fatal(err) - } - if err := f.Close(); err != nil { - t.Fatal(err) - } -} - -func TestSearchEmptyDir(t *testing.T) { - td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(td) - - var want []string - - testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`) - got, err := search(td, testFilter) - if err != nil { - t.Errorf("search error: %v", err) - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("Found %#v; want %#v", got, want) - } -} - -func TestSearch(t *testing.T) { - td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(td) - - // Creating various files similar to the test filter regex. - files := []string{ - "emp/", - "tee/", - "test-foo.tc", - "test-foo.tc", - "test-bar.tc", - "test-sam.tc", - "Test-que.tc", - "test-brett", - "test--abc.tc", - "test---xyz.tc", - "test-bool.TC", - "--test-gvs.tc", - " test-pew.tc", - "dir/test_baz.tc", - "dir/testsnap.tc", - "dir/test-luk.tc", - "dir/nest/test-ok.tc", - "dir/dip/diz/goog/test-pack.tc", - "dir/dip/diz/wobble/thud/test-cas.e", - "dir/dip/diz/wobble/thud/test-cas.tc", - } - want := []string{ - "dir/dip/diz/goog/test-pack.tc", - "dir/dip/diz/wobble/thud/test-cas.tc", - "dir/nest/test-ok.tc", - "dir/test-luk.tc", - "test-bar.tc", - "test-foo.tc", - "test-sam.tc", - } - - for _, item := range files { - if strings.HasSuffix(item, "/") { - // This item is a directory, create it. - if err := os.MkdirAll(filepath.Join(td, item), 0755); err != nil { - t.Fatal(err) - } - } else { - // This item is a file, create the directory and touch file. - // Create directory in which file should be created - fullDirPath := filepath.Join(td, filepath.Dir(item)) - if err := os.MkdirAll(fullDirPath, 0755); err != nil { - t.Fatal(err) - } - // Create file with full path to file. - touch(t, filepath.Join(td, item)) - } - } - - testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`) - got, err := search(td, testFilter) - if err != nil { - t.Errorf("search error: %v", err) - } - - if !reflect.DeepEqual(got, want) { - t.Errorf("Found %#v; want %#v", got, want) - } -} diff --git a/test/runtimes/images/proctor/python.go b/test/runtimes/images/proctor/python.go deleted file mode 100644 index b9e0fbe6f..000000000 --- a/test/runtimes/images/proctor/python.go +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package main - -import ( - "fmt" - "os" - "os/exec" - "strings" -) - -// pythonRunner implements TestRunner for Python. -type pythonRunner struct{} - -var _ TestRunner = pythonRunner{} - -// ListTests implements TestRunner.ListTests. -func (pythonRunner) ListTests() ([]string, error) { - args := []string{"-m", "test", "--list-tests"} - cmd := exec.Command("./python", args...) - cmd.Stderr = os.Stderr - out, err := cmd.Output() - if err != nil { - return nil, fmt.Errorf("failed to list: %v", err) - } - var toolSlice []string - for _, test := range strings.Split(string(out), "\n") { - toolSlice = append(toolSlice, test) - } - return toolSlice, nil -} - -// TestCmd implements TestRunner.TestCmd. -func (pythonRunner) TestCmd(test string) *exec.Cmd { - args := []string{"-m", "test", test} - return exec.Command("./python", args...) -} diff --git a/test/runtimes/proctor/BUILD b/test/runtimes/proctor/BUILD new file mode 100644 index 000000000..50a26d182 --- /dev/null +++ b/test/runtimes/proctor/BUILD @@ -0,0 +1,27 @@ +load("//tools:defs.bzl", "go_binary", "go_test") + +package(licenses = ["notice"]) + +go_binary( + name = "proctor", + srcs = [ + "go.go", + "java.go", + "nodejs.go", + "php.go", + "proctor.go", + "python.go", + ], + pure = True, + visibility = ["//test/runtimes:__pkg__"], +) + +go_test( + name = "proctor_test", + size = "small", + srcs = ["proctor_test.go"], + library = ":proctor", + deps = [ + "//pkg/test/testutil", + ], +) diff --git a/test/runtimes/proctor/go.go b/test/runtimes/proctor/go.go new file mode 100644 index 000000000..3e2d5d8db --- /dev/null +++ b/test/runtimes/proctor/go.go @@ -0,0 +1,90 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + "os/exec" + "regexp" + "strings" +) + +var ( + goTestRegEx = regexp.MustCompile(`^.+\.go$`) + + // Directories with .dir contain helper files for tests. + // Exclude benchmarks and stress tests. + goDirFilter = regexp.MustCompile(`^(bench|stress)\/.+$|^.+\.dir.+$`) +) + +// Location of Go tests on disk. +const goTestDir = "/usr/local/go/test" + +// goRunner implements TestRunner for Go. +// +// There are two types of Go tests: "Go tool tests" and "Go tests on disk". +// "Go tool tests" are found and executed using `go tool dist test`. "Go tests +// on disk" are found in the /usr/local/go/test directory and are executed +// using `go run run.go`. +type goRunner struct{} + +var _ TestRunner = goRunner{} + +// ListTests implements TestRunner.ListTests. +func (goRunner) ListTests() ([]string, error) { + // Go tool dist test tests. + args := []string{"tool", "dist", "test", "-list"} + cmd := exec.Command("go", args...) + cmd.Stderr = os.Stderr + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("failed to list: %v", err) + } + var toolSlice []string + for _, test := range strings.Split(string(out), "\n") { + toolSlice = append(toolSlice, test) + } + + // Go tests on disk. + diskSlice, err := search(goTestDir, goTestRegEx) + if err != nil { + return nil, err + } + // Remove items from /bench/, /stress/ and .dir files + diskFiltered := diskSlice[:0] + for _, file := range diskSlice { + if !goDirFilter.MatchString(file) { + diskFiltered = append(diskFiltered, file) + } + } + + return append(toolSlice, diskFiltered...), nil +} + +// TestCmd implements TestRunner.TestCmd. +func (goRunner) TestCmd(test string) *exec.Cmd { + // Check if test exists on disk by searching for file of the same name. + // This will determine whether or not it is a Go test on disk. + if strings.HasSuffix(test, ".go") { + // Test has suffix ".go" which indicates a disk test, run it as such. + cmd := exec.Command("go", "run", "run.go", "-v", "--", test) + cmd.Dir = goTestDir + return cmd + } + + // No ".go" suffix, run as a tool test. + return exec.Command("go", "tool", "dist", "test", "-run", test) +} diff --git a/test/runtimes/proctor/java.go b/test/runtimes/proctor/java.go new file mode 100644 index 000000000..8b362029d --- /dev/null +++ b/test/runtimes/proctor/java.go @@ -0,0 +1,71 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + "os/exec" + "regexp" + "strings" +) + +// Directories to exclude from tests. +var javaExclDirs = regexp.MustCompile(`(^(sun\/security)|(java\/util\/stream)|(java\/time)| )`) + +// Location of java tests. +const javaTestDir = "/root/test/jdk" + +// javaRunner implements TestRunner for Java. +type javaRunner struct{} + +var _ TestRunner = javaRunner{} + +// ListTests implements TestRunner.ListTests. +func (javaRunner) ListTests() ([]string, error) { + args := []string{ + "-dir:" + javaTestDir, + "-ignore:quiet", + "-a", + "-listtests", + ":jdk_core", + ":jdk_svc", + ":jdk_sound", + ":jdk_imageio", + } + cmd := exec.Command("jtreg", args...) + cmd.Stderr = os.Stderr + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("jtreg -listtests : %v", err) + } + var testSlice []string + for _, test := range strings.Split(string(out), "\n") { + if !javaExclDirs.MatchString(test) { + testSlice = append(testSlice, test) + } + } + return testSlice, nil +} + +// TestCmd implements TestRunner.TestCmd. +func (javaRunner) TestCmd(test string) *exec.Cmd { + args := []string{ + "-noreport", + "-dir:" + javaTestDir, + test, + } + return exec.Command("jtreg", args...) +} diff --git a/test/runtimes/proctor/nodejs.go b/test/runtimes/proctor/nodejs.go new file mode 100644 index 000000000..bd57db444 --- /dev/null +++ b/test/runtimes/proctor/nodejs.go @@ -0,0 +1,46 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os/exec" + "path/filepath" + "regexp" +) + +var nodejsTestRegEx = regexp.MustCompile(`^test-[^-].+\.js$`) + +// Location of nodejs tests relative to working dir. +const nodejsTestDir = "test" + +// nodejsRunner implements TestRunner for NodeJS. +type nodejsRunner struct{} + +var _ TestRunner = nodejsRunner{} + +// ListTests implements TestRunner.ListTests. +func (nodejsRunner) ListTests() ([]string, error) { + testSlice, err := search(nodejsTestDir, nodejsTestRegEx) + if err != nil { + return nil, err + } + return testSlice, nil +} + +// TestCmd implements TestRunner.TestCmd. +func (nodejsRunner) TestCmd(test string) *exec.Cmd { + args := []string{filepath.Join("tools", "test.py"), test} + return exec.Command("/usr/bin/python", args...) +} diff --git a/test/runtimes/proctor/php.go b/test/runtimes/proctor/php.go new file mode 100644 index 000000000..9115040e1 --- /dev/null +++ b/test/runtimes/proctor/php.go @@ -0,0 +1,42 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os/exec" + "regexp" +) + +var phpTestRegEx = regexp.MustCompile(`^.+\.phpt$`) + +// phpRunner implements TestRunner for PHP. +type phpRunner struct{} + +var _ TestRunner = phpRunner{} + +// ListTests implements TestRunner.ListTests. +func (phpRunner) ListTests() ([]string, error) { + testSlice, err := search(".", phpTestRegEx) + if err != nil { + return nil, err + } + return testSlice, nil +} + +// TestCmd implements TestRunner.TestCmd. +func (phpRunner) TestCmd(test string) *exec.Cmd { + args := []string{"test", "TESTS=" + test} + return exec.Command("make", args...) +} diff --git a/test/runtimes/proctor/proctor.go b/test/runtimes/proctor/proctor.go new file mode 100644 index 000000000..b54abe434 --- /dev/null +++ b/test/runtimes/proctor/proctor.go @@ -0,0 +1,163 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary proctor runs the test for a particular runtime. It is meant to be +// included in Docker images for all runtime tests. +package main + +import ( + "flag" + "fmt" + "log" + "os" + "os/exec" + "os/signal" + "path/filepath" + "regexp" + "syscall" +) + +// TestRunner is an interface that must be implemented for each runtime +// integrated with proctor. +type TestRunner interface { + // ListTests returns a string slice of tests available to run. + ListTests() ([]string, error) + + // TestCmd returns an *exec.Cmd that will run the given test. + TestCmd(test string) *exec.Cmd +} + +var ( + runtime = flag.String("runtime", "", "name of runtime") + list = flag.Bool("list", false, "list all available tests") + testName = flag.String("test", "", "run a single test from the list of available tests") + pause = flag.Bool("pause", false, "cause container to pause indefinitely, reaping any zombie children") +) + +func main() { + flag.Parse() + + if *pause { + pauseAndReap() + panic("pauseAndReap should never return") + } + + if *runtime == "" { + log.Fatalf("runtime flag must be provided") + } + + tr, err := testRunnerForRuntime(*runtime) + if err != nil { + log.Fatalf("%v", err) + } + + // List tests. + if *list { + tests, err := tr.ListTests() + if err != nil { + log.Fatalf("failed to list tests: %v", err) + } + for _, test := range tests { + fmt.Println(test) + } + return + } + + var tests []string + if *testName == "" { + // Run every test. + tests, err = tr.ListTests() + if err != nil { + log.Fatalf("failed to get all tests: %v", err) + } + } else { + // Run a single test. + tests = []string{*testName} + } + for _, test := range tests { + cmd := tr.TestCmd(test) + cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr + if err := cmd.Run(); err != nil { + log.Fatalf("FAIL: %v", err) + } + } +} + +// testRunnerForRuntime returns a new TestRunner for the given runtime. +func testRunnerForRuntime(runtime string) (TestRunner, error) { + switch runtime { + case "go": + return goRunner{}, nil + case "java": + return javaRunner{}, nil + case "nodejs": + return nodejsRunner{}, nil + case "php": + return phpRunner{}, nil + case "python": + return pythonRunner{}, nil + } + return nil, fmt.Errorf("invalid runtime %q", runtime) +} + +// pauseAndReap is like init. It runs forever and reaps any children. +func pauseAndReap() { + // Get notified of any new children. + ch := make(chan os.Signal, 1) + signal.Notify(ch, syscall.SIGCHLD) + + for { + if _, ok := <-ch; !ok { + // Channel closed. This should not happen. + panic("signal channel closed") + } + + // Reap the child. + for { + if cpid, _ := syscall.Wait4(-1, nil, syscall.WNOHANG, nil); cpid < 1 { + break + } + } + } +} + +// search is a helper function to find tests in the given directory that match +// the regex. +func search(root string, testFilter *regexp.Regexp) ([]string, error) { + var testSlice []string + + err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + name := filepath.Base(path) + + if info.IsDir() || !testFilter.MatchString(name) { + return nil + } + + relPath, err := filepath.Rel(root, path) + if err != nil { + return err + } + testSlice = append(testSlice, relPath) + return nil + }) + if err != nil { + return nil, fmt.Errorf("walking %q: %v", root, err) + } + + return testSlice, nil +} diff --git a/test/runtimes/proctor/proctor_test.go b/test/runtimes/proctor/proctor_test.go new file mode 100644 index 000000000..6ef2de085 --- /dev/null +++ b/test/runtimes/proctor/proctor_test.go @@ -0,0 +1,127 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "io/ioutil" + "os" + "path/filepath" + "reflect" + "regexp" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/test/testutil" +) + +func touch(t *testing.T, name string) { + t.Helper() + f, err := os.Create(name) + if err != nil { + t.Fatalf("error creating file %q: %v", name, err) + } + if err := f.Close(); err != nil { + t.Fatalf("error closing file %q: %v", name, err) + } +} + +func TestSearchEmptyDir(t *testing.T) { + td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest") + if err != nil { + t.Fatalf("error creating searchtest: %v", err) + } + defer os.RemoveAll(td) + + var want []string + + testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`) + got, err := search(td, testFilter) + if err != nil { + t.Errorf("search error: %v", err) + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Found %#v; want %#v", got, want) + } +} + +func TestSearch(t *testing.T) { + td, err := ioutil.TempDir(testutil.TmpDir(), "searchtest") + if err != nil { + t.Fatalf("error creating searchtest: %v", err) + } + defer os.RemoveAll(td) + + // Creating various files similar to the test filter regex. + files := []string{ + "emp/", + "tee/", + "test-foo.tc", + "test-foo.tc", + "test-bar.tc", + "test-sam.tc", + "Test-que.tc", + "test-brett", + "test--abc.tc", + "test---xyz.tc", + "test-bool.TC", + "--test-gvs.tc", + " test-pew.tc", + "dir/test_baz.tc", + "dir/testsnap.tc", + "dir/test-luk.tc", + "dir/nest/test-ok.tc", + "dir/dip/diz/goog/test-pack.tc", + "dir/dip/diz/wobble/thud/test-cas.e", + "dir/dip/diz/wobble/thud/test-cas.tc", + } + want := []string{ + "dir/dip/diz/goog/test-pack.tc", + "dir/dip/diz/wobble/thud/test-cas.tc", + "dir/nest/test-ok.tc", + "dir/test-luk.tc", + "test-bar.tc", + "test-foo.tc", + "test-sam.tc", + } + + for _, item := range files { + if strings.HasSuffix(item, "/") { + // This item is a directory, create it. + if err := os.MkdirAll(filepath.Join(td, item), 0755); err != nil { + t.Fatalf("error making directory: %v", err) + } + } else { + // This item is a file, create the directory and touch file. + // Create directory in which file should be created + fullDirPath := filepath.Join(td, filepath.Dir(item)) + if err := os.MkdirAll(fullDirPath, 0755); err != nil { + t.Fatalf("error making directory: %v", err) + } + // Create file with full path to file. + touch(t, filepath.Join(td, item)) + } + } + + testFilter := regexp.MustCompile(`^test-[^-].+\.tc$`) + got, err := search(td, testFilter) + if err != nil { + t.Errorf("search error: %v", err) + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Found %#v; want %#v", got, want) + } +} diff --git a/test/runtimes/proctor/python.go b/test/runtimes/proctor/python.go new file mode 100644 index 000000000..b9e0fbe6f --- /dev/null +++ b/test/runtimes/proctor/python.go @@ -0,0 +1,49 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + "os/exec" + "strings" +) + +// pythonRunner implements TestRunner for Python. +type pythonRunner struct{} + +var _ TestRunner = pythonRunner{} + +// ListTests implements TestRunner.ListTests. +func (pythonRunner) ListTests() ([]string, error) { + args := []string{"-m", "test", "--list-tests"} + cmd := exec.Command("./python", args...) + cmd.Stderr = os.Stderr + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("failed to list: %v", err) + } + var toolSlice []string + for _, test := range strings.Split(string(out), "\n") { + toolSlice = append(toolSlice, test) + } + return toolSlice, nil +} + +// TestCmd implements TestRunner.TestCmd. +func (pythonRunner) TestCmd(test string) *exec.Cmd { + args := []string{"-m", "test", test} + return exec.Command("./python", args...) +} diff --git a/test/runtimes/runner.go b/test/runtimes/runner.go deleted file mode 100644 index 3c98f4570..000000000 --- a/test/runtimes/runner.go +++ /dev/null @@ -1,196 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Binary runner runs the runtime tests in a Docker container. -package main - -import ( - "encoding/csv" - "flag" - "fmt" - "io" - "os" - "sort" - "strings" - "testing" - "time" - - "gvisor.dev/gvisor/runsc/dockerutil" - "gvisor.dev/gvisor/runsc/testutil" -) - -var ( - lang = flag.String("lang", "", "language runtime to test") - image = flag.String("image", "", "docker image with runtime tests") - blacklistFile = flag.String("blacklist_file", "", "file containing blacklist of tests to exclude, in CSV format with fields: test name, bug id, comment") -) - -// Wait time for each test to run. -const timeout = 5 * time.Minute - -func main() { - flag.Parse() - if *lang == "" || *image == "" { - fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n") - os.Exit(1) - } - - os.Exit(runTests()) -} - -// runTests is a helper that is called by main. It exists so that we can run -// defered functions before exiting. It returns an exit code that should be -// passed to os.Exit. -func runTests() int { - // Get tests to blacklist. - blacklist, err := getBlacklist() - if err != nil { - fmt.Fprintf(os.Stderr, "Error getting blacklist: %s\n", err.Error()) - return 1 - } - - // Create a single docker container that will be used for all tests. - d := dockerutil.MakeDocker("gvisor-" + *lang) - defer d.CleanUp() - - // Get a slice of tests to run. This will also start a single Docker - // container that will be used to run each test. The final test will - // stop the Docker container. - tests, err := getTests(d, blacklist) - if err != nil { - fmt.Fprintf(os.Stderr, "%s\n", err.Error()) - return 1 - } - - m := testing.MainStart(testDeps{}, tests, nil, nil) - return m.Run() -} - -// getTests returns a slice of tests to run, subject to the shard size and -// index. -func getTests(d dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) { - // Pull the image. - if err := dockerutil.Pull(*image); err != nil { - return nil, fmt.Errorf("docker pull %q failed: %v", *image, err) - } - - // Run proctor with --pause flag to keep container alive forever. - if err := d.Run(*image, "--pause"); err != nil { - return nil, fmt.Errorf("docker run failed: %v", err) - } - - // Get a list of all tests in the image. - list, err := d.Exec("/proctor", "--runtime", *lang, "--list") - if err != nil { - return nil, fmt.Errorf("docker exec failed: %v", err) - } - - // Calculate a subset of tests to run corresponding to the current - // shard. - tests := strings.Fields(list) - sort.Strings(tests) - indices, err := testutil.TestIndicesForShard(len(tests)) - if err != nil { - return nil, fmt.Errorf("TestsForShard() failed: %v", err) - } - - var itests []testing.InternalTest - for _, tci := range indices { - // Capture tc in this scope. - tc := tests[tci] - itests = append(itests, testing.InternalTest{ - Name: tc, - F: func(t *testing.T) { - // Is the test blacklisted? - if _, ok := blacklist[tc]; ok { - t.Skipf("SKIP: blacklisted test %q", tc) - } - - var ( - now = time.Now() - done = make(chan struct{}) - output string - err error - ) - - go func() { - fmt.Printf("RUNNING %s...\n", tc) - output, err = d.Exec("/proctor", "--runtime", *lang, "--test", tc) - close(done) - }() - - select { - case <-done: - if err == nil { - fmt.Printf("PASS: %s (%v)\n\n", tc, time.Since(now)) - return - } - t.Errorf("FAIL: %s (%v):\n%s\n", tc, time.Since(now), output) - case <-time.After(timeout): - t.Errorf("TIMEOUT: %s (%v):\n%s\n", tc, time.Since(now), output) - } - }, - }) - } - return itests, nil -} - -// getBlacklist reads the blacklist file and returns a set of test names to -// exclude. -func getBlacklist() (map[string]struct{}, error) { - blacklist := make(map[string]struct{}) - if *blacklistFile == "" { - return blacklist, nil - } - file, err := testutil.FindFile(*blacklistFile) - if err != nil { - return nil, err - } - f, err := os.Open(file) - if err != nil { - return nil, err - } - defer f.Close() - - r := csv.NewReader(f) - - // First line is header. Skip it. - if _, err := r.Read(); err != nil { - return nil, err - } - - for { - record, err := r.Read() - if err == io.EOF { - break - } - if err != nil { - return nil, err - } - blacklist[record[0]] = struct{}{} - } - return blacklist, nil -} - -// testDeps implements testing.testDeps (an unexported interface), and is -// required to use testing.MainStart. -type testDeps struct{} - -func (f testDeps) MatchString(a, b string) (bool, error) { return a == b, nil } -func (f testDeps) StartCPUProfile(io.Writer) error { return nil } -func (f testDeps) StopCPUProfile() {} -func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil } -func (f testDeps) ImportPath() string { return "" } -func (f testDeps) StartTestLog(io.Writer) {} -func (f testDeps) StopTestLog() error { return nil } diff --git a/test/runtimes/runner.sh b/test/runtimes/runner.sh deleted file mode 100755 index a8d9a3460..000000000 --- a/test/runtimes/runner.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash - -# Copyright 2018 The gVisor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -euf -x -o pipefail - -echo -- "$@" - -# Create outputs dir if it does not exist. -if [[ -n "${TEST_UNDECLARED_OUTPUTS_DIR}" ]]; then - mkdir -p "${TEST_UNDECLARED_OUTPUTS_DIR}" - chmod a+rwx "${TEST_UNDECLARED_OUTPUTS_DIR}" -fi - -# Update the timestamp on the shard status file. Bazel looks for this. -touch "${TEST_SHARD_STATUS_FILE}" - -# Get location of runner binary. -readonly runner=$(find "${TEST_SRCDIR}" -name runner) - -# Pass the arguments of this script directly to the runner. -exec "${runner}" "$@" - diff --git a/test/runtimes/runner/BUILD b/test/runtimes/runner/BUILD new file mode 100644 index 000000000..63924b9c5 --- /dev/null +++ b/test/runtimes/runner/BUILD @@ -0,0 +1,21 @@ +load("//tools:defs.bzl", "go_binary", "go_test") + +package(licenses = ["notice"]) + +go_binary( + name = "runner", + testonly = 1, + srcs = ["main.go"], + visibility = ["//test/runtimes:__pkg__"], + deps = [ + "//pkg/test/dockerutil", + "//pkg/test/testutil", + ], +) + +go_test( + name = "blacklist_test", + size = "small", + srcs = ["blacklist_test.go"], + library = ":runner", +) diff --git a/test/runtimes/runner/blacklist_test.go b/test/runtimes/runner/blacklist_test.go new file mode 100644 index 000000000..0ff69ab18 --- /dev/null +++ b/test/runtimes/runner/blacklist_test.go @@ -0,0 +1,37 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "flag" + "os" + "testing" +) + +func TestMain(m *testing.M) { + flag.Parse() + os.Exit(m.Run()) +} + +// Test that the blacklist parses without error. +func TestBlacklists(t *testing.T) { + bl, err := getBlacklist() + if err != nil { + t.Fatalf("error parsing blacklist: %v", err) + } + if *blacklistFile != "" && len(bl) == 0 { + t.Errorf("got empty blacklist for file %q", *blacklistFile) + } +} diff --git a/test/runtimes/runner/main.go b/test/runtimes/runner/main.go new file mode 100644 index 000000000..57540e00e --- /dev/null +++ b/test/runtimes/runner/main.go @@ -0,0 +1,189 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Binary runner runs the runtime tests in a Docker container. +package main + +import ( + "encoding/csv" + "flag" + "fmt" + "io" + "os" + "sort" + "strings" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/test/dockerutil" + "gvisor.dev/gvisor/pkg/test/testutil" +) + +var ( + lang = flag.String("lang", "", "language runtime to test") + image = flag.String("image", "", "docker image with runtime tests") + blacklistFile = flag.String("blacklist_file", "", "file containing blacklist of tests to exclude, in CSV format with fields: test name, bug id, comment") +) + +// Wait time for each test to run. +const timeout = 5 * time.Minute + +func main() { + flag.Parse() + if *lang == "" || *image == "" { + fmt.Fprintf(os.Stderr, "lang and image flags must not be empty\n") + os.Exit(1) + } + os.Exit(runTests()) +} + +// runTests is a helper that is called by main. It exists so that we can run +// defered functions before exiting. It returns an exit code that should be +// passed to os.Exit. +func runTests() int { + // Get tests to blacklist. + blacklist, err := getBlacklist() + if err != nil { + fmt.Fprintf(os.Stderr, "Error getting blacklist: %s\n", err.Error()) + return 1 + } + + // Construct the shared docker instance. + d := dockerutil.MakeDocker(testutil.DefaultLogger(*lang)) + defer d.CleanUp() + + // Get a slice of tests to run. This will also start a single Docker + // container that will be used to run each test. The final test will + // stop the Docker container. + tests, err := getTests(d, blacklist) + if err != nil { + fmt.Fprintf(os.Stderr, "%s\n", err.Error()) + return 1 + } + + m := testing.MainStart(testDeps{}, tests, nil, nil) + return m.Run() +} + +// getTests executes all tests as table tests. +func getTests(d *dockerutil.Docker, blacklist map[string]struct{}) ([]testing.InternalTest, error) { + // Start the container. + d.CopyFiles("/proctor", "test/runtimes/proctor/proctor") + if err := d.Spawn(dockerutil.RunOpts{ + Image: fmt.Sprintf("runtimes/%s", *image), + }, "/proctor/proctor", "--pause"); err != nil { + return nil, fmt.Errorf("docker run failed: %v", err) + } + + // Get a list of all tests in the image. + list, err := d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--list") + if err != nil { + return nil, fmt.Errorf("docker exec failed: %v", err) + } + + // Calculate a subset of tests to run corresponding to the current + // shard. + tests := strings.Fields(list) + sort.Strings(tests) + indices, err := testutil.TestIndicesForShard(len(tests)) + if err != nil { + return nil, fmt.Errorf("TestsForShard() failed: %v", err) + } + + var itests []testing.InternalTest + for _, tci := range indices { + // Capture tc in this scope. + tc := tests[tci] + itests = append(itests, testing.InternalTest{ + Name: tc, + F: func(t *testing.T) { + // Is the test blacklisted? + if _, ok := blacklist[tc]; ok { + t.Skipf("SKIP: blacklisted test %q", tc) + } + + var ( + now = time.Now() + done = make(chan struct{}) + output string + err error + ) + + go func() { + fmt.Printf("RUNNING %s...\n", tc) + output, err = d.Exec(dockerutil.RunOpts{}, "/proctor/proctor", "--runtime", *lang, "--test", tc) + close(done) + }() + + select { + case <-done: + if err == nil { + fmt.Printf("PASS: %s (%v)\n\n", tc, time.Since(now)) + return + } + t.Errorf("FAIL: %s (%v):\n%s\n", tc, time.Since(now), output) + case <-time.After(timeout): + t.Errorf("TIMEOUT: %s (%v):\n%s\n", tc, time.Since(now), output) + } + }, + }) + } + + return itests, nil +} + +// getBlacklist reads the blacklist file and returns a set of test names to +// exclude. +func getBlacklist() (map[string]struct{}, error) { + blacklist := make(map[string]struct{}) + if *blacklistFile == "" { + return blacklist, nil + } + f, err := os.Open(*blacklistFile) + if err != nil { + return nil, err + } + defer f.Close() + + r := csv.NewReader(f) + + // First line is header. Skip it. + if _, err := r.Read(); err != nil { + return nil, err + } + + for { + record, err := r.Read() + if err == io.EOF { + break + } + if err != nil { + return nil, err + } + blacklist[record[0]] = struct{}{} + } + return blacklist, nil +} + +// testDeps implements testing.testDeps (an unexported interface), and is +// required to use testing.MainStart. +type testDeps struct{} + +func (f testDeps) MatchString(a, b string) (bool, error) { return a == b, nil } +func (f testDeps) StartCPUProfile(io.Writer) error { return nil } +func (f testDeps) StopCPUProfile() {} +func (f testDeps) WriteProfileTo(string, io.Writer, int) error { return nil } +func (f testDeps) ImportPath() string { return "" } +func (f testDeps) StartTestLog(io.Writer) {} +func (f testDeps) StopTestLog() error { return nil } diff --git a/tools/bazeldefs/defs.bzl b/tools/bazeldefs/defs.bzl index 2207b9b34..3c22aec24 100644 --- a/tools/bazeldefs/defs.bzl +++ b/tools/bazeldefs/defs.bzl @@ -5,18 +5,14 @@ load("@io_bazel_rules_go//go:def.bzl", "GoLibrary", _go_binary = "go_binary", _g load("@io_bazel_rules_go//proto:def.bzl", _go_grpc_library = "go_grpc_library", _go_proto_library = "go_proto_library") load("@rules_cc//cc:defs.bzl", _cc_binary = "cc_binary", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test") load("@rules_pkg//:pkg.bzl", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar") -load("@io_bazel_rules_docker//go:image.bzl", _go_image = "go_image") -load("@io_bazel_rules_docker//container:container.bzl", _container_image = "container_image") load("@pydeps//:requirements.bzl", _py_requirement = "requirement") load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", _cc_grpc_library = "cc_grpc_library") -container_image = _container_image cc_library = _cc_library cc_flags_supplier = _cc_flags_supplier cc_proto_library = _cc_proto_library cc_test = _cc_test cc_toolchain = "@bazel_tools//tools/cpp:current_cc_toolchain" -go_image = _go_image go_embed_data = _go_embed_data gtest = "@com_google_googletest//:gtest" grpcpp = "@com_github_grpc_grpc//:grpc++" diff --git a/tools/defs.bzl b/tools/defs.bzl index 33240e7f4..cdaf281f3 100644 --- a/tools/defs.bzl +++ b/tools/defs.bzl @@ -7,7 +7,7 @@ change for Google-internal and bazel-compatible rules. load("//tools/go_stateify:defs.bzl", "go_stateify") load("//tools/go_marshal:defs.bzl", "go_marshal", "marshal_deps", "marshal_test_deps") -load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _container_image = "container_image", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_image = "go_image", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system") +load("//tools/bazeldefs:defs.bzl", _cc_binary = "cc_binary", _cc_flags_supplier = "cc_flags_supplier", _cc_grpc_library = "cc_grpc_library", _cc_library = "cc_library", _cc_proto_library = "cc_proto_library", _cc_test = "cc_test", _cc_toolchain = "cc_toolchain", _default_installer = "default_installer", _default_net_util = "default_net_util", _gbenchmark = "gbenchmark", _go_binary = "go_binary", _go_embed_data = "go_embed_data", _go_grpc_and_proto_libraries = "go_grpc_and_proto_libraries", _go_library = "go_library", _go_proto_library = "go_proto_library", _go_test = "go_test", _grpcpp = "grpcpp", _gtest = "gtest", _loopback = "loopback", _pkg_deb = "pkg_deb", _pkg_tar = "pkg_tar", _proto_library = "proto_library", _py_binary = "py_binary", _py_library = "py_library", _py_requirement = "py_requirement", _py_test = "py_test", _select_arch = "select_arch", _select_system = "select_system") load("//tools/bazeldefs:platforms.bzl", _default_platform = "default_platform", _platforms = "platforms") load("//tools/bazeldefs:tags.bzl", "go_suffixes") load("//tools/nogo:defs.bzl", "nogo_test") @@ -19,12 +19,10 @@ cc_grpc_library = _cc_grpc_library cc_library = _cc_library cc_test = _cc_test cc_toolchain = _cc_toolchain -container_image = _container_image default_installer = _default_installer default_net_util = _default_net_util gbenchmark = _gbenchmark go_embed_data = _go_embed_data -go_image = _go_image go_test = _go_test gtest = _gtest grpcpp = _grpcpp -- cgit v1.2.3 From 5042ea7e2cbdc0c04fd454583589a3b1e152f95d Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Apr 2020 15:35:56 -0700 Subject: Add vfs.MkdirOptions.ForSyntheticMountpoint. PiperOrigin-RevId: 308143529 --- pkg/sentry/fsimpl/gofer/directory.go | 147 +++++++++++----- pkg/sentry/fsimpl/gofer/filesystem.go | 323 ++++++++++++++++++++++------------ pkg/sentry/fsimpl/gofer/gofer.go | 177 ++++++++++++------- pkg/sentry/fsimpl/gofer/gofer_test.go | 2 +- pkg/sentry/vfs/filesystem.go | 3 +- pkg/sentry/vfs/options.go | 19 ++ runsc/boot/vfs.go | 6 + 7 files changed, 461 insertions(+), 216 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index d02691232..c67766ab2 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -21,8 +21,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isDir() bool { @@ -41,15 +43,46 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.children[name] = child } -// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != -// InteropModeShared. -func (d *dentry) cacheNegativeChildLocked(name string) { +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) cacheNegativeLookupLocked(name string) { + // Don't cache negative lookups if InteropModeShared is in effect (since + // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // case the only files in the directory are those for which a dentry exists + // in d.children). Instead, just delete any previously-cached dentry. + if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + delete(d.children, name) + return + } if d.children == nil { d.children = make(map[string]*dentry) } d.children[name] = nil } +// createSyntheticDirectory creates a synthetic directory with the given name +// in d. +// +// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain +// a child with the given name. +func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) { + d2 := &dentry{ + refs: 1, // held by d + fs: d.fs, + mode: uint32(mode) | linux.S_IFDIR, + uid: uint32(kuid), + gid: uint32(kgid), + blockSize: usermem.PageSize, // arbitrary + handle: handle{ + fd: -1, + }, + } + d2.pf.dentry = d2 + d2.vfsd.Init(d2) + + d.cacheNewChildLocked(d2, name) + d.syntheticChildren++ +} + type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl @@ -77,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -108,10 +141,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // filesystem.renameMu is needed for d.parent, and must be locked before // dentry.dirMu. d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() if d.dirents != nil { - d.fs.renameMu.RUnlock() return d.dirents, nil } @@ -132,51 +165,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } - d.fs.renameMu.RUnlock() - off := uint64(0) - const count = 64 * 1024 // for consistency with the vfs1 client - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if !d.handleReadable { - // This should not be possible because a readable handle should have - // been opened when the calling directoryFD was opened. - panic("gofer.dentry.getDirents called without a readable handle") - } - for { - p9ds, err := d.handle.file.readdir(ctx, off, count) - if err != nil { - return nil, err + var realChildren map[string]struct{} + if !d.isSynthetic() { + if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + // Record the set of children d actually has so that we don't emit + // duplicate entries for synthetic children. + realChildren = make(map[string]struct{}) } - if len(p9ds) == 0 { - // Cache dirents for future directoryFDs if permitted. - if d.fs.opts.interop != InteropModeShared { - d.dirents = dirents + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + if !d.handleReadable { + // This should not be possible because a readable handle should + // have been opened when the calling directoryFD was opened. + d.handleMu.RUnlock() + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err + } + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } } - return dirents, nil + off = p9ds[len(p9ds)-1].Offset } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { + } + // Emit entries for synthetic children. + if d.syntheticChildren != 0 { + for _, child := range d.children { + if child == nil || !child.isSynthetic() { continue } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: p9d.QID.Path, - NextOff: int64(len(dirents) + 1), - } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if _, ok := realChildren[child.name]; ok { + continue } - dirents = append(dirents, dirent) + dirents = append(dirents, vfs.Dirent{ + Name: child.name, + Type: uint8(atomic.LoadUint32(&child.mode) >> 12), + Ino: child.ino, + NextOff: int64(len(dirents) + 1), + }) } - off = p9ds[len(p9ds)-1].Offset } + // Cache dirents for future directoryFDs if permitted. + if d.cachedMetadataAuthoritative() { + d.dirents = dirents + } + return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index eba4aabe8..98ccb42fd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -29,14 +29,16 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current dentries and special files. + // Snapshot current syncable dentries and special files. fs.syncMu.Lock() - ds := make([]*dentry, 0, len(fs.dentries)) - for d := range fs.dentries { + ds := make([]*dentry, 0, len(fs.syncableDentries)) + for d := range fs.syncableDentries { + d.IncRef() ds = append(ds, d) } sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { + sffd.vfsfd.IncRef() sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -47,9 +49,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - if !d.TryIncRef() { - continue - } err := d.syncSharedHandle(ctx) d.DecRef() if err != nil && retErr == nil { @@ -60,9 +59,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if !sffd.vfsfd.TryIncRef() { - continue - } err := sffd.Sync(ctx) sffd.vfsfd.DecRef() if err != nil && retErr == nil { @@ -114,8 +110,8 @@ func putDentrySlice(ds *[]*dentry) { // to *ds. // // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached -// metadata must be up to date. +// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata +// must be up to date. // // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { @@ -148,7 +144,7 @@ afterSymlink: if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } - if fs.opts.interop == InteropModeShared && d != d.parent { + if d != d.parent && !d.cachedMetadataAuthoritative() { _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err @@ -195,7 +191,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return nil, syserror.ENAMETOOLONG } child, ok := parent.children[name] - if ok && fs.opts.interop != InteropModeShared { + if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() { // Whether child is nil or not, it is cached information that is // assumed to be correct. return child, nil @@ -206,7 +202,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) } -// Preconditions: As for getChildLocked. +// Preconditions: As for getChildLocked. !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { @@ -220,24 +216,41 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir child.updateFromP9Attrs(attrMask, &attr) return child, nil } - // The file at this path has changed or no longer exists. Remove - // the stale dentry from the tree, and re-evaluate its caching - // status (i.e. if it has 0 references, drop it). + if file.isNil() && child.isSynthetic() { + // We have a synthetic file, and no remote file has arisen to + // replace it. + return child, nil + } + // The file at this path has changed or no longer exists. Mark the + // dentry invalidated, and re-evaluate its caching status (i.e. if it + // has 0 references, drop it). Wait to update parent.children until we + // know what to replace the existing dentry with (i.e. one of the + // returns below), to avoid a redundant map access. vfsObj.InvalidateDentry(&child.vfsd) + if child.isSynthetic() { + // Normally we don't mark invalidated dentries as deleted since + // they may still exist (but at a different path), and also for + // consistency with Linux. However, synthetic files are guaranteed + // to become unreachable if their dentries are invalidated, so + // treat their invalidation as deletion. + child.setDeleted() + parent.syntheticChildren-- + child.decRefLocked() + parent.dirents = nil + } *ds = appendDentry(*ds, child) } if file.isNil() { // No file exists at this path now. Cache the negative lookup if // allowed. - if fs.opts.interop != InteropModeShared { - parent.cacheNegativeChildLocked(name) - } + parent.cacheNegativeLookupLocked(name) return nil, nil } // Create a new dentry representing the file. child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) + delete(parent.children, name) return nil, err } parent.cacheNewChildLocked(child, name) @@ -252,8 +265,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == -// InteropModeShared, then d's cached metadata must be up to date. +// Preconditions: fs.renameMu must be locked. !rp.Done(). If +// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to +// date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -275,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // Preconditions: fs.renameMu must be locked. func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { // Get updated metadata for rp.Start() as required by fs.stepLocked(). if err := d.updateFromGetattr(ctx); err != nil { return nil, err @@ -297,16 +311,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } // doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. +// createInRemoteDir (if the parent directory is a real remote directory) or +// createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -340,6 +355,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer mnt.EndWrite() parent.dirMu.Lock() defer parent.dirMu.Unlock() + if parent.isSynthetic() { + if child := parent.children[name]; child != nil { + return syserror.EEXIST + } + if createInSyntheticDir == nil { + return syserror.EPERM + } + if err := createInSyntheticDir(parent, name); err != nil { + return err + } + parent.touchCMtime() + parent.dirents = nil + return nil + } if fs.opts.interop == InteropModeShared { // The existence of a dentry at name would be inconclusive because the // file it represents may have been deleted from the remote filesystem, @@ -348,21 +377,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time // it's used. - return create(parent, name) + return createInRemoteDir(parent, name) } if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := create(parent, name); err != nil { + if err := createInRemoteDir(parent, name); err != nil { return err } + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } parent.touchCMtime() - // Either parent.children[name] doesn't exist (in which case this is a - // no-op) or is nil (in which case this erases the now-stale information - // that the file doesn't exist). - delete(parent.children, name) parent.dirents = nil return nil } @@ -373,7 +402,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -421,8 +450,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). if child != nil { + child.dirMu.Lock() + defer child.dirMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { - if fs.opts.interop != InteropModeShared { + if parent.cachedMetadataAuthoritative() { return err } child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) @@ -437,13 +468,37 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } } flags := uint32(0) + // If a dentry exists, use it for best-effort checks on its deletability. if dir { - if child != nil && !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTDIR + if child != nil { + // child must be an empty directory. + if child.syntheticChildren != 0 { + // This is definitely not an empty directory, irrespective of + // fs.opts.interop. + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + // If InteropModeShared is in effect and the first call to + // PrepareDeleteDentry above succeeded, then child wasn't + // revalidated (so we can't expect its file type to be correct) and + // individually revalidating its children (to confirm that they + // still exist) would be a waste of time. + if child.cachedMetadataAuthoritative() { + if !child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTDIR + } + for _, grandchild := range child.children { + if grandchild != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + } + } } flags = linux.AT_REMOVEDIR } else { + // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR @@ -455,28 +510,36 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return syserror.ENOTDIR } } - err = parent.file.unlinkAt(ctx, name, flags) - if err != nil { - if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - } - return err - } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime() - if dir { - parent.decLinks() + if parent.isSynthetic() { + if child == nil { + return syserror.ENOENT } - parent.cacheNegativeChildLocked(name) - parent.dirents = nil } else { - delete(parent.children, name) + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err + } } if child != nil { - child.setDeleted() vfsObj.CommitDeleteDentry(&child.vfsd) + child.setDeleted() + if child.isSynthetic() { + parent.syntheticChildren-- + child.decRefLocked() + } ds = appendDentry(ds, child) } + parent.cacheNegativeLookupLocked(name) + if parent.cachedMetadataAuthoritative() { + parent.dirents = nil + parent.touchCMtime() + if dir { + parent.decLinks() + } + } return nil } @@ -554,7 +617,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -577,20 +640,32 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } // 9P2000.L supports hard links, but we don't. return syserror.EPERM - }) + }, nil) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) } if fs.opts.interop != InteropModeShared { parent.incLinks() } return nil + }, func(parent *dentry, name string) error { + if !opts.ForSyntheticMountpoint { + // Can't create non-synthetic files in synthetic directories. + return syserror.EPERM + } + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.incLinks() + return nil }) } @@ -600,7 +675,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -620,7 +695,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by fs.stepLocked(). if err := start.updateFromGetattr(ctx); err != nil { return nil, err @@ -643,6 +718,10 @@ afterTrailingSymlink: parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, &ds) if err == syserror.ENOENT && mayCreate { + if parent.isSynthetic() { + parent.dirMu.Unlock() + return nil, syserror.EPERM + } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) parent.dirMu.Unlock() return fd, err @@ -702,8 +781,10 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { - return nil, err + if !d.isSynthetic() { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } } fd := &directoryFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { @@ -733,6 +814,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +// !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -811,7 +893,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving child.refs = 1 // Insert the dentry into the tree. d.cacheNewChildLocked(child, name) - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchCMtime() d.dirents = nil } @@ -888,7 +970,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateFromGetattr(ctx); err != nil { return err } @@ -933,35 +1015,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replaced := newParent.children[newName] - // This is similar to unlinkAt, except: - // - // - If a dentry exists for the file to be replaced, we revalidate it - // unconditionally (instead of only if PrepareRenameDentry fails) for - // simplicity. - // - // - If rp.MustBeDir(), then we need a dentry representing the replaced - // file regardless to confirm that it's a directory. - if replaced != nil || rp.MustBeDir() { - replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) - if err != nil { - return err - } - if replaced != nil { - if replaced.isDir() { - if !renamed.isDir() { - return syserror.EISDIR - } - } else { - if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR - } - } - } + replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) + if err != nil { + return err } var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } } if oldParent == newParent && oldName == newName { @@ -972,27 +1041,47 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { - vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) - return err + + // Update the remote filesystem. + if !renamed.isSynthetic() { + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } else if replaced != nil && !replaced.isSynthetic() { + // We are replacing an existing real file with a synthetic one, so we + // need to unlink the former. + flags := uint32(0) + if replaced.isDir() { + flags = linux.AT_REMOVEDIR + } + if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } } - if fs.opts.interop != InteropModeShared { - oldParent.cacheNegativeChildLocked(oldName) - oldParent.dirents = nil - newParent.dirents = nil - if renamed.isDir() { - oldParent.decLinks() - newParent.incLinks() + + // Update the dentry tree. + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + if replaced != nil { + replaced.setDeleted() + if replaced.isSynthetic() { + newParent.syntheticChildren-- + replaced.decRefLocked() } - oldParent.touchCMtime() - newParent.touchCMtime() - renamed.touchCtime() - } else { - delete(oldParent.children, oldName) + ds = appendDentry(ds, replaced) } + oldParent.cacheNegativeLookupLocked(oldName) + // We don't use newParent.cacheNewChildLocked() since we don't want to mess + // with reference counts and queue oldParent for checkCachingLocked if the + // parent isn't actually changing. if oldParent != newParent { - appendDentry(ds, oldParent) + ds = appendDentry(ds, oldParent) newParent.IncRef() + if renamed.isSynthetic() { + oldParent.syntheticChildren-- + newParent.syntheticChildren++ + } } renamed.parent = newParent renamed.name = newName @@ -1000,11 +1089,25 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParent.children = make(map[string]*dentry) } newParent.children[newName] = renamed - if replaced != nil { - replaced.setDeleted() - appendDentry(ds, replaced) + + // Update metadata. + if renamed.cachedMetadataAuthoritative() { + renamed.touchCtime() + } + if oldParent.cachedMetadataAuthoritative() { + oldParent.dirents = nil + oldParent.touchCMtime() + if renamed.isDir() { + oldParent.decLinks() + } + } + if newParent.cachedMetadataAuthoritative() { + newParent.dirents = nil + newParent.touchCMtime() + if renamed.isDir() { + newParent.incLinks() + } } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) return nil } @@ -1051,6 +1154,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } + // If d is synthetic, invoke statfs on the first ancestor of d that isn't. + for d.isSynthetic() { + d = d.parent + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1080,7 +1187,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 293df2545..8b4e91d17 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -27,8 +27,9 @@ // dentry.handleMu // dentry.dataMu // -// Locking dentry.dirMu in multiple dentries requires holding -// filesystem.renameMu for writing. +// Locking dentry.dirMu in multiple dentries requires that either ancestor +// dentries are locked before descendant dentries, or that filesystem.renameMu +// is locked for writing. package gofer import ( @@ -102,11 +103,12 @@ type filesystem struct { cachedDentries dentryList cachedDentriesLen uint64 - // dentries contains all dentries in this filesystem. specialFileFDs - // contains all open specialFileFDs. These fields are protected by syncMu. - syncMu sync.Mutex - dentries map[*dentry]struct{} - specialFileFDs map[*specialFileFD]struct{} + // syncableDentries contains all dentries in this filesystem for which + // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. + // These fields are protected by syncMu. + syncMu sync.Mutex + syncableDentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} } type filesystemOptions struct { @@ -187,7 +189,8 @@ const ( // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // - // - The client must verify cached filesystem state before using it. + // - The client must verify ("revalidate") cached filesystem state before + // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. @@ -376,14 +379,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Construct the filesystem object. fs := &filesystem{ - mfp: mfp, - opts: fsopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, - client: client, - clock: ktime.RealtimeClockFromContext(ctx), - dentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + clock: ktime.RealtimeClockFromContext(ctx), + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), } fs.vfsfs.Init(vfsObj, &fstype, fs) @@ -409,7 +412,7 @@ func (fs *filesystem) Release() { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() - for d := range fs.dentries { + for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() if d.handleWritable { @@ -444,9 +447,11 @@ type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its - // parent, even if disowned. refs is accessed using atomic memory - // operations. When refs reaches 0, the dentry may be added to the cache or - // destroyed. If refs==-1 the dentry has already been destroyed. + // parent, even if disowned. An additional reference is held on all + // synthetic dentries until they are unlinked or invalidated. When refs + // reaches 0, the dentry may be added to the cache or destroyed. If refs == + // -1, the dentry has already been destroyed. refs is accessed using atomic + // memory operations. refs int64 // fs is the owning filesystem. fs is immutable. @@ -465,6 +470,12 @@ type dentry struct { // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. + // + // If file.isNil(), this dentry represents a synthetic file, i.e. a file + // that does not exist on the remote filesystem. As of this writing, this + // is only possible for a directory created with + // MkdirOptions.ForSyntheticMountpoint == true. + // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes). file p9file // If deleted is non-zero, the file represented by this dentry has been @@ -484,15 +495,21 @@ type dentry struct { // - Mappings of child filenames to dentries representing those children. // // - Mappings of child filenames that are known not to exist to nil - // dentries (only if InteropModeShared is not in effect). + // dentries (only if InteropModeShared is not in effect and the directory + // is not synthetic). // // children is protected by dirMu. children map[string]*dentry - // If this dentry represents a directory, InteropModeShared is not in - // effect, and dirents is not nil, it is a cache of all entries in the - // directory, in the order they were returned by the server. dirents is - // protected by dirMu. + // If this dentry represents a directory, syntheticChildren is the number + // of child dentries for which dentry.isSynthetic() == true. + // syntheticChildren is protected by dirMu. + syntheticChildren int + + // If this dentry represents a directory, + // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it + // is a cache of all entries in the directory, in the order they were + // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent // Cached metadata; protected by metadataMu and accessed using atomic @@ -589,6 +606,8 @@ func dentryAttrMask() p9.AttrMask { // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. +// +// Preconditions: !file.isNil(). func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") @@ -612,10 +631,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma }, } d.pf.dentry = d - if mask.UID { + if mask.UID && attr.UID != auth.NoID { d.uid = uint32(attr.UID) } - if mask.GID { + if mask.GID && attr.GID != auth.NoID { d.gid = uint32(attr.GID) } if mask.Size { @@ -642,11 +661,19 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma d.vfsd.Init(d) fs.syncMu.Lock() - fs.dentries[d] = struct{}{} + fs.syncableDentries[d] = struct{}{} fs.syncMu.Unlock() return d, nil } +func (d *dentry) isSynthetic() bool { + return d.file.isNil() +} + +func (d *dentry) cachedMetadataAuthoritative() bool { + return d.fs.opts.interop != InteropModeShared || d.isSynthetic() +} + // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { @@ -691,6 +718,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { d.metadataMu.Unlock() } +// Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { // Use d.handle.file, which represents a 9P fid that has been opened, in // preference to d.file, which represents a 9P fid that has not. This may @@ -758,7 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin defer mnt.EndWrite() setLocalAtime := false setLocalMtime := false - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { // Timestamp updates will be handled locally. setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 @@ -771,35 +799,37 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask != 0 { - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - return err + if !d.isSynthetic() { + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since + // it'll be overwritten by revalidation before the next time it's + // used anyway. (InteropModeShared inhibits client caching of + // regular file data, so there's no cache to truncate either.) + return nil } - } - if d.fs.opts.interop == InteropModeShared { - // There's no point to updating d's metadata in this case since it'll - // be overwritten by revalidation before the next time it's used - // anyway. (InteropModeShared inhibits client caching of regular file - // data, so there's no cache to truncate either.) - return nil } now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { @@ -897,6 +927,15 @@ func (d *dentry) DecRef() { } } +// decRefLocked decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefLocked() { + if refs := atomic.AddInt64(&d.refs, -1); refs < 0 { + panic("gofer.dentry.decRefLocked() called without holding a reference") + } +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // @@ -1013,11 +1052,11 @@ func (d *dentry) destroyLocked() { if !d.file.isNil() { d.file.close(ctx) d.file = p9file{} + // Remove d from the set of syncable dentries. + d.fs.syncMu.Lock() + delete(d.fs.syncableDentries, d) + d.fs.syncMu.Unlock() } - // Remove d from the set of all dentries. - d.fs.syncMu.Lock() - delete(d.fs.dentries, d) - d.fs.syncMu.Unlock() // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if d.parent != nil { @@ -1040,6 +1079,9 @@ func (d *dentry) setDeleted() { // We only support xattrs prefixed with "user." (see b/148380782). Currently, // there is no need to expose any other xattrs through a gofer. func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + if d.file.isNil() { + return nil, nil + } xattrMap, err := d.file.listXattr(ctx, size) if err != nil { return nil, err @@ -1054,6 +1096,9 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui } func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if d.file.isNil() { + return "", syserror.ENODATA + } if err := d.checkPermissions(creds, vfs.MayRead); err != nil { return "", err } @@ -1064,6 +1109,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if d.file.isNil() { + return syserror.EPERM + } if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { return err } @@ -1074,6 +1122,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if d.file.isNil() { + return syserror.EPERM + } if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { return err } @@ -1083,7 +1134,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name return d.file.removeXattr(ctx, name) } -// Preconditions: d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1213,7 +1264,7 @@ func (fd *fileDescription) dentry() *dentry { func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) - if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 4041fb252..adff39490 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -24,7 +24,7 @@ import ( func TestDestroyIdempotent(t *testing.T) { fs := filesystem{ - dentries: make(map[*dentry]struct{}), + syncableDentries: make(map[*dentry]struct{}), opts: filesystemOptions{ // Test relies on no dentry being held in the cache. maxCachedDentries: 0, diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 74577bc2f..20e5bb072 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -443,8 +443,7 @@ type FilesystemImpl interface { // Errors: // // - If extended attributes are not supported by the filesystem, - // ListxattrAt returns nil. (See FileDescription.Listxattr for an - // explanation.) + // ListxattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned. Note that diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 534528ce6..022bac127 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -33,6 +33,25 @@ type GetDentryOptions struct { type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode + + // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create + // the given directory in memory only (as opposed to persistent storage). + // The created directory should be able to support the creation of + // subdirectories with ForSyntheticMountpoint == true. It does not need to + // support the creation of subdirectories with ForSyntheticMountpoint == + // false, or files of other types. + // + // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint + // option. + // + // The ForSyntheticMountpoint option exists because, unlike mount(2), the + // OCI Runtime Specification permits the specification of mount points that + // do not exist, under the expectation that container runtimes will create + // them. (More accurately, the OCI Runtime Specification completely fails + // to document this feature, but it's implemented by runc.) + // ForSyntheticMountpoint allows such mount points to be created even when + // the underlying persistent filesystem is immutable. + ForSyntheticMountpoint bool } // MknodOptions contains options to VirtualFilesystem.MknodAt() and diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 82083c57d..bce3a3593 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -251,6 +251,12 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay + if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{ + ForSyntheticMountpoint: true, + }); err != nil && err != syserror.EEXIST { + // Log a warning, but attempt the mount anyway. + log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err) + } if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } -- cgit v1.2.3 From 696feaf10c9339a57d177a913e847ddb488ece69 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 23 Apr 2020 17:32:59 -0700 Subject: Port devpts to VFS2. PiperOrigin-RevId: 308164359 --- pkg/sentry/fs/tty/line_discipline.go | 4 + pkg/sentry/fs/tty/master.go | 4 + pkg/sentry/fs/tty/queue.go | 4 + pkg/sentry/fs/tty/slave.go | 4 + pkg/sentry/fs/tty/terminal.go | 4 + pkg/sentry/fsimpl/devpts/BUILD | 43 +++ pkg/sentry/fsimpl/devpts/devpts.go | 207 +++++++++++++ pkg/sentry/fsimpl/devpts/devpts_test.go | 56 ++++ pkg/sentry/fsimpl/devpts/line_discipline.go | 449 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/devpts/master.go | 226 ++++++++++++++ pkg/sentry/fsimpl/devpts/queue.go | 240 +++++++++++++++ pkg/sentry/fsimpl/devpts/slave.go | 186 ++++++++++++ pkg/sentry/fsimpl/devpts/terminal.go | 124 ++++++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 13 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 9 +- 16 files changed, 1570 insertions(+), 5 deletions(-) create mode 100644 pkg/sentry/fsimpl/devpts/BUILD create mode 100644 pkg/sentry/fsimpl/devpts/devpts.go create mode 100644 pkg/sentry/fsimpl/devpts/devpts_test.go create mode 100644 pkg/sentry/fsimpl/devpts/line_discipline.go create mode 100644 pkg/sentry/fsimpl/devpts/master.go create mode 100644 pkg/sentry/fsimpl/devpts/queue.go create mode 100644 pkg/sentry/fsimpl/devpts/slave.go create mode 100644 pkg/sentry/fsimpl/devpts/terminal.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go index 12b1c6097..2e9dd2d55 100644 --- a/pkg/sentry/fs/tty/line_discipline.go +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -27,6 +27,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + const ( // canonMaxBytes is the number of bytes that fit into a single line of // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE @@ -443,3 +445,5 @@ func (l *lineDiscipline) peek(b []byte) int { } return size } + +// LINT.ThenChange(../../fsimpl/devpts/line_discipline.go) diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index f62da49bd..fe07fa929 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -26,6 +26,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // masterInodeOperations are the fs.InodeOperations for the master end of the // Terminal (ptmx file). // @@ -232,3 +234,5 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { unimpl.EmitUnimplementedEvent(ctx) } } + +// LINT.ThenChange(../../fsimpl/devpts/master.go) diff --git a/pkg/sentry/fs/tty/queue.go b/pkg/sentry/fs/tty/queue.go index 1ca79c0b2..ceabb9b1e 100644 --- a/pkg/sentry/fs/tty/queue.go +++ b/pkg/sentry/fs/tty/queue.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 @@ -234,3 +236,5 @@ func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) } + +// LINT.ThenChange(../../fsimpl/devpts/queue.go) diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 6a2dbc576..9871f6fc6 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -25,6 +25,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // slaveInodeOperations are the fs.InodeOperations for the slave end of the // Terminal (pts file). // @@ -172,3 +174,5 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem return 0, syserror.ENOTTY } } + +// LINT.ThenChange(../../fsimpl/devpts/slave.go) diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go index 5883f26db..ddcccf4da 100644 --- a/pkg/sentry/fs/tty/terminal.go +++ b/pkg/sentry/fs/tty/terminal.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// LINT.IfChange + // Terminal is a pseudoterminal. // // +stateify savable @@ -126,3 +128,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { } return tm.slaveKTTY } + +// LINT.ThenChange(../../fsimpl/devpts/terminal.go) diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD new file mode 100644 index 000000000..585764223 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -0,0 +1,43 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "devpts", + srcs = [ + "devpts.go", + "line_discipline.go", + "master.go", + "queue.go", + "slave.go", + "terminal.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/safemem", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "devpts_test", + size = "small", + srcs = ["devpts_test.go"], + library = ":devpts", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go new file mode 100644 index 000000000..07a69b940 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -0,0 +1,207 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package devpts provides a filesystem implementation that behaves like +// devpts. +package devpts + +import ( + "fmt" + "math" + "sort" + "strconv" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the filesystem name. +const Name = "devpts" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +var _ vfs.FilesystemType = (*FilesystemType)(nil) + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + // No data allowed. + if opts.Data != "" { + return nil, nil, syserror.EINVAL + } + + fs, root := fstype.newFilesystem(vfsObj, creds) + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// newFilesystem creates a new devpts filesystem with root directory and ptmx +// master inode. It returns the filesystem and root Dentry. +func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { + fs := &kernfs.Filesystem{} + fs.Init(vfsObj, fstype) + + // Construct the root directory. This is always inode id 1. + root := &rootInode{ + slaves: make(map[uint32]*slaveInode), + } + root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555) + root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + root.dentry.Init(root) + + // Construct the pts master inode and dentry. Linux always uses inode + // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. + master := &masterInode{ + root: root, + } + master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666) + master.dentry.Init(master) + + // Add the master as a child of the root. + links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{ + "ptmx": &master.dentry, + }) + root.IncLinks(links) + + return fs, &root.dentry +} + +// rootInode is the root directory inode for the devpts mounts. +type rootInode struct { + kernfs.AlwaysValid + kernfs.InodeAttrs + kernfs.InodeDirectoryNoNewChildren + kernfs.InodeNotSymlink + kernfs.OrderedChildren + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // master is the master pty inode. Immutable. + master *masterInode + + // root is the root directory inode for this filesystem. Immutable. + root *rootInode + + // mu protects the fields below. + mu sync.Mutex + + // slaves maps pty ids to slave inodes. + slaves map[uint32]*slaveInode + + // nextIdx is the next pty index to use. Must be accessed atomically. + // + // TODO(b/29356795): reuse indices when ptys are closed. + nextIdx uint32 +} + +var _ kernfs.Inode = (*rootInode)(nil) + +// allocateTerminal creates a new Terminal and installs a pts node for it. +func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) { + i.mu.Lock() + defer i.mu.Unlock() + if i.nextIdx == math.MaxUint32 { + return nil, syserror.ENOMEM + } + idx := i.nextIdx + i.nextIdx++ + + // Sanity check that slave with idx does not exist. + if _, ok := i.slaves[idx]; ok { + panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) + } + + // Create the new terminal and slave. + t := newTerminal(idx) + slave := &slaveInode{ + root: i, + t: t, + } + // Linux always uses pty index + 3 as the inode id. See + // fs/devpts/inode.c:devpts_pty_new(). + slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.dentry.Init(slave) + i.slaves[idx] = slave + + return t, nil +} + +// masterClose is called when the master end of t is closed. +func (i *rootInode) masterClose(t *Terminal) { + i.mu.Lock() + defer i.mu.Unlock() + + // Sanity check that slave with idx exists. + if _, ok := i.slaves[t.n]; !ok { + panic(fmt.Sprintf("pty with index %d does not exist", t.n)) + } + delete(i.slaves, t.n) +} + +// Open implements kernfs.Inode.Open. +func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &kernfs.GenericDirectoryFD{} + fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + return fd.VFSFileDescription(), nil +} + +// Lookup implements kernfs.Inode.Lookup. +func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) { + idx, err := strconv.ParseUint(name, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + i.mu.Lock() + defer i.mu.Unlock() + if si, ok := i.slaves[uint32(idx)]; ok { + si.dentry.IncRef() + return si.dentry.VFSDentry(), nil + + } + return nil, syserror.ENOENT +} + +// IterDirents implements kernfs.Inode.IterDirents. +func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { + i.mu.Lock() + defer i.mu.Unlock() + ids := make([]int, 0, len(i.slaves)) + for id := range i.slaves { + ids = append(ids, int(id)) + } + sort.Ints(ids) + for _, id := range ids[relOffset:] { + dirent := vfs.Dirent{ + Name: strconv.FormatUint(uint64(id), 10), + Type: linux.DT_CHR, + Ino: i.slaves[uint32(id)].InodeAttrs.Ino(), + NextOff: offset + 1, + } + if err := cb.Handle(dirent); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go new file mode 100644 index 000000000..b7c149047 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/devpts_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" +) + +func TestSimpleMasterToSlave(t *testing.T) { + ld := newLineDiscipline(linux.DefaultSlaveTermios) + ctx := contexttest.Context(t) + inBytes := []byte("hello, tty\n") + src := usermem.BytesIOSequence(inBytes) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + + // Write to the input queue. + nw, err := ld.inputQueueWrite(ctx, src) + if err != nil { + t.Fatalf("error writing to input queue: %v", err) + } + if nw != int64(len(inBytes)) { + t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes)) + } + + // Read from the input queue. + nr, err := ld.inputQueueRead(ctx, dst) + if err != nil { + t.Fatalf("error reading from input queue: %v", err) + } + if nr != int64(len(inBytes)) { + t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes)) + } + + outStr := string(outBytes[:nr]) + inStr := string(inBytes) + if outStr != inStr { + t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr) + } +} diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go new file mode 100644 index 000000000..e201801d6 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -0,0 +1,449 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "bytes" + "unicode/utf8" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +const ( + // canonMaxBytes is the number of bytes that fit into a single line of + // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE + // in include/linux/tty.h. + canonMaxBytes = 4096 + + // nonCanonMaxBytes is the maximum number of bytes that can be read at + // a time in noncanonical mode. + nonCanonMaxBytes = canonMaxBytes - 1 + + spacesPerTab = 8 +) + +// lineDiscipline dictates how input and output are handled between the +// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man +// pages are good resources for how to affect the line discipline: +// +// * termios(3) +// * tty_ioctl(4) +// +// This file corresponds most closely to drivers/tty/n_tty.c. +// +// lineDiscipline has a simple structure but supports a multitude of options +// (see the above man pages). It consists of two queues of bytes: one from the +// terminal master to slave (the input queue) and one from slave to master (the +// output queue). When bytes are written to one end of the pty, the line +// discipline reads the bytes, modifies them or takes special action if +// required, and enqueues them to be read by the other end of the pty: +// +// input from terminal +-------------+ input to process (e.g. bash) +// +------------------------>| input queue |---------------------------+ +// | (inputQueueWrite) +-------------+ (inputQueueRead) | +// | | +// | v +// masterFD slaveFD +// ^ | +// | | +// | output to terminal +--------------+ output from process | +// +------------------------| output queue |<--------------------------+ +// (outputQueueRead) +--------------+ (outputQueueWrite) +// +// Lock order: +// termiosMu +// inQueue.mu +// outQueue.mu +// +// +stateify savable +type lineDiscipline struct { + // sizeMu protects size. + sizeMu sync.Mutex `state:"nosave"` + + // size is the terminal size (width and height). + size linux.WindowSize + + // inQueue is the input queue of the terminal. + inQueue queue + + // outQueue is the output queue of the terminal. + outQueue queue + + // termiosMu protects termios. + termiosMu sync.RWMutex `state:"nosave"` + + // termios is the terminal configuration used by the lineDiscipline. + termios linux.KernelTermios + + // column is the location in a row of the cursor. This is important for + // handling certain special characters like backspace. + column int + + // masterWaiter is used to wait on the master end of the TTY. + masterWaiter waiter.Queue `state:"zerovalue"` + + // slaveWaiter is used to wait on the slave end of the TTY. + slaveWaiter waiter.Queue `state:"zerovalue"` +} + +func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline { + ld := lineDiscipline{termios: termios} + ld.inQueue.transformer = &inputQueueTransformer{} + ld.outQueue.transformer = &outputQueueTransformer{} + return &ld +} + +// getTermios gets the linux.Termios for the tty. +func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + // We must copy a Termios struct, not KernelTermios. + t := l.termios.ToTermios() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// setTermios sets a linux.Termios for the tty. +func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + oldCanonEnabled := l.termios.LEnabled(linux.ICANON) + // We must copy a Termios struct, not KernelTermios. + var t linux.Termios + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + l.termios.FromTermios(t) + + // If canonical mode is turned off, move bytes from inQueue's wait + // buffer to its read buffer. Anything already in the read buffer is + // now readable. + if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { + l.inQueue.mu.Lock() + l.inQueue.pushWaitBufLocked(l) + l.inQueue.readable = true + l.inQueue.mu.Unlock() + l.slaveWaiter.Notify(waiter.EventIn) + } + + return 0, err +} + +func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + l.sizeMu.Lock() + defer l.sizeMu.Unlock() + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err +} + +func (l *lineDiscipline) masterReadiness() waiter.EventMask { + // We don't have to lock a termios because the default master termios + // is immutable. + return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) +} + +func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) +} + +func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.inQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.inQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventOut) + if pushed { + l.slaveWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, err := l.inQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + return l.outQueue.readableSize(ctx, io, args) +} + +func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, pushed, err := l.outQueue.read(ctx, dst, l) + if err != nil { + return 0, err + } + if n > 0 { + l.slaveWaiter.Notify(waiter.EventOut) + if pushed { + l.masterWaiter.Notify(waiter.EventIn) + } + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.termiosMu.RLock() + defer l.termiosMu.RUnlock() + n, err := l.outQueue.write(ctx, src, l) + if err != nil { + return 0, err + } + if n > 0 { + l.masterWaiter.Notify(waiter.EventIn) + return n, nil + } + return 0, syserror.ErrWouldBlock +} + +// transformer is a helper interface to make it easier to stateify queue. +type transformer interface { + // transform functions require queue's mutex to be held. + transform(*lineDiscipline, *queue, []byte) int +} + +// outputQueueTransformer implements transformer. It performs line discipline +// transformations on the output queue. +// +// +stateify savable +type outputQueueTransformer struct{} + +// transform does output processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analogous kernel function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // transformOutput is effectively always in noncanonical mode, as the + // master termios never has ICANON set. + + if !l.termios.OEnabled(linux.OPOST) { + q.readBuf = append(q.readBuf, buf...) + if len(q.readBuf) > 0 { + q.readable = true + } + return len(buf) + } + + var ret int + for len(buf) > 0 { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + ret += size + buf = buf[size:] + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\n': + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + if l.termios.OEnabled(linux.ONLCR) { + q.readBuf = append(q.readBuf, '\r', '\n') + continue + } + case '\r': + if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { + continue + } + if l.termios.OEnabled(linux.OCRNL) { + cBytes[0] = '\n' + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + break + } + l.column = 0 + case '\t': + spaces := spacesPerTab - l.column%spacesPerTab + if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { + l.column += spaces + q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) + continue + } + l.column += spaces + case '\b': + if l.column > 0 { + l.column-- + } + default: + l.column++ + } + q.readBuf = append(q.readBuf, cBytes...) + } + if len(q.readBuf) > 0 { + q.readable = true + } + return ret +} + +// inputQueueTransformer implements transformer. It performs line discipline +// transformations on the input queue. +// +// +stateify savable +type inputQueueTransformer struct{} + +// transform does input processing for one end of the pty. Characters read are +// transformed according to flags set in the termios struct. See +// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel +// function. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int { + // If there's a line waiting to be read in canonical mode, don't write + // anything else to the read buffer. + if l.termios.LEnabled(linux.ICANON) && q.readable { + return 0 + } + + maxBytes := nonCanonMaxBytes + if l.termios.LEnabled(linux.ICANON) { + maxBytes = canonMaxBytes + } + + var ret int + for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { + size := l.peek(buf) + cBytes := append([]byte{}, buf[:size]...) + // We're guaranteed that cBytes has at least one element. + switch cBytes[0] { + case '\r': + if l.termios.IEnabled(linux.IGNCR) { + buf = buf[size:] + ret += size + continue + } + if l.termios.IEnabled(linux.ICRNL) { + cBytes[0] = '\n' + } + case '\n': + if l.termios.IEnabled(linux.INLCR) { + cBytes[0] = '\r' + } + } + + // In canonical mode, we discard non-terminating characters + // after the first 4095. + if l.shouldDiscard(q, cBytes) { + buf = buf[size:] + ret += size + continue + } + + // Stop if the buffer would be overfilled. + if len(q.readBuf)+size > maxBytes { + break + } + buf = buf[size:] + ret += size + + // If we get EOF, make the buffer available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { + q.readable = true + break + } + + q.readBuf = append(q.readBuf, cBytes...) + + // Anything written to the readBuf will have to be echoed. + if l.termios.LEnabled(linux.ECHO) { + l.outQueue.writeBytes(cBytes, l) + l.masterWaiter.Notify(waiter.EventIn) + } + + // If we finish a line, make it available for reading. + if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { + q.readable = true + break + } + } + + // In noncanonical mode, everything is readable. + if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { + q.readable = true + } + + return ret +} + +// shouldDiscard returns whether c should be discarded. In canonical mode, if +// too many bytes are enqueued, we keep reading input and discarding it until +// we find a terminating character. Signal/echo processing still occurs. +// +// Precondition: +// * l.termiosMu must be held for reading. +// * q.mu must be held. +func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool { + return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes) +} + +// peek returns the size in bytes of the next character to process. As long as +// b isn't empty, peek returns a value of at least 1. +func (l *lineDiscipline) peek(b []byte) int { + size := 1 + // If UTF-8 support is enabled, runes might be multiple bytes. + if l.termios.IEnabled(linux.IUTF8) { + _, size = utf8.DecodeRune(b) + } + return size +} + +// LINT.ThenChange(../../fs/tty/line_discipline.go) diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go new file mode 100644 index 000000000..60340c28e --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -0,0 +1,226 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/unimpl" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// masterInode is the inode for the master end of the Terminal. +type masterInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode +} + +var _ kernfs.Inode = (*masterInode)(nil) + +// Open implements kernfs.Inode.Open. +func (mi *masterInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + t, err := mi.root.allocateTerminal(rp.Credentials()) + if err != nil { + return nil, err + } + + mi.IncRef() + fd := &masterFileDescription{ + inode: mi, + t: t, + } + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + mi.DecRef() + return nil, err + } + return &fd.vfsfd, nil +} + +// Stat implements kernfs.Inode.Stat. +func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := mi.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.TTYAUX_MAJOR + statx.RdevMinor = linux.PTMX_MINOR + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type masterFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + inode *masterInode + t *Terminal +} + +var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) + +// Release implements vfs.FileDescriptionImpl.Release. +func (mfd *masterFileDescription) Release() { + mfd.inode.root.masterClose(mfd.t) + mfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + mfd.t.ld.masterWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) { + mfd.t.ld.masterWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return mfd.t.ld.masterReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return mfd.t.ld.outputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return mfd.t.ld.inputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the output queue read buffer. + return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args) + case linux.TCGETS: + // N.B. TCGETS on the master actually returns the configuration + // of the slave end. + return mfd.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + // N.B. TCSETS on the master actually affects the configuration + // of the slave end. + return mfd.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return mfd.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCSPTLCK: + // TODO(b/29356795): Implement pty locking. For now just pretend we do. + return 0, nil + case linux.TIOCGWINSZ: + return 0, mfd.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, mfd.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() + return mfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() + return mfd.inode.Stat(fs, opts) +} + +// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. +func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { + switch cmd { + case linux.TCGETS, + linux.TCSETS, + linux.TCSETSW, + linux.TCSETSF, + linux.TIOCGWINSZ, + linux.TIOCSWINSZ, + linux.TIOCSETD, + linux.TIOCSBRK, + linux.TIOCCBRK, + linux.TCSBRK, + linux.TCSBRKP, + linux.TIOCSTI, + linux.TIOCCONS, + linux.FIONBIO, + linux.TIOCEXCL, + linux.TIOCNXCL, + linux.TIOCGEXCL, + linux.TIOCGSID, + linux.TIOCGETD, + linux.TIOCVHANGUP, + linux.TIOCGDEV, + linux.TIOCMGET, + linux.TIOCMSET, + linux.TIOCMBIC, + linux.TIOCMBIS, + linux.TIOCGICOUNT, + linux.TCFLSH, + linux.TIOCSSERIAL, + linux.TIOCGPTPEER: + + unimpl.EmitUnimplementedEvent(ctx) + } +} + +// LINT.ThenChange(../../fs/tty/master.go) diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go new file mode 100644 index 000000000..29a6be858 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -0,0 +1,240 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// waitBufMaxBytes is the maximum size of a wait buffer. It is based on +// TTYB_DEFAULT_MEM_LIMIT. +const waitBufMaxBytes = 131072 + +// queue represents one of the input or output queues between a pty master and +// slave. Bytes written to a queue are added to the read buffer until it is +// full, at which point they are written to the wait buffer. Bytes are +// processed (i.e. undergo termios transformations) as they are added to the +// read buffer. The read buffer is readable when its length is nonzero and +// readable is true. +// +// +stateify savable +type queue struct { + // mu protects everything in queue. + mu sync.Mutex `state:"nosave"` + + // readBuf is buffer of data ready to be read when readable is true. + // This data has been processed. + readBuf []byte + + // waitBuf contains data that can't fit into readBuf. It is put here + // until it can be loaded into the read buffer. waitBuf contains data + // that hasn't been processed. + waitBuf [][]byte + waitBufLen uint64 + + // readable indicates whether the read buffer can be read from. In + // canonical mode, there can be an unterminated line in the read buffer, + // so readable must be checked. + readable bool + + // transform is the the queue's function for transforming bytes + // entering the queue. For example, transform might convert all '\r's + // entering the queue to '\n's. + transformer +} + +// readReadiness returns whether q is ready to be read from. +func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if len(q.readBuf) > 0 && q.readable { + return waiter.EventIn + } + return waiter.EventMask(0) +} + +// writeReadiness returns whether q is ready to be written to. +func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { + q.mu.Lock() + defer q.mu.Unlock() + if q.waitBufLen < waitBufMaxBytes { + return waiter.EventOut + } + return waiter.EventMask(0) +} + +// readableSize writes the number of readable bytes to userspace. +func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error { + q.mu.Lock() + defer q.mu.Unlock() + var size int32 + if q.readable { + size = int32(len(q.readBuf)) + } + + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return err + +} + +// read reads from q to userspace. It returns the number of bytes read as well +// as whether the read caused more readable data to become available (whether +// data was pushed from the wait buffer to the read buffer). +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { + q.mu.Lock() + defer q.mu.Unlock() + + if !q.readable { + return 0, false, syserror.ErrWouldBlock + } + + if dst.NumBytes() > canonMaxBytes { + dst = dst.TakeFirst(canonMaxBytes) + } + + n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { + src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.readBuf = q.readBuf[n:] + + // If we read everything, this queue is no longer readable. + if len(q.readBuf) == 0 { + q.readable = false + } + + return n, nil + })) + if err != nil { + return 0, false, err + } + + // Move data from the queue's wait buffer to its read buffer. + nPushed := q.pushWaitBufLocked(l) + + return int64(n), nPushed > 0, nil +} + +// write writes to q from userspace. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) { + q.mu.Lock() + defer q.mu.Unlock() + + // Copy data into the wait buffer. + n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { + copyLen := src.NumBytes() + room := waitBufMaxBytes - q.waitBufLen + // If out of room, return EAGAIN. + if room == 0 && copyLen > 0 { + return 0, syserror.ErrWouldBlock + } + // Cap the size of the wait buffer. + if copyLen > room { + copyLen = room + src = src.TakeFirst64(room) + } + buf := make([]byte, copyLen) + + // Copy the data into the wait buffer. + dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) + n, err := safemem.CopySeq(dst, src) + if err != nil { + return 0, err + } + q.waitBufAppend(buf) + + return n, nil + })) + if err != nil { + return 0, err + } + + // Push data from the wait to the read buffer. + q.pushWaitBufLocked(l) + + return n, nil +} + +// writeBytes writes to q from b. +// +// Preconditions: +// * l.termiosMu must be held for reading. +func (q *queue) writeBytes(b []byte, l *lineDiscipline) { + q.mu.Lock() + defer q.mu.Unlock() + + // Write to the wait buffer. + q.waitBufAppend(b) + q.pushWaitBufLocked(l) +} + +// pushWaitBufLocked fills the queue's read buffer with data from the wait +// buffer. +// +// Preconditions: +// * l.termiosMu must be held for reading. +// * q.mu must be locked. +func (q *queue) pushWaitBufLocked(l *lineDiscipline) int { + if q.waitBufLen == 0 { + return 0 + } + + // Move data from the wait to the read buffer. + var total int + var i int + for i = 0; i < len(q.waitBuf); i++ { + n := q.transform(l, q, q.waitBuf[i]) + total += n + if n != len(q.waitBuf[i]) { + // The read buffer filled up without consuming the + // entire buffer. + q.waitBuf[i] = q.waitBuf[i][n:] + break + } + } + + // Update wait buffer based on consumed data. + q.waitBuf = q.waitBuf[i:] + q.waitBufLen -= uint64(total) + + return total +} + +// Precondition: q.mu must be locked. +func (q *queue) waitBufAppend(b []byte) { + q.waitBuf = append(q.waitBuf, b) + q.waitBufLen += uint64(len(b)) +} + +// LINT.ThenChange(../../fs/tty/queue.go) diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go new file mode 100644 index 000000000..e7e50d51e --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -0,0 +1,186 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// LINT.IfChange + +// slaveInode is the inode for the slave end of the Terminal. +type slaveInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink + + // Keep a reference to this inode's dentry. + dentry kernfs.Dentry + + // root is the devpts root inode. + root *rootInode + + // t is the connected Terminal. + t *Terminal +} + +var _ kernfs.Inode = (*slaveInode)(nil) + +// Open implements kernfs.Inode.Open. +func (si *slaveInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + si.IncRef() + fd := &slaveFileDescription{ + inode: si, + } + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + si.DecRef() + return nil, err + } + return &fd.vfsfd, nil + +} + +// Valid implements kernfs.Inode.Valid. +func (si *slaveInode) Valid(context.Context) bool { + // Return valid if the slave still exists. + si.root.mu.Lock() + defer si.root.mu.Unlock() + _, ok := si.root.slaves[si.t.n] + return ok +} + +// Stat implements kernfs.Inode.Stat. +func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := si.InodeAttrs.Stat(vfsfs, opts) + if err != nil { + return linux.Statx{}, err + } + statx.Blksize = 1024 + statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR + statx.RdevMinor = si.t.n + return statx, nil +} + +// SetStat implements kernfs.Inode.SetStat +func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { + if opts.Stat.Mask&linux.STATX_SIZE != 0 { + return syserror.EINVAL + } + return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) +} + +type slaveFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + + inode *slaveInode +} + +var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) + +// Release implements fs.FileOperations.Release. +func (sfd *slaveFileDescription) Release() { + sfd.inode.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) { + sfd.inode.t.ld.slaveWaiter.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return sfd.inode.t.ld.slaveReadiness() +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + return sfd.inode.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + return sfd.inode.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements vfs.FileDescripionImpl.Ioctl. +func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch cmd := args[1].Uint(); cmd { + case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ + // Get the number of bytes in the input queue read buffer. + return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args) + case linux.TCGETS: + return sfd.inode.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO(b/29356795): This should drain the output queue first. + return sfd.inode.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCGWINSZ: + return 0, sfd.inode.t.ld.windowSize(ctx, io, args) + case linux.TIOCSWINSZ: + return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args) + case linux.TIOCSCTTY: + // Make the given terminal the controlling terminal of the + // calling process. + return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCNOTTY: + // Release this process's controlling terminal. + return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */) + case linux.TIOCGPGRP: + // Get the foreground process group. + return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */) + case linux.TIOCSPGRP: + // Set the foreground process group. + return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */) + default: + maybeEmitUnimplementedEvent(ctx, cmd) + return 0, syserror.ENOTTY + } +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + creds := auth.CredentialsFromContext(ctx) + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.SetStat(ctx, fs, creds, opts) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() + return sfd.inode.Stat(fs, opts) +} + +// LINT.ThenChange(../../fs/tty/slave.go) diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go new file mode 100644 index 000000000..b44e673d8 --- /dev/null +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -0,0 +1,124 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package devpts + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/usermem" +) + +// LINT.IfChanges + +// Terminal is a pseudoterminal. +// +// +stateify savable +type Terminal struct { + // n is the terminal index. It is immutable. + n uint32 + + // ld is the line discipline of the terminal. It is immutable. + ld *lineDiscipline + + // masterKTTY contains the controlling process of the master end of + // this terminal. This field is immutable. + masterKTTY *kernel.TTY + + // slaveKTTY contains the controlling process of the slave end of this + // terminal. This field is immutable. + slaveKTTY *kernel.TTY +} + +func newTerminal(n uint32) *Terminal { + termios := linux.DefaultSlaveTermios + t := Terminal{ + n: n, + ld: newLineDiscipline(termios), + masterKTTY: &kernel.TTY{Index: n}, + slaveKTTY: &kernel.TTY{Index: n}, + } + return &t +} + +// setControllingTTY makes tm the controlling terminal of the calling thread +// group. +func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setControllingTTY must be called from a task context") + } + + return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int()) +} + +// releaseControllingTTY removes tm as the controlling terminal of the calling +// thread group. +func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("releaseControllingTTY must be called from a task context") + } + + return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster)) +} + +// foregroundProcessGroup gets the process group ID of tm's foreground process. +func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("foregroundProcessGroup must be called from a task context") + } + + ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster)) + if err != nil { + return 0, err + } + + // Write it out to *arg. + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// foregroundProcessGroup sets tm's foreground process. +func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) { + task := kernel.TaskFromContext(ctx) + if task == nil { + panic("setForegroundProcessGroup must be called from a task context") + } + + // Read in the process group ID. + var pgid int32 + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + + ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid)) + return uintptr(ret), err +} + +func (tm *Terminal) tty(isMaster bool) *kernel.TTY { + if isMaster { + return tm.masterKTTY + } + return tm.slaveKTTY +} + +// LINT.ThenChange(../../fs/tty/terminal.go) diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 64f1b142c..142ee53b0 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -163,16 +163,25 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v func (a *Accessor) UserspaceInit(ctx context.Context) error { actx := a.wrapContext(ctx) - // systemd: src/shared/dev-setup.c:dev_setup() + // Initialize symlinks. for _, symlink := range []struct { source string target string }{ - // /proc/kcore is not implemented. + // systemd: src/shared/dev-setup.c:dev_setup() {source: "fd", target: "/proc/self/fd"}, {source: "stdin", target: "/proc/self/fd/0"}, {source: "stdout", target: "/proc/self/fd/1"}, {source: "stderr", target: "/proc/self/fd/2"}, + // /proc/kcore is not implemented. + + // Linux implements /dev/ptmx as a device node, but advises + // container implementations to create /dev/ptmx as a symlink + // to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd + // follows this advice (src/nspawn/nspawn.c:setup_pts()), while + // LXC tries to create a bind mount and falls back to a symlink + // (src/lxc/conf.c:lxc_setup_devpts()). + {source: "ptmx", target: "pts/ptmx"}, } { if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil { return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3164d022c..1d46dba25 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -391,7 +391,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // O_NOFOLLOW have no effect here (they're handled by VFS by setting // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). - opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 9f526359e..a946645f6 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -216,6 +216,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo atomic.StoreUint32(&a.nlink, nlink) } +// Ino returns the inode id. +func (a *InodeAttrs) Ino() uint64 { + return atomic.LoadUint64(&a.ino) +} + // Mode implements Inode.Mode. func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) @@ -359,8 +364,8 @@ func (o *OrderedChildren) Destroy() { // cache. Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // -// Precondition: d.Impl() must be a kernfs Dentry. d must represent a directory -// inode. children must not contain any conflicting entries already in o. +// Precondition: d must represent a directory inode. children must not contain +// any conflicting entries already in o. func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 { var links uint32 for name, child := range children { -- cgit v1.2.3 From 40a712c57cd78c51c9875ae04b5e795113c75e62 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 08:19:11 -0700 Subject: Refactor syscall.Fstat calls in hostfs. Just call syscall.Fstat directly each time mode/file owner are needed. This feels more natural than using i.getPermissions(). PiperOrigin-RevId: 308257405 --- pkg/sentry/fsimpl/host/host.go | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index ae94cfa6e..7847e3cc2 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -169,31 +169,22 @@ func fileFlagsFromHostFD(fd int) (int, error) { // CheckPermissions implements kernfs.Inode. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { - mode, uid, gid, err := i.getPermissions() - if err != nil { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { return err } - return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid) + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid)) } // Mode implements kernfs.Inode. func (i *inode) Mode() linux.FileMode { - mode, _, _, err := i.getPermissions() - // Retrieving the mode from the host fd using fstat(2) should not fail. - // If the syscall does not succeed, something is fundamentally wrong. - if err != nil { - panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) - } - return linux.FileMode(mode) -} - -func (i *inode) getPermissions() (linux.FileMode, auth.KUID, auth.KGID, error) { - // Retrieve metadata. var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { - return 0, 0, 0, err + // Retrieving the mode from the host fd using fstat(2) should not fail. + // If the syscall does not succeed, something is fundamentally wrong. + panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err)) } - return linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid), nil + return linux.FileMode(s.Mode) } // Stat implements kernfs.Inode. @@ -326,11 +317,11 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 { return syserror.EPERM } - mode, uid, gid, err := i.getPermissions() - if err != nil { + var hostStat syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { return err } - if err := vfs.CheckSetStat(ctx, creds, &s, mode.Permissions(), uid, gid); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { return err } @@ -374,11 +365,11 @@ func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptio } func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { - mode, _, _, err := i.getPermissions() - if err != nil { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err } - fileType := mode.FileType() + fileType := s.Mode & linux.FileTypeMask if fileType == syscall.S_IFSOCK { if i.isTTY { return nil, errors.New("cannot use host socket as TTY") -- cgit v1.2.3 From 1b88c63b3e6b330c8399bf92f148cc80374bee18 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 10:02:22 -0700 Subject: Move hostfs mount to Kernel struct. This is needed to set up host fds passed through a Unix socket. Note that the host package depends on kernel, so we cannot set up the hostfs mount directly in Kernel.Init as we do for sockfs and pipefs. Also, adjust sockfs to make its setup look more like hostfs's and pipefs's. PiperOrigin-RevId: 308274053 --- pkg/sentry/fsimpl/host/host.go | 16 +++++++-------- pkg/sentry/fsimpl/sockfs/sockfs.go | 26 ++++++++++------------- pkg/sentry/kernel/kernel.go | 42 ++++++++++++++++++++++++++++---------- runsc/boot/fds.go | 7 +------ runsc/boot/loader.go | 13 ++++++++++++ 5 files changed, 64 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7847e3cc2..a26b13067 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -42,7 +42,7 @@ type filesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - panic("cannot instaniate a host filesystem") + panic("host.filesystemType.GetFilesystem should never be called") } // Name implements FilesystemType.Name. @@ -55,14 +55,14 @@ type filesystem struct { kernfs.Filesystem } -// NewMount returns a new disconnected mount in vfsObj that may be passed to ImportFD. -func NewMount(vfsObj *vfs.VirtualFilesystem) (*vfs.Mount, error) { +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { fs := &filesystem{} - fs.Init(vfsObj, &filesystemType{}) - vfsfs := fs.VFSFilesystem() - // NewDisconnectedMount will take an additional reference on vfsfs. - defer vfsfs.DecRef() - return vfsObj.NewDisconnectedMount(vfsfs, nil, &vfs.MountOptions{}) + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 3f7ad1d65..632cfde88 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -24,26 +24,12 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -// NewFilesystem creates a new sockfs filesystem. -// -// Note that there should only ever be one instance of sockfs.Filesystem, -// backing a global socket mount. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs, _, err := filesystemType{}.GetFilesystem(nil, vfsObj, nil, "", vfs.GetFilesystemOptions{}) - if err != nil { - panic("failed to create sockfs filesystem") - } - return fs -} - // filesystemType implements vfs.FilesystemType. type filesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - fs := &filesystem{} - fs.Init(vfsObj, fsType) - return fs.VFSFilesystem(), nil, nil + panic("sockfs.filesystemType.GetFilesystem should never be called") } // Name implements FilesystemType.Name. @@ -60,6 +46,16 @@ type filesystem struct { kernfs.Filesystem } +// NewFilesystem sets up and returns a new sockfs filesystem. +// +// Note that there should only ever be one instance of sockfs.Filesystem, +// backing a global socket mount. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.Init(vfsObj, filesystemType{}) + return fs.VFSFilesystem() +} + // inode implements kernfs.Inode. // // TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index fef60e636..c91b9dce2 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -227,11 +227,6 @@ type Kernel struct { // by extMu. nextSocketEntry uint64 - // socketMount is a disconnected vfs.Mount, not included in k.vfs, - // representing a sockfs.filesystem. socketMount is used to back - // VirtualDentries representing anonymous sockets. - socketMount *vfs.Mount - // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` @@ -255,10 +250,22 @@ type Kernel struct { // VFS keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem + // hostMount is the Mount used for file descriptors that were imported + // from the host. + hostMount *vfs.Mount + // pipeMount is the Mount used for pipes created by the pipe() and pipe2() // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and + // socketpair() syscalls. There are several cases where a socket dentry will + // not be contained in socketMount: + // 1. Socket files created by mknod() + // 2. Socket fds imported from the host (Kernel.hostMount is used for these) + // 3. Socket files created by binding Unix sockets to a file path + socketMount *vfs.Mount + // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool @@ -377,7 +384,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { defer socketFilesystem.DecRef() socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { - return fmt.Errorf("failed to initialize socket mount: %v", err) + return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount } @@ -1526,11 +1533,6 @@ func (k *Kernel) ListSockets() []*SocketEntry { return socks } -// SocketMount returns the global socket mount. -func (k *Kernel) SocketMount() *vfs.Mount { - return k.socketMount -} - // supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper @@ -1629,7 +1631,25 @@ func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } +// SetHostMount sets the hostfs mount. +func (k *Kernel) SetHostMount(mnt *vfs.Mount) { + if k.hostMount != nil { + panic("Kernel.hostMount cannot be set more than once") + } + k.hostMount = mnt +} + +// HostMount returns the hostfs mount. +func (k *Kernel) HostMount() *vfs.Mount { + return k.hostMount +} + // PipeMount returns the pipefs mount. func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } + +// SocketMount returns the sockfs mount. +func (k *Kernel) SocketMount() *vfs.Mount { + return k.socketMount +} diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 7e49f6f9f..0cbd63857 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -89,14 +89,9 @@ func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kern fdTable := k.NewFDTable() defer fdTable.DecRef() - hostMount, err := vfshost.NewMount(k.VFS()) - if err != nil { - return nil, fmt.Errorf("creating host mount: %w", err) - } - for appFD, hostFD := range stdioFDs { // TODO(gvisor.dev/issue/1482): Add TTY support. - appFile, err := vfshost.ImportFD(hostMount, hostFD, false) + appFile, err := vfshost.ImportFD(k.HostMount(), hostFD, false) if err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 096b0e9f0..3f41d8357 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -36,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" + vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -46,6 +47,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2" "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/usage" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/watchdog" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" @@ -329,6 +331,17 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("creating pod mount hints: %v", err) } + if kernel.VFS2Enabled { + // Set up host mount that will be used for imported fds. + hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + defer hostFilesystem.DecRef() + hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs mount: %v", err) + } + k.SetHostMount(hostMount) + } + // Make host FDs stable between invocations. Host FDs must map to the exact // same number when the sandbox is restored. Otherwise the wrong FD will be // used. -- cgit v1.2.3 From 2cc0fd42f462f3942230c4b33ca2825e2a28765d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Fri, 24 Apr 2020 11:43:49 -0700 Subject: Fixes for procfs - Return ENOENT for /proc/[pid]/task if task is zoombied or terminated - Allow directory to be Seek() to the end - Construct synthetic files for /proc/[pid]/ns/* - Changed GenericDirectoryFD.Init to not register with FileDescription, otherwise other implementation cannot change behavior. Updates #1195,1193 PiperOrigin-RevId: 308294649 --- pkg/sentry/fsimpl/devpts/devpts.go | 6 ++- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 28 +++++++++-- pkg/sentry/fsimpl/kernfs/filesystem.go | 4 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 8 +-- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 10 ++-- pkg/sentry/fsimpl/kernfs/symlink.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 49 ++++++++++++++++++- pkg/sentry/fsimpl/proc/task.go | 7 ++- pkg/sentry/fsimpl/proc/task_fds.go | 14 ++++-- pkg/sentry/fsimpl/proc/task_files.go | 76 +++++++++++++++++++++++++++-- pkg/sentry/fsimpl/proc/tasks.go | 6 ++- pkg/sentry/fsimpl/proc/tasks_files.go | 4 +- pkg/sentry/fsimpl/sys/sys.go | 4 +- pkg/sentry/vfs/vfs.go | 8 +++ 15 files changed, 192 insertions(+), 36 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 07a69b940..f36bf50fc 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -161,8 +161,10 @@ func (i *rootInode) masterClose(t *Terminal) { // Open implements kernfs.Inode.Open. func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index e8a4670b8..dd5806301 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -15,6 +15,8 @@ package kernfs import ( + "math" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -43,15 +45,27 @@ type GenericDirectoryFD struct { off int64 } -// Init initializes a GenericDirectoryFD. -func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) error { +// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its +// dentry. +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { + fd := &GenericDirectoryFD{} + if err := fd.Init(children, opts); err != nil { + return nil, err + } + if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return fd, nil +} + +// Init initializes a GenericDirectoryFD. Use it when overriding +// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the +// correct implementation. +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } - if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { - return err - } fd.children = children return nil } @@ -187,6 +201,10 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int // Use offset as given. case linux.SEEK_CUR: offset += fd.off + case linux.SEEK_END: + // TODO(gvisor.dev/issue/1193): This can prevent new files from showing up + // if they are added after SEEK_END. + offset = math.MaxInt64 default: return 0, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 1d46dba25..3ccd92fc5 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -82,7 +82,7 @@ afterSymlink: } // Resolve any symlink at current path component. if rp.ShouldFollowSymlink() && next.isSymlink() { - targetVD, targetPathname, err := next.inode.Getlink(ctx) + targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err } @@ -477,7 +477,7 @@ afterTrailingSymlink: } child := childVFSD.Impl().(*Dentry) if rp.ShouldFollowSymlink() && child.isSymlink() { - targetVD, targetPathname, err := child.inode.Getlink(ctx) + targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index a946645f6..02f35a675 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -182,7 +182,7 @@ func (InodeNotSymlink) Readlink(context.Context) (string, error) { } // Getlink implements Inode.Getlink. -func (InodeNotSymlink) Getlink(context.Context) (vfs.VirtualDentry, string, error) { +func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { return vfs.VirtualDentry{}, "", syserror.EINVAL } @@ -568,8 +568,10 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F // Open implements kernfs.Inode. func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &s.OrderedChildren, &opts) + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index f5041824f..95cf6dc24 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -439,5 +439,5 @@ type inodeSymlink interface { // // - If the inode is not a symlink, Getlink returns (zero-value // VirtualDentry, "", EINVAL). - Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) + Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 465451f35..0964d5456 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -117,8 +117,8 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod } func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { return nil, err } return fd.VFSFileDescription(), nil @@ -147,8 +147,10 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 018aa503c..0aa6dc979 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -56,7 +56,7 @@ func (s *StaticSymlink) Readlink(_ context.Context) (string, error) { } // Getlink implements Inode.Getlink. -func (s *StaticSymlink) Getlink(_ context.Context) (vfs.VirtualDentry, string, error) { +func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { return vfs.VirtualDentry{}, s.target, nil } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index a21313666..28ec2484a 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -88,6 +88,9 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb if len(tasks) == 0 { return offset, syserror.ENOENT } + if relOffset >= int64(len(tasks)) { + return offset, nil + } tids := make([]int, 0, len(tasks)) for _, tid := range tasks { @@ -110,10 +113,52 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb return offset, nil } +type subtasksFD struct { + kernfs.GenericDirectoryFD + + task *kernel.Task +} + +func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.IterDirents(ctx, cb) +} + +// Seek implements vfs.FileDecriptionImpl.Seek. +func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return 0, syserror.ENOENT + } + return fd.GenericDirectoryFD.Seek(ctx, offset, whence) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return linux.Statx{}, syserror.ENOENT + } + return fd.GenericDirectoryFD.Stat(ctx, opts) +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if fd.task.ExitState() >= kernel.TaskExitZombie { + return syserror.ENOENT + } + return fd.GenericDirectoryFD.SetStat(ctx, opts) +} + // Open implements kernfs.Inode. func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd := &subtasksFD{task: i.task} + if err := fd.Init(&i.OrderedChildren, &opts); err != nil { + return nil, err + } + if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 888afc0fd..e2790d35b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -44,6 +44,7 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { + // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. contents := map[string]*kernfs.Dentry{ "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), @@ -102,8 +103,10 @@ func (i *taskInode) Valid(ctx context.Context) bool { // Open implements kernfs.Inode. func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 046265eca..a7622f1b6 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -143,8 +143,10 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro // Open implements kernfs.Inode. func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } @@ -207,7 +209,7 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } -func (s *fdSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { +func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT @@ -268,8 +270,10 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, // Open implements kernfs.Inode. func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index f3173e197..410cc3552 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -622,7 +622,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { } // Getlink implements kernfs.Inode.Getlink. -func (s *exeSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { +func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { if !kernel.ContextCanTrace(ctx, s.task, false) { return vfs.VirtualDentry{}, "", syserror.EACCES } @@ -754,9 +754,79 @@ func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) { } // Getlink implements Inode.Getlink. -func (s *namespaceSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { +func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { if err := checkTaskState(s.task); err != nil { return vfs.VirtualDentry{}, "", err } - return s.StaticSymlink.Getlink(ctx) + + // Create a synthetic inode to represent the namespace. + dentry := &kernfs.Dentry{} + dentry.Init(&namespaceInode{}) + vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) + vd.IncRef() + dentry.DecRef() + return vd, "", nil +} + +// namespaceInode is a synthetic inode created to represent a namespace in +// /proc/[pid]/ns/*. +type namespaceInode struct { + kernfs.InodeAttrs + kernfs.InodeNoopRefCount + kernfs.InodeNotDirectory + kernfs.InodeNotSymlink +} + +var _ kernfs.Inode = (*namespaceInode)(nil) + +// Init initializes a namespace inode. +func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { + if perm&^linux.PermissionsMask != 0 { + panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) + } + i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) +} + +// Open implements Inode.Open. +func (i *namespaceInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &namespaceFD{inode: i} + i.IncRef() + if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// namespace FD is a synthetic file that represents a namespace in +// /proc/[pid]/ns/*. +type namespaceFD struct { + vfs.FileDescriptionDefaultImpl + + vfsfd vfs.FileDescription + inode *namespaceInode +} + +var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) + +// Stat implements FileDescriptionImpl. +func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + return fd.inode.Stat(vfs, opts) +} + +// SetStat implements FileDescriptionImpl. +func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() + creds := auth.CredentialsFromContext(ctx) + return fd.inode.SetStat(ctx, vfs, creds, opts) +} + +// Release implements FileDescriptionImpl. +func (fd *namespaceFD) Release() { + fd.inode.DecRef() +} + +// OnClose implements FileDescriptionImpl. +func (*namespaceFD) OnClose(context.Context) error { + return nil } diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 9f2ef8200..26518ed03 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -202,8 +202,10 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback // Open implements kernfs.Inode. func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - fd.Init(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + if err != nil { + return nil, err + } return fd.VFSFileDescription(), nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 4621e2de0..92007df81 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -63,7 +63,7 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) { return strconv.FormatUint(uint64(tgid), 10), nil } -func (s *selfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { +func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx) return vfs.VirtualDentry{}, target, err } @@ -106,7 +106,7 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) { return fmt.Sprintf("%d/task/%d", tgid, tid), nil } -func (s *threadSelfSymlink) Getlink(ctx context.Context) (vfs.VirtualDentry, string, error) { +func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx) return vfs.VirtualDentry{}, target, err } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 5c617270e..34e8e0cbe 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -106,8 +106,8 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set // Open implements kernfs.Inode.Open. func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd := &kernfs.GenericDirectoryFD{} - if err := fd.Init(rp.Mount(), vfsd, &d.OrderedChildren, &opts); err != nil { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + if err != nil { return nil, err } return fd.VFSFileDescription(), nil diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index cb5bbd781..9015f2cc1 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -797,6 +797,14 @@ type VirtualDentry struct { dentry *Dentry } +// MakeVirtualDentry creates a VirtualDentry. +func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { + return VirtualDentry{ + mount: mount, + dentry: dentry, + } +} + // Ok returns true if vd is not empty. It does not require that a reference is // held. func (vd VirtualDentry) Ok() bool { -- cgit v1.2.3 From 632b104aff3fedf7798447eedc5662c973525c66 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 12:36:14 -0700 Subject: Plumb context.Context into kernfs.Inode.Open(). PiperOrigin-RevId: 308304793 --- pkg/sentry/fsimpl/devpts/devpts.go | 2 +- pkg/sentry/fsimpl/devpts/master.go | 2 +- pkg/sentry/fsimpl/devpts/slave.go | 2 +- pkg/sentry/fsimpl/host/host.go | 10 +++++----- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 8 ++++---- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 4 ++-- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 4 ++-- pkg/sentry/fsimpl/pipefs/pipefs.go | 5 ++--- pkg/sentry/fsimpl/proc/subtasks.go | 2 +- pkg/sentry/fsimpl/proc/task.go | 2 +- pkg/sentry/fsimpl/proc/task_fds.go | 4 ++-- pkg/sentry/fsimpl/proc/task_files.go | 2 +- pkg/sentry/fsimpl/proc/tasks.go | 2 +- pkg/sentry/fsimpl/sockfs/sockfs.go | 2 +- pkg/sentry/fsimpl/sys/sys.go | 2 +- runsc/boot/fds.go | 2 +- 18 files changed, 29 insertions(+), 30 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index f36bf50fc..181d765d3 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -160,7 +160,7 @@ func (i *rootInode) masterClose(t *Terminal) { } // Open implements kernfs.Inode.Open. -func (i *rootInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 60340c28e..04a292927 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -46,7 +46,7 @@ type masterInode struct { var _ kernfs.Inode = (*masterInode)(nil) // Open implements kernfs.Inode.Open. -func (mi *masterInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { t, err := mi.root.allocateTerminal(rp.Credentials()) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index e7e50d51e..0a98dc896 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -48,7 +48,7 @@ type slaveInode struct { var _ kernfs.Inode = (*slaveInode)(nil) // Open implements kernfs.Inode.Open. -func (si *slaveInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { si.IncRef() fd := &slaveFileDescription{ inode: si, diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index a26b13067..1e53b5c1b 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -66,7 +66,7 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { } // ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) @@ -108,7 +108,7 @@ func ImportFD(mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, err // i.open will take a reference on d. defer d.DecRef() - return i.open(d.VFSDentry(), mnt) + return i.open(ctx, d.VFSDentry(), mnt) } // inode implements kernfs.Inode. @@ -360,11 +360,11 @@ func (i *inode) Destroy() { } // Open implements kernfs.Inode. -func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.open(vfsd, rp.Mount()) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.open(ctx, vfsd, rp.Mount()) } -func (i *inode) open(d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index d8bddbafa..c7779fc11 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -53,7 +53,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.Dy } // Open implements Inode.Open. -func (f *DynamicBytesFile) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 3ccd92fc5..9e8d80414 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -406,7 +406,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts) + return inode.Open(ctx, rp, vfsd, opts) } // May create new file. @@ -425,7 +425,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return inode.Open(rp, vfsd, opts) + return inode.Open(ctx, rp, vfsd, opts) } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) @@ -466,7 +466,7 @@ afterTrailingSymlink: } child := childVFSD.Impl().(*Dentry) parentVFSD.Impl().(*Dentry).InsertChild(pc, child) - return child.inode.Open(rp, childVFSD, opts) + return child.inode.Open(ctx, rp, childVFSD, opts) } if err != nil { return nil, err @@ -499,7 +499,7 @@ afterTrailingSymlink: if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } - return child.inode.Open(rp, &child.vfsd, opts) + return child.inode.Open(ctx, rp, &child.vfsd, opts) } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 02f35a675..615592d5f 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -525,7 +525,7 @@ type InodeSymlink struct { } // Open implements Inode.Open. -func (InodeSymlink) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ELOOP } @@ -567,7 +567,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.F } // Open implements kernfs.Inode. -func (s *StaticDirectory) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 95cf6dc24..732837933 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -308,7 +308,7 @@ type Inode interface { // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. - Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) + Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 0964d5456..a9f671bc8 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -116,7 +116,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod return &dir.dentry } -func (d *readonlyDir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) if err != nil { return nil, err @@ -146,7 +146,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte return &dir.dentry } -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index faf3179bc..d6bd67467 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -129,9 +129,8 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. } // Open implements kernfs.Inode.Open. -func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - // FIXME(b/38173783): kernfs does not plumb Context here. - return i.pipe.Open(context.Background(), rp.Mount(), vfsd, opts.Flags) +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 28ec2484a..a5cfa8333 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -151,7 +151,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro } // Open implements kernfs.Inode. -func (i *subtasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} if err := fd.Init(&i.OrderedChildren, &opts); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index e2790d35b..66419d91b 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -102,7 +102,7 @@ func (i *taskInode) Valid(ctx context.Context) bool { } // Open implements kernfs.Inode. -func (i *taskInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index a7622f1b6..8ad976073 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -142,7 +142,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro } // Open implements kernfs.Inode. -func (i *fdDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) if err != nil { return nil, err @@ -269,7 +269,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, } // Open implements kernfs.Inode. -func (i *fdInfoDirInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 410cc3552..515f25327 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -788,7 +788,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.Fi } // Open implements Inode.Open. -func (i *namespaceInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 26518ed03..5aeda8c9b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -201,7 +201,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback } // Open implements kernfs.Inode. -func (i *tasksInode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 632cfde88..5ce50625b 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -69,7 +69,7 @@ type inode struct { } // Open implements kernfs.Inode.Open. -func (i *inode) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, syserror.ENXIO } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 34e8e0cbe..f8d25d35e 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -105,7 +105,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set } // Open implements kernfs.Inode.Open. -func (d *dir) Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) if err != nil { return nil, err diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go index 0cbd63857..7e7a31fbd 100644 --- a/runsc/boot/fds.go +++ b/runsc/boot/fds.go @@ -91,7 +91,7 @@ func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kern for appFD, hostFD := range stdioFDs { // TODO(gvisor.dev/issue/1482): Add TTY support. - appFile, err := vfshost.ImportFD(k.HostMount(), hostFD, false) + appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false) if err != nil { return nil, err } -- cgit v1.2.3 From f13f26d17da56d585fd9857a81175bbd0be8ce60 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 24 Apr 2020 13:45:31 -0700 Subject: Port SCM Rights to VFS2. Fixes #1477. PiperOrigin-RevId: 308317511 --- pkg/sentry/fs/host/control.go | 6 +- pkg/sentry/fsimpl/host/BUILD | 3 + pkg/sentry/fsimpl/host/control.go | 96 ++++++++++++++++++++++ pkg/sentry/socket/control/BUILD | 6 +- pkg/sentry/socket/control/control.go | 24 +++++- pkg/sentry/socket/control/control_vfs2.go | 131 ++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/socket.go | 2 +- 7 files changed, 260 insertions(+), 8 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/control.go create mode 100644 pkg/sentry/socket/control/control_vfs2.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 52c0504b6..39299b7e4 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -23,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) +// LINT.IfChange + type scmRights struct { fds []int } @@ -32,8 +34,6 @@ func newSCMRights(fds []int) control.SCMRights { } // Files implements control.SCMRights.Files. -// -// TODO(gvisor.dev/issue/2017): Port to VFS2. func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFiles, bool) { n := max var trunc bool @@ -93,3 +93,5 @@ func fdsToFiles(ctx context.Context, fds []int) []*fs.File { } return files } + +// LINT.ThenChange(../../fsimpl/host/control.go) diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 44dd9f672..2dcb03a73 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -5,6 +5,7 @@ licenses(["notice"]) go_library( name = "host", srcs = [ + "control.go", "host.go", "ioctl_unsafe.go", "tty.go", @@ -23,6 +24,8 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", "//pkg/sync", diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go new file mode 100644 index 000000000..b9082a20f --- /dev/null +++ b/pkg/sentry/fsimpl/host/control.go @@ -0,0 +1,96 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRightsVFS2 { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(c.fds); n > l { + n = l + } else if n < l { + trunc = true + } + + rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (c *scmRights) Clone() transport.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements transport.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription { + files := make([]*vfs.FileDescription, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { + ctx.Warningf("Error setting flags on host FD file: %v", err) + break + } + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/socket/control/BUILD b/pkg/sentry/socket/control/BUILD index 4d42d29cb..ca16d0381 100644 --- a/pkg/sentry/socket/control/BUILD +++ b/pkg/sentry/socket/control/BUILD @@ -4,7 +4,10 @@ package(licenses = ["notice"]) go_library( name = "control", - srcs = ["control.go"], + srcs = [ + "control.go", + "control_vfs2.go", + ], imports = [ "gvisor.dev/gvisor/pkg/sentry/fs", ], @@ -18,6 +21,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/socket", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/tcpip", "//pkg/usermem", diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8834a1e1a..8b439a078 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -41,6 +41,8 @@ type SCMCredentials interface { Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) } +// LINT.IfChange + // SCMRights represents a SCM_RIGHTS socket control message. type SCMRights interface { transport.RightsControlMessage @@ -142,6 +144,8 @@ func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flag return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } +// LINT.ThenChange(./control_vfs2.go) + // scmCredentials represents an SCM_CREDENTIALS socket control message. // // +stateify savable @@ -537,11 +541,19 @@ func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte) (socket.Con } if len(fds) > 0 { - rights, err := NewSCMRights(t, fds) - if err != nil { - return socket.ControlMessages{}, err + if kernel.VFS2Enabled { + rights, err := NewSCMRightsVFS2(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights + } else { + rights, err := NewSCMRights(t, fds) + if err != nil { + return socket.ControlMessages{}, err + } + cmsgs.Unix.Rights = rights } - cmsgs.Unix.Rights = rights } return cmsgs, nil @@ -566,6 +578,8 @@ func MakeCreds(t *kernel.Task) SCMCredentials { return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } +// LINT.IfChange + // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ @@ -573,3 +587,5 @@ func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transpo Rights: rights, } } + +// LINT.ThenChange(./control_vfs2.go) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go new file mode 100644 index 000000000..fd08179be --- /dev/null +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -0,0 +1,131 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package control + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// SCMRightsVFS2 represents a SCM_RIGHTS socket control message. +type SCMRightsVFS2 interface { + transport.RightsControlMessage + + // Files returns up to max RightsFiles. + // + // Returned files are consumed and ownership is transferred to the caller. + // Subsequent calls to Files will return the next files. + Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool) +} + +// RightsFiles represents a SCM_RIGHTS socket control message. A reference is +// maintained for each vfs.FileDescription and is release either when an FD is created or +// when the Release method is called. +type RightsFilesVFS2 []*vfs.FileDescription + +// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message +// representation using local sentry FDs. +func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { + files := make(RightsFilesVFS2, 0, len(fds)) + for _, fd := range fds { + file := t.GetFileVFS2(fd) + if file == nil { + files.Release() + return nil, syserror.EBADF + } + files = append(files, file) + } + return &files, nil +} + +// Files implements SCMRights.Files. +func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) { + n := max + var trunc bool + if l := len(*fs); n > l { + n = l + } else if n < l { + trunc = true + } + rf := (*fs)[:n] + *fs = (*fs)[n:] + return rf, trunc +} + +// Clone implements transport.RightsControlMessage.Clone. +func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { + nfs := append(RightsFilesVFS2(nil), *fs...) + for _, nf := range nfs { + nf.IncRef() + } + return &nfs +} + +// Release implements transport.RightsControlMessage.Release. +func (fs *RightsFilesVFS2) Release() { + for _, f := range *fs { + f.DecRef() + } + *fs = nil +} + +// rightsFDsVFS2 gets up to the specified maximum number of FDs. +func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) { + files, trunc := rights.Files(t, max) + fds := make([]int32, 0, len(files)) + for i := 0; i < max && len(files) > 0; i++ { + fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ + CloseOnExec: cloexec, + }) + files[0].DecRef() + files = files[1:] + if err != nil { + t.Warningf("Error inserting FD: %v", err) + // This is what Linux does. + break + } + + fds = append(fds, int32(fd)) + } + return fds, trunc +} + +// PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf. +func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) { + maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 + // Linux does not return any FDs if none fit. + if maxFDs <= 0 { + flags |= linux.MSG_CTRUNC + return buf, flags + } + fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs) + if trunc { + flags |= linux.MSG_CTRUNC + } + align := t.Arch().Width() + return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) +} + +// NewVFS2 creates default control messages if needed. +func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages { + return transport.ControlMessages{ + Credentials: makeCreds(t, socketOrEndpoint), + Rights: rights, + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index b1ede32f0..10b668477 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -804,7 +804,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } if cms.Unix.Rights != nil { - controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) + controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } // Copy the address to the caller. -- cgit v1.2.3 From 15a822a1936e295cb6418df7ddf445d8500dfb2e Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Fri, 24 Apr 2020 18:22:21 -0700 Subject: VFS2: Get HelloWorld image tests to pass with VFS2 This change includes: - Modifications to loader_test.go to get TestCreateMountNamespace to pass with VFS2. - Changes necessary to get TestHelloWorld in image tests to pass with VFS2. This means runsc can run the hello-world container with docker on VSF2. Note: Containers that use sockets will not run with these changes. See "//test/image/...". Any tests here with sockets currently fail (which is all of them but HelloWorld). PiperOrigin-RevId: 308363072 --- pkg/sentry/fsimpl/gofer/directory.go | 1 + runsc/boot/BUILD | 2 + runsc/boot/loader.go | 13 +-- runsc/boot/loader_test.go | 152 ++++++++++++++++++++++++----------- runsc/boot/vfs.go | 78 ++++++++++++++---- scripts/docker_tests.sh | 3 + 6 files changed, 183 insertions(+), 66 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index c67766ab2..55f9ed911 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -75,6 +75,7 @@ func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode handle: handle{ fd: -1, }, + nlink: uint32(2), } d2.pf.dentry = d2 d2.vfsd.Init(d2) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 69dcc74f2..ed3c8f546 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -119,11 +119,13 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fspath", "//pkg/log", "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3f41d8357..f6ea4c102 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -625,11 +625,14 @@ func (l *Loader) run() error { // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. - for _, fd := range l.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } } } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e7c71734f..55d27a632 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,11 +26,13 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" @@ -107,14 +109,12 @@ func startGofer(root string) (int, func(), error) { return sandboxEnd, cleanup, nil } -func createLoader(vfsEnabled bool) (*Loader, func(), error) { +func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, nil, err } conf := testConfig() - spec := testSpec() - conf.VFS2 = vfsEnabled sandEnd, cleanup, err := startGofer(spec.Root.Path) @@ -161,7 +161,7 @@ func TestRunVFS2(t *testing.T) { } func doRun(t *testing.T, vfsEnabled bool) { - l, cleanup, err := createLoader(vfsEnabled) + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } @@ -210,7 +210,7 @@ func TestStartSignalVFS2(t *testing.T) { } func doStartSignal(t *testing.T, vfsEnabled bool) { - l, cleanup, err := createLoader(vfsEnabled) + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } @@ -258,18 +258,19 @@ func doStartSignal(t *testing.T, vfsEnabled bool) { } -// Test that MountNamespace can be created with various specs. -func TestCreateMountNamespace(t *testing.T) { - testCases := []struct { - name string - // Spec that will be used to create the mount manager. Note - // that we can't mount procfs without a kernel, so each spec - // MUST contain something other than procfs mounted at /proc. - spec specs.Spec - // Paths that are expected to exist in the resulting fs. - expectedPaths []string - }{ - { +type CreateMountTestcase struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string +} + +func createMountTestcases(vfs2 bool) []*CreateMountTestcase { + testCases := []*CreateMountTestcase{ + &CreateMountTestcase{ // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -311,7 +312,7 @@ func TestCreateMountNamespace(t *testing.T) { // /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ @@ -355,7 +356,7 @@ func TestCreateMountNamespace(t *testing.T) { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -398,40 +399,47 @@ func TestCreateMountNamespace(t *testing.T) { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - { - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, + } + + vfsCase := &CreateMountTestcase{ + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - // We don't include /sys, and /tmp in - // the spec, since they will be added - // automatically. - // - // Instead, add submounts inside these - // directories and make sure they are - // visible under the mandatory mounts. - { - Destination: "/sys/bar", - Type: "tmpfs", - }, - { - Destination: "/tmp/baz", - Type: "tmpfs", - }, + // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports + // MkDirAt in VFS2 (and remove the reduntant append). + // { + // Destination: "/sys/bar", + // Type: "tmpfs", + // }, + // + { + Destination: "/tmp/baz", + Type: "tmpfs", }, }, - expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, }, + expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, } - for _, tc := range testCases { + if !vfs2 { + vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) + vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") + } + return append(testCases, vfsCase) +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + + for _, tc := range createMountTestcases(false /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -466,6 +474,56 @@ func TestCreateMountNamespace(t *testing.T) { } } +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespaceVFS2(t *testing.T) { + + for _, tc := range createMountTestcases(true /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + defer resetSyscallTable() + + spec := testSpec() + spec.Mounts = tc.spec.Mounts + spec.Root = tc.spec.Root + + l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) + if err != nil { + t.Fatalf("failed to create loader: %v", err) + } + defer l.Destroy() + defer loaderCleanup() + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf); err != nil { + t.Fatalf("failed process hints: %v", err) + } + + ctx := l.rootProcArgs.NewContext(l.k) + mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + if err != nil { + t.Fatalf("failed to setupVFS2: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(p), + } + + if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + + } + }) + } +} + // TestRestoreEnvironment tests that the correct mounts are collected from the spec and config // in order to build the environment for restoring. func TestRestoreEnvironment(t *testing.T) { diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index bce3a3593..0b9b0b436 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "path" + "sort" "strconv" "strings" @@ -192,14 +193,9 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs return nil, fmt.Errorf("register filesystems: %w", err) } - fd := c.fds.remove() - - opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") - - log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + mns, err := c.createMountNamespaceVFS2(ctx, conf, creds) if err != nil { - return nil, fmt.Errorf("setting up mountnamespace: %w", err) + return nil, fmt.Errorf("creating mount namespace: %w", err) } rootProcArgs.MountNamespaceVFS2 = mns @@ -212,8 +208,23 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs return mns, nil } +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { + + fd := c.fds.remove() + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mount namespace: %w", err) + } + return mns, nil +} + func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + c.prepareMountsVFS2() + for _, submount := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { @@ -226,6 +237,11 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, return c.checkDispenser() } +func (c *containerMounter) prepareMountsVFS2() { + // Sort the mounts so that we don't place children before parents. + sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) }) +} + // TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { root := mns.Root() @@ -236,11 +252,21 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, Path: fspath.Parse(submount.Destination), } - _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { + return err + } + log.Debugf("directory exists or made directory for submount: %s", submount.Destination) + opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(options, ","), @@ -251,12 +277,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay - if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{ - ForSyntheticMountpoint: true, - }); err != nil && err != syserror.EEXIST { - // Log a warning, but attempt the mount anyway. - log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err) - } if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } @@ -314,3 +334,33 @@ func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { } return opts } + +func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + + _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) + switch { + + case err == syserror.ENOENT: + if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { + return err + } + + mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { + return fmt.Errorf("failed to makedir for mount %+v: %w", target, err) + } + return nil + + case err != nil: + return fmt.Errorf("stat failed for mount %+v: %w", target, err) + + default: + return nil + } +} diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh index 931ce1aa4..dce0a4085 100755 --- a/scripts/docker_tests.sh +++ b/scripts/docker_tests.sh @@ -20,3 +20,6 @@ make load-all-images install_runsc_for_test docker test_runsc //test/image:image_test //test/e2e:integration_test + +install_runsc_for_test docker --vfs2 +test_runsc //test/image:image_test --test_filter=.*TestHelloWorld -- cgit v1.2.3 From 1c2ecbb1a03ffaa3bcdb2ee69c879da5e7076fa5 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 27 Apr 2020 16:00:39 -0700 Subject: Import host sockets. The FileDescription implementation for hostfs sockets uses the standard Unix socket implementation (unix.SocketVFS2), but is also tied to a hostfs dentry. Updates #1672, #1476 PiperOrigin-RevId: 308716426 --- pkg/sentry/fs/host/socket.go | 4 + pkg/sentry/fs/host/socket_iovec.go | 4 + pkg/sentry/fs/host/socket_unsafe.go | 4 + pkg/sentry/fsimpl/host/BUILD | 10 + pkg/sentry/fsimpl/host/host.go | 57 +++-- pkg/sentry/fsimpl/host/socket.go | 397 ++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/host/socket_iovec.go | 113 +++++++++ pkg/sentry/fsimpl/host/socket_unsafe.go | 101 ++++++++ pkg/sentry/fsimpl/sockfs/sockfs.go | 20 +- pkg/sentry/socket/unix/BUILD | 1 + pkg/sentry/socket/unix/unix_vfs2.go | 42 ++-- 11 files changed, 698 insertions(+), 55 deletions(-) create mode 100644 pkg/sentry/fsimpl/host/socket.go create mode 100644 pkg/sentry/fsimpl/host/socket_iovec.go create mode 100644 pkg/sentry/fsimpl/host/socket_unsafe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index 06fc2d80a..b6e94583e 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -37,6 +37,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // maxSendBufferSize is the maximum host send buffer size allowed for endpoint. // // N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). @@ -388,3 +390,5 @@ func (c *ConnectedEndpoint) Release() { // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. func (c *ConnectedEndpoint) CloseUnread() {} + +// LINT.ThenChange(../../fsimpl/host/socket.go) diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index af6955675..5c18dbd5e 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) +// LINT.IfChange + // maxIovs is the maximum number of iovecs to pass to the host. var maxIovs = linux.UIO_MAXIOV @@ -111,3 +113,5 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec return total, iovecs, nil, err } + +// LINT.ThenChange(../../fsimpl/host/socket_iovec.go) diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go index f3bbed7ea..5d4f312cf 100644 --- a/pkg/sentry/fs/host/socket_unsafe.go +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -19,6 +19,8 @@ import ( "unsafe" ) +// LINT.IfChange + // fdReadVec receives from fd to bufs. // // If the total length of bufs is > maxlen, fdReadVec will do a partial read @@ -99,3 +101,5 @@ func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int6 return int64(n), length, err } + +// LINT.ThenChange(../../fsimpl/host/socket_unsafe.go) diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 2dcb03a73..e1c56d89b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -8,6 +8,9 @@ go_library( "control.go", "host.go", "ioctl_unsafe.go", + "socket.go", + "socket_iovec.go", + "socket_unsafe.go", "tty.go", "util.go", "util_unsafe.go", @@ -16,6 +19,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fdnotifier", "//pkg/log", "//pkg/refs", "//pkg/sentry/arch", @@ -25,12 +29,18 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/unimpl", + "//pkg/sentry/uniqueid", "//pkg/sentry/vfs", "//pkg/sync", + "//pkg/syserr", "//pkg/syserror", + "//pkg/tcpip", + "//pkg/unet", "//pkg/usermem", + "//pkg/waiter", "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1e53b5c1b..05538ea42 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -17,7 +17,6 @@ package host import ( - "errors" "fmt" "math" "syscall" @@ -31,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" + unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -156,7 +156,7 @@ type inode struct { // Note that these flags may become out of date, since they can be modified // on the host, e.g. with fcntl. -func fileFlagsFromHostFD(fd int) (int, error) { +func fileFlagsFromHostFD(fd int) (uint32, error) { flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0) if err != nil { log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err) @@ -164,7 +164,7 @@ func fileFlagsFromHostFD(fd int) (int, error) { } // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND - return flags, nil + return uint32(flags), nil } // CheckPermissions implements kernfs.Inode. @@ -361,6 +361,10 @@ func (i *inode) Destroy() { // Open implements kernfs.Inode. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Once created, we cannot re-open a socket fd through /proc/[pid]/fd/. + if i.Mode().FileType() == linux.S_IFSOCK { + return nil, syserror.ENXIO + } return i.open(ctx, vfsd, rp.Mount()) } @@ -370,42 +374,45 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F return nil, err } fileType := s.Mode & linux.FileTypeMask + flags, err := fileFlagsFromHostFD(i.hostFD) + if err != nil { + return nil, err + } + if fileType == syscall.S_IFSOCK { if i.isTTY { - return nil, errors.New("cannot use host socket as TTY") + log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) + return nil, syserror.ENOTTY } - // TODO(gvisor.dev/issue/1672): support importing sockets. - return nil, errors.New("importing host sockets not supported") + + ep, err := newEndpoint(ctx, i.hostFD) + if err != nil { + return nil, err + } + // Currently, we only allow Unix sockets to be imported. + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d) } // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that // we don't allow importing arbitrary file types without proper support. - var ( - vfsfd *vfs.FileDescription - fdImpl vfs.FileDescriptionImpl - ) if i.isTTY { fd := &ttyFD{ fileDescription: fileDescription{inode: i}, termios: linux.DefaultSlaveTermios, } - vfsfd = &fd.vfsfd - fdImpl = fd - } else { - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. - fd := &fileDescription{inode: i} - vfsfd = &fd.vfsfd - fdImpl = fd - } - - flags, err := fileFlagsFromHostFD(i.hostFD) - if err != nil { - return nil, err + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil } - if err := vfsfd.Init(fdImpl, uint32(flags), mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + // For simplicity, set offset to 0. Technically, we should + // only set to 0 on files that are not seekable (sockets, pipes, etc.), + // and use the offset from the host fd otherwise. + fd := &fileDescription{inode: i} + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go new file mode 100644 index 000000000..39dd624a0 --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket.go @@ -0,0 +1,397 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/refs" + "gvisor.dev/gvisor/pkg/sentry/socket/control" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/unet" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Create a new host-backed endpoint from the given fd. +func newEndpoint(ctx context.Context, hostFD int) (transport.Endpoint, error) { + // Set up an external transport.Endpoint using the host fd. + addr := fmt.Sprintf("hostfd:[%d]", hostFD) + var q waiter.Queue + e, err := NewConnectedEndpoint(ctx, hostFD, &q, addr, true /* saveable */) + if err != nil { + return nil, err.ToError() + } + e.Init() + ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + return ep, nil +} + +// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. +// +// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). +const maxSendBufferSize = 8 << 20 + +// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and +// transport.Receiver. It is backed by a host fd that was imported at sentry +// startup. This fd is shared with a hostfs inode, which retains ownership of +// it. +// +// ConnectedEndpoint is saveable, since we expect that the host will provide +// the same fd upon restore. +// +// As of this writing, we only allow Unix sockets to be imported. +// +// +stateify savable +type ConnectedEndpoint struct { + // ref keeps track of references to a ConnectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd below. + mu sync.RWMutex `state:"nosave"` + + // fd is the host fd backing this endpoint. + fd int + + // addr is the address at which this endpoint is bound. + addr string + + queue *waiter.Queue + + // sndbuf is the size of the send buffer. + // + // N.B. When this is smaller than the host size, we present it via + // GetSockOpt and message splitting/rejection in SendMsg, but do not + // prevent lots of small messages from filling the real send buffer + // size on the host. + sndbuf int64 `state:"nosave"` + + // stype is the type of Unix socket. + stype linux.SockType +} + +// init performs initialization required for creating new ConnectedEndpoints and +// for restoring them. +func (c *ConnectedEndpoint) init() *syserr.Error { + family, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return syserr.FromError(err) + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return syserr.ErrInvalidEndpointState + } + + stype, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return syserr.FromError(err) + } + + if err := syscall.SetNonblock(c.fd, true); err != nil { + return syserr.FromError(err) + } + + sndbuf, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return syserr.FromError(err) + } + if sndbuf > maxSendBufferSize { + log.Warningf("Socket send buffer too large: %d", sndbuf) + return syserr.ErrInvalidEndpointState + } + + c.stype = linux.SockType(stype) + c.sndbuf = int64(sndbuf) + + return nil +} + +// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host fd +// imported at sentry startup, +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. +func NewConnectedEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { + e := ConnectedEndpoint{ + fd: hostFD, + addr: addr, + queue: queue, + } + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + e.ref.EnableLeakCheck("host.ConnectedEndpoint") + return &e, nil +} + +// Init will do the initialization required without holding other locks. +func (c *ConnectedEndpoint) Init() { + if err := fdnotifier.AddFD(int32(c.fd), c.queue); err != nil { + panic(err) + } +} + +// Send implements transport.ConnectedEndpoint.Send. +func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + + if !controlMessages.Empty() { + return 0, false, syserr.ErrInvalidEndpointState + } + + // Since stream sockets don't preserve message boundaries, we can write + // only as much of the message as fits in the send buffer. + truncate := c.stype == linux.SOCK_STREAM + + n, totalLen, err := fdWriteVec(c.fd, data, c.sndbuf, truncate) + if n < totalLen && err == nil { + // The host only returns a short write if it would otherwise + // block (and only for stream sockets). + err = syserror.EAGAIN + } + if n > 0 && err != syserror.EAGAIN { + // The caller may need to block to send more data, but + // otherwise there isn't anything that can be done about an + // error with a partial write. + err = nil + } + + // There is no need for the callee to call SendNotify because fdWriteVec + // uses the host's sendmsg(2) and the host kernel's queue. + return n, false, syserr.FromError(err) +} + +// SendNotify implements transport.ConnectedEndpoint.SendNotify. +func (c *ConnectedEndpoint) SendNotify() {} + +// CloseSend implements transport.ConnectedEndpoint.CloseSend. +func (c *ConnectedEndpoint) CloseSend() { + c.mu.Lock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.fd, syscall.SHUT_WR); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err)) + } +} + +// CloseNotify implements transport.ConnectedEndpoint.CloseNotify. +func (c *ConnectedEndpoint) CloseNotify() {} + +// Writable implements transport.ConnectedEndpoint.Writable. +func (c *ConnectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements transport.ConnectedEndpoint.Passcred. +func (c *ConnectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress. +func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil +} + +// EventUpdate implements transport.ConnectedEndpoint.EventUpdate. +func (c *ConnectedEndpoint) EventUpdate() { + c.mu.RLock() + defer c.mu.RUnlock() + if c.fd != -1 { + fdnotifier.UpdateFD(int32(c.fd)) + } +} + +// Recv implements transport.Receiver.Recv. +func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + + // N.B. Unix sockets don't have a receive buffer, the send buffer + // serves both purposes. + rl, ml, cl, cTrunc, err := fdReadVec(c.fd, data, []byte(cm), peek, c.sndbuf) + if rl > 0 && err != nil { + // We got some data, so all we need to do on error is return + // the data that we got. Short reads are fine, no need to + // block. + err = nil + } + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + // There is no need for the callee to call RecvNotify because fdReadVec uses + // the host's recvmsg(2) and the host kernel's queue. + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. + if len(cm) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err) + } + + if len(fds) == 0 { + return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil + } + return rl, ml, control.NewVFS2(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil +} + +// RecvNotify implements transport.Receiver.RecvNotify. +func (c *ConnectedEndpoint) RecvNotify() {} + +// CloseRecv implements transport.Receiver.CloseRecv. +func (c *ConnectedEndpoint) CloseRecv() { + c.mu.Lock() + defer c.mu.Unlock() + + if err := syscall.Shutdown(c.fd, syscall.SHUT_RD); err != nil { + // A well-formed UDS shutdown can't fail. See + // net/unix/af_unix.c:unix_shutdown. + panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err)) + } +} + +// Readable implements transport.Receiver.Readable. +func (c *ConnectedEndpoint) Readable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventIn)&waiter.EventIn != 0 +} + +// SendQueuedSize implements transport.Receiver.SendQueuedSize. +func (c *ConnectedEndpoint) SendQueuedSize() int64 { + // TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). + return -1 +} + +// RecvQueuedSize implements transport.Receiver.RecvQueuedSize. +func (c *ConnectedEndpoint) RecvQueuedSize() int64 { + // TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host + // sockets because we don't allow the sentry to call ioctl(2). + return -1 +} + +// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize. +func (c *ConnectedEndpoint) SendMaxQueueSize() int64 { + return int64(c.sndbuf) +} + +// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize. +func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { + // N.B. Unix sockets don't use the receive buffer. We'll claim it is + // the same size as the send buffer. + return int64(c.sndbuf) +} + +func (c *ConnectedEndpoint) destroyLocked() { + fdnotifier.RemoveFD(int32(c.fd)) + c.fd = -1 +} + +// Release implements transport.ConnectedEndpoint.Release and +// transport.Receiver.Release. +func (c *ConnectedEndpoint) Release() { + c.ref.DecRefWithDestructor(func() { + c.mu.Lock() + c.destroyLocked() + c.mu.Unlock() + }) +} + +// CloseUnread implements transport.ConnectedEndpoint.CloseUnread. +func (c *ConnectedEndpoint) CloseUnread() {} + +// SCMConnectedEndpoint represents an endpoint backed by a host fd that was +// passed through a gofer Unix socket. It is almost the same as +// ConnectedEndpoint, with the following differences: +// - SCMConnectedEndpoint is not saveable, because the host cannot guarantee +// the same descriptor number across S/R. +// - SCMConnectedEndpoint holds ownership of its fd and is responsible for +// closing it. +type SCMConnectedEndpoint struct { + ConnectedEndpoint +} + +// Release implements transport.ConnectedEndpoint.Release and +// transport.Receiver.Release. +func (e *SCMConnectedEndpoint) Release() { + e.ref.DecRefWithDestructor(func() { + e.mu.Lock() + if err := syscall.Close(e.fd); err != nil { + log.Warningf("Failed to close host fd %d: %v", err) + } + e.destroyLocked() + e.mu.Unlock() + }) +} + +// NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that +// was passed through a Unix socket. +// +// The caller is responsible for calling Init(). Additionaly, Release needs to +// be called twice because ConnectedEndpoint is both a transport.Receiver and +// transport.ConnectedEndpoint. +func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) { + e := SCMConnectedEndpoint{ConnectedEndpoint{ + fd: hostFD, + addr: addr, + queue: queue, + }} + + if err := e.init(); err != nil { + return nil, err + } + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + e.ref.EnableLeakCheck("host.SCMConnectedEndpoint") + return &e, nil +} diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go new file mode 100644 index 000000000..584c247d2 --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -0,0 +1,113 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/syserror" +) + +// maxIovs is the maximum number of iovecs to pass to the host. +var maxIovs = linux.UIO_MAXIOV + +// copyToMulti copies as many bytes from src to dst as possible. +func copyToMulti(dst [][]byte, src []byte) { + for _, d := range dst { + done := copy(d, src) + src = src[done:] + if len(src) == 0 { + break + } + } +} + +// copyFromMulti copies as many bytes from src to dst as possible. +func copyFromMulti(dst []byte, src [][]byte) { + for _, s := range src { + done := copy(dst, s) + dst = dst[done:] + if len(dst) == 0 { + break + } + } +} + +// buildIovec builds an iovec slice from the given []byte slice. +// +// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. +// +// If length < the total length of bufs, err indicates why, even when returning +// a truncated iovec. +// +// If intermediate != nil, iovecs references intermediate rather than bufs and +// the caller must copy to/from bufs as necessary. +func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) { + var iovsRequired int + for _, b := range bufs { + length += int64(len(b)) + if len(b) > 0 { + iovsRequired++ + } + } + + stopLen := length + if length > maxlen { + if truncate { + stopLen = maxlen + err = syserror.EAGAIN + } else { + return 0, nil, nil, syserror.EMSGSIZE + } + } + + if iovsRequired > maxIovs { + // The kernel will reject our call if we pass this many iovs. + // Use a single intermediate buffer instead. + b := make([]byte, stopLen) + + return stopLen, []syscall.Iovec{{ + Base: &b[0], + Len: uint64(stopLen), + }}, b, err + } + + var total int64 + iovecs = make([]syscall.Iovec, 0, iovsRequired) + for i := range bufs { + l := len(bufs[i]) + if l == 0 { + continue + } + + stop := int64(l) + if total+stop > stopLen { + stop = stopLen - total + } + + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(stop), + }) + + total += stop + if total >= stopLen { + break + } + } + + return total, iovecs, nil, err +} diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go new file mode 100644 index 000000000..35ded24bc --- /dev/null +++ b/pkg/sentry/fsimpl/host/socket_unsafe.go @@ -0,0 +1,101 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +// fdReadVec receives from fd to bufs. +// +// If the total length of bufs is > maxlen, fdReadVec will do a partial read +// and err will indicate why the message was truncated. +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + // Always truncate the receive buffer. All socket types will truncate + // received messages. + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, 0, 0, false, err + } + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, 0, 0, false, e + } + n := int64(rawN) + + // Copy data back to bufs. + if intermediate != nil { + copyToMulti(bufs, intermediate) + } + + controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC + + if n > length { + return length, n, msg.Controllen, controlTrunc, err + } + + return n, n, msg.Controllen, controlTrunc, err +} + +// fdWriteVec sends from bufs to fd. +// +// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a +// partial write and err will indicate why the message was truncated. +func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) { + length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate) + if err != nil && len(iovecs) == 0 { + // No partial write to do, return error immediately. + return 0, length, err + } + + // Copy data to intermediate buf. + if intermediate != nil { + copyFromMulti(intermediate, bufs) + } + + var msg syscall.Msghdr + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + // N.B. prioritize the syscall error over the buildIovec error. + return 0, length, e + } + + return int64(n), length, err +} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 5ce50625b..271134af8 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -73,26 +73,14 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr return nil, syserror.ENXIO } -// InitSocket initializes a socket FileDescription, with a corresponding -// Dentry in mnt. -// -// fd should be the FileDescription associated with socketImpl, i.e. its first -// field. mnt should be the global socket mount, Kernel.socketMount. -func InitSocket(socketImpl vfs.FileDescriptionImpl, fd *vfs.FileDescription, mnt *vfs.Mount, creds *auth.Credentials) error { - fsimpl := mnt.Filesystem().Impl() - fs := fsimpl.(*kernfs.Filesystem) - +// NewDentry constructs and returns a sockfs dentry. +func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) i := &inode{} - i.Init(creds, fs.NextIno(), filemode) + i.Init(creds, ino, filemode) d := &kernfs.Dentry{} d.Init(i) - - opts := &vfs.FileDescriptionOptions{UseDentryMetadata: true} - if err := fd.Init(socketImpl, linux.O_RDWR, mnt, d.VFSDentry(), opts); err != nil { - return err - } - return nil + return d.VFSDentry() } diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index de2cc4bdf..941a91097 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 3e54d49c4..315dc274f 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/control" @@ -42,30 +43,44 @@ type SocketVFS2 struct { socketOpsCommon } -// NewVFS2File creates and returns a new vfs.FileDescription for a unix socket. -func NewVFS2File(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { - sock := NewFDImpl(ep, stype) - vfsfd := &sock.vfsfd - if err := sockfs.InitSocket(sock, vfsfd, t.Kernel().SocketMount(), t.Credentials()); err != nil { +// NewSockfsFile creates a new socket file in the global sockfs mount and +// returns a corresponding file description. +func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) + if err != nil { return nil, syserr.FromError(err) } - return vfsfd, nil + return fd, nil } -// NewFDImpl creates and returns a new SocketVFS2. -func NewFDImpl(ep transport.Endpoint, stype linux.SockType) *SocketVFS2 { +// NewFileDescription creates and returns a socket file description +// corresponding to the given mount and dentry. +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { stype = linux.SOCK_DGRAM } - return &SocketVFS2{ + sock := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ ep: ep, stype: stype, }, } + vfsfd := &sock.vfsfd + if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return vfsfd, nil } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by @@ -112,8 +127,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block } } - // We expect this to be a FileDescription here. - ns, err := NewVFS2File(t, ep, s.stype) + ns, err := NewSockfsFile(t, ep, s.stype) if err != nil { return 0, nil, 0, err } @@ -307,7 +321,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) return nil, syserr.ErrInvalidArgument } - f, err := NewVFS2File(t, ep, stype) + f, err := NewSockfsFile(t, ep, stype) if err != nil { ep.Close() return nil, err @@ -331,13 +345,13 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (* // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) - s1, err := NewVFS2File(t, ep1, stype) + s1, err := NewSockfsFile(t, ep1, stype) if err != nil { ep1.Close() ep2.Close() return nil, nil, err } - s2, err := NewVFS2File(t, ep2, stype) + s2, err := NewSockfsFile(t, ep2, stype) if err != nil { s1.DecRef() ep2.Close() -- cgit v1.2.3 From f3ca5ca82abe35d13210f8a8035926170bb87436 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 28 Apr 2020 08:32:20 -0700 Subject: Support pipes and sockets in VFS2 gofer fs. Named pipes and sockets can be represented in two ways in gofer fs: 1. As a file on the remote filesystem. In this case, all file operations are passed through 9p. 2. As a synthetic file that is internal to the sandbox. In this case, the dentry stores an endpoint or VFSPipe for sockets and pipes respectively, which replaces interactions with the remote fs through the gofer. In gofer.filesystem.MknodAt, we attempt to call mknod(2) through 9p, and if it fails, fall back to the synthetic version. Updates #1200. PiperOrigin-RevId: 308828161 --- pkg/sentry/fs/gofer/socket.go | 4 + pkg/sentry/fsimpl/ext/filesystem.go | 2 +- pkg/sentry/fsimpl/gofer/BUILD | 8 ++ pkg/sentry/fsimpl/gofer/directory.go | 39 +++++++-- pkg/sentry/fsimpl/gofer/filesystem.go | 128 ++++++++++++++++++++++-------- pkg/sentry/fsimpl/gofer/gofer.go | 32 ++++++-- pkg/sentry/fsimpl/gofer/socket.go | 141 +++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/socket/unix/unix.go | 2 +- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/filesystem.go | 2 +- pkg/sentry/vfs/options.go | 18 +++++ pkg/sentry/vfs/vfs.go | 54 ++++++------- 14 files changed, 359 insertions(+), 77 deletions(-) create mode 100644 pkg/sentry/fsimpl/gofer/socket.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 10ba2f5f0..40f2c1cad 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -47,6 +47,8 @@ func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) transport. return &endpoint{inode, i.fileState.file.file, path} } +// LINT.IfChange + // endpoint is a Gofer-backed transport.BoundEndpoint. // // An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() @@ -146,3 +148,5 @@ func (e *endpoint) Release() { func (e *endpoint) Passcred() bool { return false } + +// LINT.ThenChange(../../fsimpl/gofer/socket.go) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 2c22a04af..6c882d0b6 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -485,7 +485,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { _, _, err := fs.walk(rp, false) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index b9c4beee4..5ce82b793 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -38,6 +38,7 @@ go_library( "p9file.go", "pagemath.go", "regular_file.go", + "socket.go", "special_file.go", "symlink.go", "time.go", @@ -52,18 +53,25 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/host", "//pkg/sentry/hostfd", + "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", "//pkg/sentry/platform", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/syserr", "//pkg/syserror", "//pkg/unet", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 55f9ed911..b98218753 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -15,6 +15,7 @@ package gofer import ( + "fmt" "sync" "sync/atomic" @@ -22,6 +23,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -59,28 +62,52 @@ func (d *dentry) cacheNegativeLookupLocked(name string) { d.children[name] = nil } -// createSyntheticDirectory creates a synthetic directory with the given name +type createSyntheticOpts struct { + name string + mode linux.FileMode + kuid auth.KUID + kgid auth.KGID + + // The endpoint for a synthetic socket. endpoint should be nil if the file + // being created is not a socket. + endpoint transport.BoundEndpoint + + // pipe should be nil if the file being created is not a pipe. + pipe *pipe.VFSPipe +} + +// createSyntheticChildLocked creates a synthetic file with the given name // in d. // // Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain // a child with the given name. -func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) { +func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { d2 := &dentry{ refs: 1, // held by d fs: d.fs, - mode: uint32(mode) | linux.S_IFDIR, - uid: uint32(kuid), - gid: uint32(kgid), + mode: uint32(opts.mode), + uid: uint32(opts.kuid), + gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary handle: handle{ fd: -1, }, nlink: uint32(2), } + switch opts.mode.FileType() { + case linux.S_IFDIR: + // Nothing else needs to be done. + case linux.S_IFSOCK: + d2.endpoint = opts.endpoint + case linux.S_IFIFO: + d2.pipe = opts.pipe + default: + panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) + } d2.pf.dentry = d2 d2.vfsd.Init(d2) - d.cacheNewChildLocked(d2, name) + d.cacheNewChildLocked(d2, opts.name) d.syntheticChildren++ } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 98ccb42fd..4a8411371 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -22,9 +22,11 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -652,7 +654,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return err } ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) - parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) } if fs.opts.interop != InteropModeShared { parent.incLinks() @@ -663,7 +670,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // Can't create non-synthetic files in synthetic directories. return syserror.EPERM } - parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: linux.S_IFDIR | opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + }) parent.incLinks() return nil }) @@ -674,6 +686,28 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + if err == syserror.EPERM { + switch opts.Mode.FileType() { + case linux.S_IFSOCK: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + endpoint: opts.Endpoint, + }) + return nil + case linux.S_IFIFO: + parent.createSyntheticChildLocked(&createSyntheticOpts{ + name: name, + mode: opts.Mode, + kuid: creds.EffectiveKUID, + kgid: creds.EffectiveKGID, + pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), + }) + return nil + } + } return err }, nil) } @@ -756,20 +790,21 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } mnt := rp.Mount() - filetype := d.fileType() - switch { - case filetype == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD: - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { - return nil, err - } - fd := ®ularFileFD{} - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - AllowDirectIO: true, - }); err != nil { - return nil, err + switch d.fileType() { + case linux.S_IFREG: + if !d.fs.opts.regularFilesUseSpecialFileFD { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + return nil, err + } + fd := ®ularFileFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + AllowDirectIO: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil } - return &fd.vfsfd, nil - case filetype == linux.S_IFDIR: + case linux.S_IFDIR: // Can't open directories with O_CREAT. if opts.Flags&linux.O_CREAT != 0 { return nil, syserror.EISDIR @@ -791,26 +826,40 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } return &fd.vfsfd, nil - case filetype == linux.S_IFLNK: + case linux.S_IFLNK: // Can't open symlinks without O_PATH (which is unimplemented). return nil, syserror.ELOOP - default: - if opts.Flags&linux.O_DIRECT != 0 { - return nil, syserror.EINVAL - } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) - if err != nil { - return nil, err - } - fd := &specialFileFD{ - handle: h, + case linux.S_IFSOCK: + if d.isSynthetic() { + return nil, syserror.ENXIO } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { - h.close(ctx) - return nil, err + case linux.S_IFIFO: + if d.isSynthetic() { + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) } - return &fd.vfsfd, nil } + return d.openSpecialFileLocked(ctx, mnt, opts) +} + +func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + // Treat as a special file. This is done for non-synthetic pipes as well as + // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true. + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) + if err != nil { + return nil, err + } + fd := &specialFileFD{ + handle: h, + } + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + h.close(ctx) + return nil, err + } + return &fd.vfsfd, nil } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. @@ -1196,15 +1245,28 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) - _, err := fs.resolveLocked(ctx, rp, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } - // TODO(gvisor.dev/issue/1476): Implement BoundEndpointAt. + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + if d.isSocket() { + if !d.isSynthetic() { + d.IncRef() + return &endpoint{ + dentry: d, + file: d.file.file, + path: opts.Addr, + }, nil + } + return d.endpoint, nil + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8b4e91d17..3235816c2 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -46,9 +46,11 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" @@ -585,6 +587,14 @@ type dentry struct { // and target are protected by dataMu. haveTarget bool target string + + // If this dentry represents a synthetic socket file, endpoint is the + // transport endpoint bound to this file. + endpoint transport.BoundEndpoint + + // If this dentry represents a synthetic named pipe, pipe is the pipe + // endpoint bound to this file. + pipe *pipe.VFSPipe } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -791,10 +801,21 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - if !setLocalMtime && (stat.Mask&linux.STATX_SIZE != 0) { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW + + // Prepare for truncate. + if stat.Mask&linux.STATX_SIZE != 0 { + switch d.mode & linux.S_IFMT { + case linux.S_IFREG: + if !setLocalMtime { + // Truncate updates mtime. + setLocalMtime = true + stat.Mtime.Nsec = linux.UTIME_NOW + } + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } } } d.metadataMu.Lock() @@ -1049,6 +1070,7 @@ func (d *dentry) destroyLocked() { d.handle.close(ctx) } d.handleMu.Unlock() + if !d.file.isNil() { d.file.close(ctx) d.file = p9file{} @@ -1134,7 +1156,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name return d.file.removeXattr(ctx, name) } -// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go new file mode 100644 index 000000000..73835df91 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -0,0 +1,141 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/waiter" +) + +func (d *dentry) isSocket() bool { + return d.fileType() == linux.S_IFSOCK +} + +// endpoint is a Gofer-backed transport.BoundEndpoint. +// +// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // dentry is the filesystem dentry which produced this endpoint. + dentry *dentry + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. + path string +} + +func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) { + switch t { + case linux.SOCK_STREAM: + return p9.StreamSocket, true + case linux.SOCK_SEQPACKET: + return p9.SeqpacketSocket, true + case linux.SOCK_DGRAM: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error { + cf, ok := sockTypeToP9(ce.Type()) + if !ok { + return syserr.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + + // Check connecting state. + if ce.Connected() { + ce.Unlock() + return syserr.ErrAlreadyConnected + } + if ce.Listening() { + ce.Unlock() + return syserr.ErrInvalidEndpointState + } + + c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue()) + if err != nil { + ce.Unlock() + return err + } + + returnConnect(c, c) + ce.Unlock() + c.Init() + + return nil +} + +// UnidirectionalConnect implements +// transport.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) { + c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{}) + if err != nil { + return nil, err + } + c.Init() + + // We don't need the receiver. + c.CloseRecv() + c.Release() + + return c, nil +} + +func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { + hostFile, err := e.file.Connect(flags) + if err != nil { + return nil, syserr.ErrConnectionRefused + } + // Dup the fd so that the new endpoint can manage its lifetime. + hostFD, err := syscall.Dup(hostFile.FD()) + if err != nil { + log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) + return nil, syserr.FromError(err) + } + // After duplicating, we no longer need hostFile. + hostFile.Close() + + c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) + if serr != nil { + log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr) + return nil, serr + } + return c, nil +} + +// Release implements transport.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.dentry.DecRef() +} + +// Passcred implements transport.BoundEndpoint.Passcred. +func (e *endpoint) Passcred() bool { + return false +} diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 9e8d80414..40c1b2104 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -766,7 +766,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 5b62f9ebb..388b98bef 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -697,7 +697,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(rp) diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 7c64f30fa..9aad41c9e 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -373,7 +373,7 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, Path: p, FollowFinalSymlink: true, } - ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop) + ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) root.DecRef() if relPath { start.DecRef() diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index a64d86122..b1a998590 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -237,7 +237,7 @@ func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. -func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) { +func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { return nil, syserror.ENOTDIR } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 20e5bb072..70385a21f 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -495,7 +495,7 @@ type FilesystemImpl interface { // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // // - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED. - BoundEndpointAt(ctx context.Context, rp *ResolvingPath) (transport.BoundEndpoint, error) + BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) // PrependPath prepends a path from vd to vd.Mount().Root() to b. // diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 022bac127..53d364c5c 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -151,6 +151,24 @@ type SetStatOptions struct { Stat linux.Statx } +// BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() +// and FilesystemImpl.BoundEndpointAt(). +type BoundEndpointOptions struct { + // Addr is the path of the file whose socket endpoint is being retrieved. + // It is generally irrelevant: most endpoints are stored at a dentry that + // was created through a bind syscall, so the path can be stored on creation. + // However, if the endpoint was created in FilesystemImpl.BoundEndpointAt(), + // then we may not know what the original bind address was. + // + // For example, if connect(2) is called with address "foo" which corresponds + // a remote named socket in goferfs, we need to generate an endpoint wrapping + // that file. In this case, we can use Addr to set the endpoint address to + // "foo". Note that Addr is only a best-effort attempt--we still do not know + // the exact address that was used on the remote fs to bind the socket (it + // may have been "foo", "./foo", etc.). + Addr string +} + // GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(), // FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and // FileDescriptionImpl.Getxattr(). diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9015f2cc1..8d7f8f8af 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -351,33 +351,6 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia } } -// BoundEndpointAt gets the bound endpoint at the given path, if one exists. -func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (transport.BoundEndpoint, error) { - if !pop.Path.Begin.Ok() { - if pop.Path.Absolute { - return nil, syserror.ECONNREFUSED - } - return nil, syserror.ENOENT - } - rp := vfs.getResolvingPath(creds, pop) - for { - bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp) - if err == nil { - vfs.putResolvingPath(rp) - return bep, nil - } - if checkInvariants { - if rp.canHandleError(err) && rp.Done() { - panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) - } - } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - return nil, err - } - } -} - // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { @@ -675,6 +648,33 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti } } +// BoundEndpointAt gets the bound endpoint at the given path, if one exists. +func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { + if !pop.Path.Begin.Ok() { + if pop.Path.Absolute { + return nil, syserror.ECONNREFUSED + } + return nil, syserror.ENOENT + } + rp := vfs.getResolvingPath(creds, pop) + for { + bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return bep, nil + } + if checkInvariants { + if rp.canHandleError(err) && rp.Done() { + panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) + } + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + // ListxattrAt returns all extended attribute names for the file at the given // path. func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { -- cgit v1.2.3 From ce19497c1c0829af6ba56f0cc68e3a4cb33cf1c8 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 28 Apr 2020 20:11:43 -0700 Subject: Fix Unix socket permissions. Enforce write permission checks in BoundEndpointAt, which corresponds to the permission checks in Linux (net/unix/af_unix.c:unix_find_other). Also, create bound socket files with the correct permissions in VFS2. Fixes #2324. PiperOrigin-RevId: 308949084 --- pkg/sentry/fsimpl/ext/filesystem.go | 5 ++- pkg/sentry/fsimpl/kernfs/filesystem.go | 5 ++- pkg/sentry/fsimpl/tmpfs/filesystem.go | 3 ++ pkg/sentry/socket/unix/unix.go | 5 ++- pkg/sentry/socket/unix/unix_vfs2.go | 12 ++++--- pkg/sentry/vfs/anonfs.go | 3 ++ pkg/sentry/vfs/filesystem.go | 8 ++++- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/socket.cc | 61 ++++++++++++++++++++++++++++++++++ 9 files changed, 94 insertions(+), 9 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 6c882d0b6..77b644275 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -486,10 +486,13 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, _, err := fs.walk(rp, false) + _, inode, err := fs.walk(rp, false) if err != nil { return nil, err } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } // TODO(b/134676337): Support sockets. return nil, syserror.ECONNREFUSED diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 40c1b2104..4a12ae245 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -768,12 +768,15 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() - _, _, err := fs.walkExistingLocked(ctx, rp) + _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() fs.processDeferredDecRefs() if err != nil { return nil, err } + if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 388b98bef..36ffcb592 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -704,6 +704,9 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath if err != nil { return nil, err } + if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } switch impl := d.inode.impl.(type) { case *socketFile: return impl.ep, nil diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index ddd0eda4b..5b29e9d7f 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -323,7 +323,10 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Create the socket. // - // TODO(gvisor.dev/issue/2324): Correctly set file permissions. + // Note that the file permissions here are not set correctly (see + // gvisor.dev/issue/2324). There is no convenient way to get permissions + // on the socket referred to by s, so we will leave this discrepancy + // unresolved until VFS2 replaces this code. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 433cde9cb..23db93f33 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -197,11 +197,13 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { Start: start, Path: path, } - err := t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ - // TODO(gvisor.dev/issue/2324): The file permissions should be taken - // from s and t.FSContext().Umask() (see net/unix/af_unix.c:unix_bind), - // but VFS1 just always uses 0400. Resolve this inconsistency. - Mode: linux.S_IFSOCK | 0400, + stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) + if err != nil { + return syserr.FromError(err) + } + err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ + // File permissions correspond to net/unix/af_unix.c:unix_bind. + Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), Endpoint: bep, }) if err == syserror.EEXIST { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index b1a998590..981bd8caa 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -241,6 +241,9 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath if !rp.Final() { return nil, syserror.ENOTDIR } + if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { + return nil, err + } return nil, syserror.ECONNREFUSED } diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 70385a21f..1edd584c9 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -494,7 +494,13 @@ type FilesystemImpl interface { // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // - // - If a non-socket file exists at rp, then BoundEndpointAt returns ECONNREFUSED. + // Errors: + // + // - If the file does not have write permissions, then BoundEndpointAt + // returns EACCES. + // + // - If a non-socket file exists at rp, then BoundEndpointAt returns + // ECONNREFUSED. BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) // PrependPath prepends a path from vd to vd.Mount().Root() to b. diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index d9095c95f..837e56042 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -365,6 +365,7 @@ cc_binary( ":socket_test_util", "//test/util:file_descriptor", gtest, + "//test/util:temp_umask", "//test/util:test_main", "//test/util:test_util", ], diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index 3a07ac8d2..703d594a2 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -13,11 +13,14 @@ // limitations under the License. #include +#include +#include #include #include "gtest/gtest.h" #include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" +#include "test/util/temp_umask.h" #include "test/util/test_util.h" namespace gvisor { @@ -58,11 +61,69 @@ TEST(SocketTest, ProtocolInet) { } } +TEST(SocketTest, UnixSocketFileMode) { + // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It + // should pass in VFS2. + SKIP_IF(IsRunningOnGvisor()); + + FileDescriptor bound = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); + + // The permissions of the file created with bind(2) should be defined by the + // permissions of the bound socket and the umask. + mode_t sock_perm = 0765, mask = 0123; + ASSERT_THAT(fchmod(bound.get(), sock_perm), SyscallSucceeds()); + TempUmask m(mask); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX)); + ASSERT_THAT(bind(bound.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceeds()); + + struct stat statbuf = {}; + ASSERT_THAT(stat(addr.sun_path, &statbuf), SyscallSucceeds()); + EXPECT_EQ(statbuf.st_mode, S_IFSOCK | sock_perm & ~mask); +} + +TEST(SocketTest, UnixConnectNeedsWritePerm) { + // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It + // should succeed in VFS2. + SKIP_IF(IsRunningOnGvisor()); + + FileDescriptor bound = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(/*abstract=*/false, AF_UNIX)); + ASSERT_THAT(bind(bound.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceeds()); + ASSERT_THAT(listen(bound.get(), 1), SyscallSucceeds()); + + // Connect should fail without write perms. + ASSERT_THAT(chmod(addr.sun_path, 0500), SyscallSucceeds()); + FileDescriptor client = + ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_UNIX, SOCK_STREAM, PF_UNIX)); + EXPECT_THAT(connect(client.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallFailsWithErrno(EACCES)); + + // Connect should succeed with write perms. + ASSERT_THAT(chmod(addr.sun_path, 0200), SyscallSucceeds()); + EXPECT_THAT(connect(client.get(), reinterpret_cast(&addr), + sizeof(addr)), + SyscallSucceeds()); +} + using SocketOpenTest = ::testing::TestWithParam; // UDS cannot be opened. TEST_P(SocketOpenTest, Unix) { // FIXME(b/142001530): Open incorrectly succeeds on gVisor. + // + // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It + // should succeed in VFS2. SKIP_IF(IsRunningOnGvisor()); FileDescriptor bound = -- cgit v1.2.3 From 442fde405d86c555ac09994772c85ca15a3b4fc7 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 30 Apr 2020 09:46:36 -0700 Subject: Fix proc net bugs in VFS2. The /proc/net/udp header was missing, and /proc/sys/net was set up as /proc/sys/net/net. Discovered while trying to run networking tests for VFS2. PiperOrigin-RevId: 309243758 --- pkg/sentry/fsimpl/proc/task_net.go | 2 ++ pkg/sentry/fsimpl/proc/tasks_sys.go | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6595fcee6..9c329341a 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -511,6 +511,8 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) + buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops \n") + for _, se := range d.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index f08668ca2..0e90e02fe 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -112,9 +112,7 @@ func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "net": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents), - }) + return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for -- cgit v1.2.3 From bae30a0c69dbc2bf93db9598e1cf07ef8757a571 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 30 Apr 2020 14:18:31 -0700 Subject: Implement waiter.Waitable methods on VFS2 host inodes. This fixes bash in Ubuntu. Updates #1672. PiperOrigin-RevId: 309298252 --- pkg/sentry/fsimpl/host/host.go | 53 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 5 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 05538ea42..33bd205af 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -24,6 +24,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -35,6 +36,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) // filesystemType implements vfs.FilesystemType. @@ -88,11 +90,12 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs seekable := err != syserror.ESPIPE i := &inode{ - hostFD: hostFD, - seekable: seekable, - isTTY: isTTY, - canMap: canMap(uint32(fileType)), - ino: fs.NextIno(), + hostFD: hostFD, + seekable: seekable, + isTTY: isTTY, + canMap: canMap(uint32(fileType)), + wouldBlock: wouldBlock(uint32(fileType)), + ino: fs.NextIno(), // For simplicity, set offset to 0. Technically, we should use the existing // offset on the host if the file is seekable. offset: 0, @@ -103,6 +106,17 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped") } + // If the hostFD would block, we must set it to non-blocking and handle + // blocking behavior in the sentry. + if i.wouldBlock { + if err := syscall.SetNonblock(i.hostFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil { + return nil, err + } + } + d := &kernfs.Dentry{} d.Init(i) // i.open will take a reference on d. @@ -125,6 +139,12 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. + // + // This field is initialized at creation time and is immutable. + wouldBlock bool + // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return // EWOULDBLOCK for I/O operations. @@ -152,6 +172,9 @@ type inode struct { // offset specifies the current file offset. offset int64 + + // Event queue for blocking operations. + queue waiter.Queue } // Note that these flags may become out of date, since they can be modified @@ -354,6 +377,9 @@ func (i *inode) DecRef() { // Destroy implements kernfs.Inode. func (i *inode) Destroy() { + if i.wouldBlock { + fdnotifier.RemoveFD(int32(i.hostFD)) + } if err := unix.Close(i.hostFD); err != nil { log.Warningf("failed to close host fd %d: %v", i.hostFD, err) } @@ -614,3 +640,20 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. return syserror.ENODEV } + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.inode.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileDescription) EventUnregister(e *waiter.Entry) { + f.inode.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.inode.hostFD)) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. +func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) +} -- cgit v1.2.3 From 01beec3bb457a2a3a7313c7fe6dc795817f47746 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 30 Apr 2020 16:03:04 -0700 Subject: Add gofer.InternalFilesystemOptions.LeakConnection. PiperOrigin-RevId: 309317605 --- pkg/sentry/fsimpl/gofer/gofer.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3235816c2..2881c7bdd 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -72,7 +72,8 @@ type filesystem struct { mfp pgalloc.MemoryFileProvider // Immutable options. - opts filesystemOptions + opts filesystemOptions + iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. client *p9.Client @@ -209,6 +210,16 @@ const ( InteropModeShared ) +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +type InternalFilesystemOptions struct { + // If LeakConnection is true, do not close the connection to the server + // when the Filesystem is released. This is necessary for deployments in + // which servers can handle only a single client and report failure if that + // client disconnects. + LeakConnection bool +} + // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name @@ -347,6 +358,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } + // Handle internal options. + iopts, ok := opts.InternalData.(InternalFilesystemOptions) + if opts.InternalData != nil && !ok { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) + return nil, nil, syserror.EINVAL + } + // If !ok, iopts being the zero value is correct. + // Establish a connection with the server. conn, err := unet.NewSocket(fsopts.fd) if err != nil { @@ -383,6 +402,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs := &filesystem{ mfp: mfp, opts: fsopts, + iopts: iopts, uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, client: client, @@ -440,8 +460,10 @@ func (fs *filesystem) Release() { // fs. fs.syncMu.Unlock() - // Close the connection to the server. This implicitly clunks all fids. - fs.client.Close() + if !fs.iopts.LeakConnection { + // Close the connection to the server. This implicitly clunks all fids. + fs.client.Close() + } } // dentry implements vfs.DentryImpl. -- cgit v1.2.3 From 82bae30ceea0984c94af3085866b58ec9e69ea67 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 1 May 2020 12:53:15 -0700 Subject: Port netstack, hostinet, and netlink sockets to VFS2. All three follow the same pattern: 1. Refactor VFS1 sockets into socketOpsCommon, so that most of the methods can be shared with VFS2. 2. Create a FileDescriptionImpl with the corresponding socket operations, rewriting the few that cannot be shared with VFS1. 3. Set up a VFS2 socket provider that creates a socket by setting up a dentry in the global Kernel.socketMount and connecting it with a new FileDescription. This mostly completes the work for porting sockets to VFS2, and many syscall tests can be enabled as a result. There are several networking-related syscall tests that are still not passing: 1. net gofer tests 2. socketpair gofer tests 2. sendfile tests (splice is not implemented in VFS2 yet) Updates #1478, #1484, #1485 PiperOrigin-RevId: 309457331 --- pkg/sentry/fsimpl/gofer/socket.go | 9 +- pkg/sentry/fsimpl/host/host.go | 2 +- pkg/sentry/fsimpl/host/socket.go | 53 +++-- pkg/sentry/fsimpl/sockfs/sockfs.go | 5 + pkg/sentry/socket/hostinet/BUILD | 5 + pkg/sentry/socket/hostinet/socket.go | 105 ++++++--- pkg/sentry/socket/hostinet/socket_unsafe.go | 10 +- pkg/sentry/socket/hostinet/socket_vfs2.go | 183 ++++++++++++++++ pkg/sentry/socket/netlink/BUILD | 5 + pkg/sentry/socket/netlink/provider.go | 5 + pkg/sentry/socket/netlink/provider_vfs2.go | 71 ++++++ pkg/sentry/socket/netlink/socket.go | 72 +++--- pkg/sentry/socket/netlink/socket_vfs2.go | 138 ++++++++++++ pkg/sentry/socket/netstack/BUILD | 5 + pkg/sentry/socket/netstack/netstack.go | 72 +++--- pkg/sentry/socket/netstack/netstack_vfs2.go | 327 ++++++++++++++++++++++++++++ pkg/sentry/socket/netstack/provider.go | 4 + pkg/sentry/socket/netstack/provider_vfs2.go | 141 ++++++++++++ pkg/sentry/socket/unix/unix_vfs2.go | 4 +- 19 files changed, 1090 insertions(+), 126 deletions(-) create mode 100644 pkg/sentry/socket/hostinet/socket_vfs2.go create mode 100644 pkg/sentry/socket/netlink/provider_vfs2.go create mode 100644 pkg/sentry/socket/netlink/socket_vfs2.go create mode 100644 pkg/sentry/socket/netstack/netstack_vfs2.go create mode 100644 pkg/sentry/socket/netstack/provider_vfs2.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index 73835df91..d6dbe9092 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -87,7 +87,9 @@ func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.Connec returnConnect(c, c) ce.Unlock() - c.Init() + if err := c.Init(); err != nil { + return syserr.FromError(err) + } return nil } @@ -99,7 +101,10 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect if err != nil { return nil, err } - c.Init() + + if err := c.Init(); err != nil { + return nil, syserr.FromError(err) + } // We don't need the receiver. c.CloseRecv() diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 33bd205af..34fbc69af 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -411,7 +411,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F return nil, syserror.ENOTTY } - ep, err := newEndpoint(ctx, i.hostFD) + ep, err := newEndpoint(ctx, i.hostFD, &i.queue) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 39dd624a0..38f1fbfba 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -34,17 +34,16 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// Create a new host-backed endpoint from the given fd. -func newEndpoint(ctx context.Context, hostFD int) (transport.Endpoint, error) { +// Create a new host-backed endpoint from the given fd and its corresponding +// notification queue. +func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) { // Set up an external transport.Endpoint using the host fd. addr := fmt.Sprintf("hostfd:[%d]", hostFD) - var q waiter.Queue - e, err := NewConnectedEndpoint(ctx, hostFD, &q, addr, true /* saveable */) + e, err := NewConnectedEndpoint(ctx, hostFD, addr, true /* saveable */) if err != nil { return nil, err.ToError() } - e.Init() - ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), &q, e, e) + ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e) return ep, nil } @@ -77,8 +76,6 @@ type ConnectedEndpoint struct { // addr is the address at which this endpoint is bound. addr string - queue *waiter.Queue - // sndbuf is the size of the send buffer. // // N.B. When this is smaller than the host size, we present it via @@ -134,11 +131,10 @@ func (c *ConnectedEndpoint) init() *syserr.Error { // The caller is responsible for calling Init(). Additionaly, Release needs to // be called twice because ConnectedEndpoint is both a transport.Receiver and // transport.ConnectedEndpoint. -func NewConnectedEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { +func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) { e := ConnectedEndpoint{ - fd: hostFD, - addr: addr, - queue: queue, + fd: hostFD, + addr: addr, } if err := e.init(); err != nil { @@ -151,13 +147,6 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, return &e, nil } -// Init will do the initialization required without holding other locks. -func (c *ConnectedEndpoint) Init() { - if err := fdnotifier.AddFD(int32(c.fd), c.queue); err != nil { - panic(err) - } -} - // Send implements transport.ConnectedEndpoint.Send. func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() @@ -332,7 +321,6 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } func (c *ConnectedEndpoint) destroyLocked() { - fdnotifier.RemoveFD(int32(c.fd)) c.fd = -1 } @@ -350,14 +338,20 @@ func (c *ConnectedEndpoint) Release() { func (c *ConnectedEndpoint) CloseUnread() {} // SCMConnectedEndpoint represents an endpoint backed by a host fd that was -// passed through a gofer Unix socket. It is almost the same as -// ConnectedEndpoint, with the following differences: +// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the +// following differences: // - SCMConnectedEndpoint is not saveable, because the host cannot guarantee // the same descriptor number across S/R. -// - SCMConnectedEndpoint holds ownership of its fd and is responsible for -// closing it. +// - SCMConnectedEndpoint holds ownership of its fd and notification queue. type SCMConnectedEndpoint struct { ConnectedEndpoint + + queue *waiter.Queue +} + +// Init will do the initialization required without holding other locks. +func (e *SCMConnectedEndpoint) Init() error { + return fdnotifier.AddFD(int32(e.fd), e.queue) } // Release implements transport.ConnectedEndpoint.Release and @@ -368,6 +362,7 @@ func (e *SCMConnectedEndpoint) Release() { if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) } + fdnotifier.RemoveFD(int32(e.fd)) e.destroyLocked() e.mu.Unlock() }) @@ -380,11 +375,13 @@ func (e *SCMConnectedEndpoint) Release() { // be called twice because ConnectedEndpoint is both a transport.Receiver and // transport.ConnectedEndpoint. func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) { - e := SCMConnectedEndpoint{ConnectedEndpoint{ - fd: hostFD, - addr: addr, + e := SCMConnectedEndpoint{ + ConnectedEndpoint: ConnectedEndpoint{ + fd: hostFD, + addr: addr, + }, queue: queue, - }} + } if err := e.init(); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 271134af8..dac2389fc 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -74,6 +74,11 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } // NewDentry constructs and returns a sockfs dentry. +// +// TODO(gvisor.dev/issue/1476): Currently, we are using +// sockfs.filesystem.NextIno() to get inode numbers. We should use +// device-specific numbers, so that we are not using the same generator for +// netstack, unix, etc. func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 023bad156..deedd35f7 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -10,6 +10,7 @@ go_library( "save_restore.go", "socket.go", "socket_unsafe.go", + "socket_vfs2.go", "sockopt_impl.go", "stack.go", ], @@ -25,11 +26,15 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/hostfd", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", "//pkg/sentry/socket", "//pkg/sentry/socket/control", + "//pkg/sentry/vfs", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 22f78d2e2..b49433326 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -49,6 +50,8 @@ const ( maxControlLen = 1024 ) +// LINT.IfChange + // socketOperations implements fs.FileOperations and socket.Socket for a socket // implemented using a host socket. type socketOperations struct { @@ -59,23 +62,37 @@ type socketOperations struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout family int // Read-only. stype linux.SockType // Read-only. protocol int // Read-only. - fd int // must be O_NONBLOCK queue waiter.Queue + + // fd is the host socket fd. It must have O_NONBLOCK, so that operations + // will return EWOULDBLOCK instead of blocking on the host. This allows us to + // handle blocking behavior independently in the sentry. + fd int } var _ = socket.Socket(&socketOperations{}) func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) { s := &socketOperations{ - family: family, - stype: stype, - protocol: protocol, - fd: fd, + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, } if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) @@ -86,28 +103,33 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc } // Release implements fs.FileOperations.Release. -func (s *socketOperations) Release() { +func (s *socketOpsCommon) Release() { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } // Readiness implements waiter.Waitable.Readiness. -func (s *socketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(s.fd), mask) } // EventRegister implements waiter.Waitable.EventRegister. -func (s *socketOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.queue.EventRegister(e, mask) fdnotifier.UpdateFD(int32(s.fd)) } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *socketOperations) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.queue.EventUnregister(e) fdnotifier.UpdateFD(int32(s.fd)) } +// Ioctl implements fs.FileOperations.Ioctl. +func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, io, args) +} + // Read implements fs.FileOperations.Read. func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { @@ -155,7 +177,7 @@ func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO } // Connect implements socket.Socket.Connect. -func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -195,7 +217,7 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo } // Accept implements socket.Socket.Accept. -func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr linux.SockAddr var peerAddrBuf []byte var peerAddrlen uint32 @@ -209,7 +231,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } // Conservatively ignore all flags specified by the application and add - // SOCK_NONBLOCK since socketOperations requires it. + // SOCK_NONBLOCK since socketOpsCommon requires it. fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC) if blocking { var ch chan struct{} @@ -235,23 +257,41 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr) } - f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) - if err != nil { - syscall.Close(fd) - return 0, nil, 0, err - } - defer f.DecRef() + var ( + kfd int32 + kerr error + ) + if kernel.VFS2Enabled { + f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&syscall.SOCK_NONBLOCK)) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() - kfd, kerr := t.NewFDFrom(0, f, kernel.FDFlags{ - CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, - }) - t.Kernel().RecordSocket(f) + kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocketVFS2(f) + } else { + f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&syscall.SOCK_NONBLOCK != 0) + if err != nil { + syscall.Close(fd) + return 0, nil, 0, err + } + defer f.DecRef() + + kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ + CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, + }) + t.Kernel().RecordSocket(f) + } return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr) } // Bind implements socket.Socket.Bind. -func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) > sizeofSockaddr { sockaddr = sockaddr[:sizeofSockaddr] } @@ -264,12 +304,12 @@ func (s *socketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Listen implements socket.Socket.Listen. -func (s *socketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.FromError(syscall.Listen(s.fd, backlog)) } // Shutdown implements socket.Socket.Shutdown. -func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { switch how { case syscall.SHUT_RD, syscall.SHUT_WR, syscall.SHUT_RDWR: return syserr.FromError(syscall.Shutdown(s.fd, how)) @@ -279,7 +319,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { if outLen < 0 { return nil, syserr.ErrInvalidArgument } @@ -328,7 +368,7 @@ func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPt } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { // Whitelist options and constrain option length. optlen := setSockOptLen(t, level, name) switch level { @@ -374,7 +414,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [ } // RecvMsg implements socket.Socket.RecvMsg. -func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { // Whitelist flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary @@ -496,7 +536,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags } // SendMsg implements socket.Socket.SendMsg. -func (s *socketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Whitelist flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument @@ -585,7 +625,7 @@ func translateIOSyscallError(err error) error { } // State implements socket.Socket.State. -func (s *socketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { info := linux.TCPInfo{} buf, err := getsockopt(s.fd, syscall.SOL_TCP, syscall.TCP_INFO, linux.SizeOfTCPInfo) if err != nil { @@ -607,7 +647,7 @@ func (s *socketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *socketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.stype, s.protocol } @@ -663,8 +703,11 @@ func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int return nil, nil, nil } +// LINT.ThenChange(./socket_vfs2.go) + func init() { for _, family := range []int{syscall.AF_INET, syscall.AF_INET6} { socket.RegisterProvider(family, &socketProvider{family}) + socket.RegisterProviderVFS2(family, &socketProviderVFS2{}) } } diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go index cd67234d2..3f420c2ec 100644 --- a/pkg/sentry/socket/hostinet/socket_unsafe.go +++ b/pkg/sentry/socket/hostinet/socket_unsafe.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/syserr" @@ -54,12 +53,11 @@ func writev(fd int, srcs []syscall.Iovec) (uint64, error) { return uint64(n), nil } -// Ioctl implements fs.FileOperations.Ioctl. -func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func ioctl(ctx context.Context, fd int, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := uintptr(args[1].Int()); cmd { case syscall.TIOCINQ, syscall.TIOCOUTQ: var val int32 - if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(s.fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { + if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), cmd, uintptr(unsafe.Pointer(&val))); errno != 0 { return 0, translateIOSyscallError(errno) } var buf [4]byte @@ -93,7 +91,7 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) { } // GetSockName implements socket.Socket.GetSockName. -func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) @@ -104,7 +102,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, } // GetPeerName implements socket.Socket.GetPeerName. -func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr := make([]byte, sizeofSockaddr) addrlen := uint32(len(addr)) _, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen))) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go new file mode 100644 index 000000000..b03ca2f26 --- /dev/null +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -0,0 +1,183 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package hostinet + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +type socketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + // TODO(gvisor.dev/issue/1484): VFS1 stores internal metadata for hostinet. + // We should perhaps rely on the host, much like in hostfs. + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ = socket.SocketVFS2(&socketVFS2{}) + +func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + + s := &socketVFS2{ + socketOpsCommon: socketOpsCommon{ + family: family, + stype: stype, + protocol: protocol, + fd: fd, + }, + } + if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { + return nil, syserr.FromError(err) + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *socketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return ioctl(ctx, s.fd, uio, args) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := dst.CopyOutFrom(ctx, reader) + hostfd.PutReadWriterAt(reader) + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags) + n, err := src.CopyInTo(ctx, writer) + hostfd.PutReadWriterAt(writer) + return int64(n), err +} + +type socketProviderVFS2 struct { + family int +} + +// Socket implements socket.ProviderVFS2.Socket. +func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Check that we are using the host network stack. + stack := t.NetworkContext() + if stack == nil { + return nil, nil + } + if _, ok := stack.(*Stack); !ok { + return nil, nil + } + + // Only accept TCP and UDP. + stype := stypeflags & linux.SOCK_TYPE_MASK + switch stype { + case syscall.SOCK_STREAM: + switch protocol { + case 0, syscall.IPPROTO_TCP: + // ok + default: + return nil, nil + } + case syscall.SOCK_DGRAM: + switch protocol { + case 0, syscall.IPPROTO_UDP: + // ok + default: + return nil, nil + } + default: + return nil, nil + } + + // Conservatively ignore all flags specified by the application and add + // SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0 + // to simplify the syscall filters, since 0 and IPPROTO_* are equivalent. + fd, err := syscall.Socket(p.family, int(stype)|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + return nil, syserr.FromError(err) + } + return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&syscall.SOCK_NONBLOCK)) +} + +// Pair implements socket.Provider.Pair. +func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Not supported by AF_INET/AF_INET6. + return nil, nil, nil +} diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 1911cd9b8..09ca00a4a 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "message.go", "provider.go", + "provider_vfs2.go", "socket.go", + "socket_vfs2.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -18,6 +20,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -25,6 +29,7 @@ go_library( "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index b0dc70e5c..0d45e5053 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -67,6 +67,8 @@ func RegisterProvider(protocol int, provider Provider) { protocols[protocol] = provider } +// LINT.IfChange + // socketProvider implements socket.Provider. type socketProvider struct { } @@ -105,7 +107,10 @@ func (*socketProvider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.Fi return nil, nil, syserr.ErrNotSupported } +// LINT.ThenChange(./provider_vfs2.go) + // init registers the socket provider. func init() { socket.RegisterProvider(linux.AF_NETLINK, &socketProvider{}) + socket.RegisterProviderVFS2(linux.AF_NETLINK, &socketProviderVFS2{}) } diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go new file mode 100644 index 000000000..dcd92b5cd --- /dev/null +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -0,0 +1,71 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" +) + +// socketProviderVFS2 implements socket.Provider. +type socketProviderVFS2 struct { +} + +// Socket implements socket.Provider.Socket. +func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Netlink sockets must be specified as datagram or raw, but they + // behave the same regardless of type. + if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW { + return nil, syserr.ErrSocketNotSupported + } + + provider, ok := protocols[protocol] + if !ok { + return nil, syserr.ErrProtocolNotSupported + } + + p, err := provider(t) + if err != nil { + return nil, err + } + + s, err := NewVFS2(t, stype, p) + if err != nil { + return nil, err + } + + vfsfd := &s.vfsfd + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Pair implements socket.Provider.Pair by returning an error. +func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + // Netlink sockets never supports creating socket pairs. + return nil, nil, syserr.ErrNotSupported +} diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 2ca02567d..81f34c5a2 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -58,6 +58,8 @@ var errNoFilter = syserr.New("no filter attached", linux.ENOENT) // netlinkSocketDevice is the netlink socket virtual device. var netlinkSocketDevice = device.NewAnonDevice() +// LINT.IfChange + // Socket is the base socket type for netlink sockets. // // This implementation only supports userspace sending and receiving messages @@ -74,6 +76,14 @@ type Socket struct { fsutil.FileNoSplice `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout // ports provides netlink port allocation. @@ -140,17 +150,19 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } return &Socket{ - ports: t.Kernel().NetlinkPorts(), - protocol: protocol, - skType: skType, - ep: ep, - connection: connection, - sendBufferSize: defaultSendBufferSize, + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, }, nil } // Release implements fs.FileOperations.Release. -func (s *Socket) Release() { +func (s *socketOpsCommon) Release() { s.connection.Release() s.ep.Close() @@ -160,7 +172,7 @@ func (s *Socket) Release() { } // Readiness implements waiter.Waitable.Readiness. -func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { // ep holds messages to be read and thus handles EventIn readiness. ready := s.ep.Readiness(mask) @@ -174,18 +186,18 @@ func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask { } // EventRegister implements waiter.Waitable.EventRegister. -func (s *Socket) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) // Writable readiness never changes, so no registration is needed. } // EventUnregister implements waiter.Waitable.EventUnregister. -func (s *Socket) EventUnregister(e *waiter.Entry) { +func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // Passcred implements transport.Credentialer.Passcred. -func (s *Socket) Passcred() bool { +func (s *socketOpsCommon) Passcred() bool { s.mu.Lock() passcred := s.passcred s.mu.Unlock() @@ -193,7 +205,7 @@ func (s *Socket) Passcred() bool { } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. -func (s *Socket) ConnectedPasscred() bool { +func (s *socketOpsCommon) ConnectedPasscred() bool { // This socket is connected to the kernel, which doesn't need creds. // // This is arbitrary, as ConnectedPasscred on this type has no callers. @@ -227,7 +239,7 @@ func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) { // port of 0 defaults to the ThreadGroup ID. // // Preconditions: mu is held. -func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { +func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error { if s.bound { // Re-binding is only allowed if the port doesn't change. if port != s.portID { @@ -251,7 +263,7 @@ func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error { } // Bind implements socket.Socket.Bind. -func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -269,7 +281,7 @@ func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { } // Connect implements socket.Socket.Connect. -func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { a, err := ExtractSockAddr(sockaddr) if err != nil { return err @@ -300,25 +312,25 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr } // Accept implements socket.Socket.Accept. -func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { // Netlink sockets never support accept. return 0, nil, 0, syserr.ErrNotSupported } // Listen implements socket.Socket.Listen. -func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { // Netlink sockets never support listen. return syserr.ErrNotSupported } // Shutdown implements socket.Socket.Shutdown. -func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { // Netlink sockets never support shutdown. return syserr.ErrNotSupported } // GetSockOpt implements socket.Socket.GetSockOpt. -func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { +func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { switch level { case linux.SOL_SOCKET: switch name { @@ -369,7 +381,7 @@ func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem. } // SetSockOpt implements socket.Socket.SetSockOpt. -func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { +func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: switch name { @@ -466,7 +478,7 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy } // GetSockName implements socket.Socket.GetSockName. -func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { s.mu.Lock() defer s.mu.Unlock() @@ -478,7 +490,7 @@ func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // GetPeerName implements socket.Socket.GetPeerName. -func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { sa := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, // TODO(b/68878065): Support non-kernel peers. For now the peer @@ -489,7 +501,7 @@ func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Er } // RecvMsg implements socket.Socket.RecvMsg. -func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { from := &linux.SockAddrNetlink{ Family: linux.AF_NETLINK, PortID: 0, @@ -590,7 +602,7 @@ func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) var kernelCreds = &kernelSCM{} // sendResponse sends the response messages in ms back to userspace. -func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { +func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error { // Linux combines multiple netlink messages into a single datagram. bufs := make([][]byte, 0, len(ms.Messages)) for _, m := range ms.Messages { @@ -666,7 +678,7 @@ func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { // processMessages handles each message in buf, passing it to the protocol // handler for final handling. -func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error { +func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { msg, rest, ok := ParseMessage(buf) if !ok { @@ -698,7 +710,7 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error } // sendMsg is the core of message send, used for SendMsg and Write. -func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { @@ -745,7 +757,7 @@ func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, } // SendMsg implements socket.Socket.SendMsg. -func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } @@ -756,11 +768,13 @@ func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, } // State implements socket.Socket.State. -func (s *Socket) State() uint32 { +func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. -func (s *Socket) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return linux.AF_NETLINK, s.skType, s.protocol.Protocol() } + +// LINT.ThenChange(./socket_vfs2.go) diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go new file mode 100644 index 000000000..b854bf990 --- /dev/null +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -0,0 +1,138 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netlink + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/socket/unix" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 is the base VFS2 socket type for netlink sockets. +// +// This implementation only supports userspace sending and receiving messages +// to/from the kernel. +// +// SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +var _ socket.SocketVFS2 = (*SocketVFS2)(nil) +var _ transport.Credentialer = (*SocketVFS2)(nil) + +// NewVFS2 creates a new SocketVFS2. +func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) { + // Datagram endpoint used to buffer kernel -> user messages. + ep := transport.NewConnectionless(t) + + // Bind the endpoint for good measure so we can connect to it. The + // bound address will never be exposed. + if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { + ep.Close() + return nil, err + } + + // Create a connection from which the kernel can write messages. + connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) + if err != nil { + ep.Close() + return nil, err + } + + return &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + ports: t.Kernel().NetlinkPorts(), + protocol: protocol, + skType: skType, + ep: ep, + connection: connection, + sendBufferSize: defaultSendBufferSize, + }, + }, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { + // TODO(b/68878065): no ioctls supported. + return 0, syserror.ENOTTY +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + return dst.CopyOutFrom(ctx, &unix.EndpointReader{ + Endpoint: s.ep, + }) +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) + return int64(n), err.ToError() +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index cbf46b1e9..ccf9fcf5c 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -7,7 +7,9 @@ go_library( srcs = [ "device.go", "netstack.go", + "netstack_vfs2.go", "provider.go", + "provider_vfs2.go", "save_restore.go", "stack.go", ], @@ -25,6 +27,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fsimpl/kernfs", + "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -32,6 +36,7 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index d5879c10f..81053d8ef 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -252,6 +252,8 @@ type commonEndpoint interface { GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) } +// LINT.IfChange + // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. // @@ -263,6 +265,14 @@ type SocketOperations struct { fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` + + socketOpsCommon +} + +// socketOpsCommon contains the socket operations common to VFS1 and VFS2. +// +// +stateify savable +type socketOpsCommon struct { socket.SendReceiveTimeout *waiter.Queue @@ -314,11 +324,13 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue dirent := socket.NewDirent(t, netstackDevice) defer dirent.DecRef() return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ - Queue: queue, - family: family, - Endpoint: endpoint, - skType: skType, - protocol: protocol, + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, }), nil } @@ -417,7 +429,7 @@ func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { } } -func (s *SocketOperations) isPacketBased() bool { +func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } @@ -425,7 +437,7 @@ func (s *SocketOperations) isPacketBased() bool { // empty. It assumes that the socket is locked. // // Precondition: s.readMu must be held. -func (s *SocketOperations) fetchReadView() *syserr.Error { +func (s *socketOpsCommon) fetchReadView() *syserr.Error { if len(s.readView) > 0 { return nil } @@ -446,7 +458,7 @@ func (s *SocketOperations) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *SocketOperations) Release() { +func (s *socketOpsCommon) Release() { s.Endpoint.Close() } @@ -633,7 +645,7 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader } // Readiness returns a mask of ready events for socket s. -func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { +func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { r := s.Endpoint.Readiness(mask) // Check our cached value iff the caller asked for readability and the @@ -647,7 +659,7 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask { return r } -func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error { +func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { if family == uint16(s.family) { return nil } @@ -670,7 +682,7 @@ func (s *SocketOperations) checkFamily(family uint16, exact bool) *syserr.Error // represented by the empty string. // // TODO(gvisor.dev/issue/1556): remove this function. -func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { +func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" } @@ -679,7 +691,7 @@ func (s *SocketOperations) mapFamily(addr tcpip.FullAddress, family uint16) tcpi // Connect implements the linux syscall connect(2) for sockets backed by // tpcip.Endpoint. -func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { +func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { addr, family, err := AddressAndFamily(sockaddr) if err != nil { return err @@ -725,7 +737,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { +func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } @@ -771,13 +783,13 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Listen(t *kernel.Task, backlog int) *syserr.Error { +func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accept, it will block until one becomes ready. -func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { +func (s *socketOpsCommon) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventIn) @@ -863,7 +875,7 @@ func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error { +func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err @@ -2258,7 +2270,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2270,7 +2282,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { +func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) @@ -2285,7 +2297,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, // caller. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { +func (s *socketOpsCommon) coalescingRead(ctx context.Context, dst usermem.IOSequence, discard bool) (int, *syserr.Error) { var err *syserr.Error var copied int @@ -2337,7 +2349,7 @@ func (s *SocketOperations) coalescingRead(ctx context.Context, dst usermem.IOSeq return 0, err } -func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { +func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { if !s.sockOptInq { return } @@ -2352,7 +2364,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) { // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. -func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { +func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() // Fast path for regular reads from stream (e.g., TCP) endpoints. Note @@ -2461,7 +2473,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe return n, flags, addr, addrLen, cmsg, syserr.FromError(err) } -func (s *SocketOperations) controlMessages() socket.ControlMessages { +func (s *socketOpsCommon) controlMessages() socket.ControlMessages { return socket.ControlMessages{ IP: tcpip.ControlMessages{ HasTimestamp: s.readCM.HasTimestamp && s.sockOptTimestamp, @@ -2480,7 +2492,7 @@ func (s *SocketOperations) controlMessages() socket.ControlMessages { // successfully writing packet data out to userspace. // // Precondition: s.readMu must be locked. -func (s *SocketOperations) updateTimestamp() { +func (s *socketOpsCommon) updateTimestamp() { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true @@ -2490,7 +2502,7 @@ func (s *SocketOperations) updateTimestamp() { // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { +func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 @@ -2558,7 +2570,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. -func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { +func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument @@ -2634,6 +2646,10 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to [] // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, io, args) +} + +func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. @@ -2973,7 +2989,7 @@ func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. -func (s *SocketOperations) State() uint32 { +func (s *socketOpsCommon) State() uint32 { if s.family != linux.AF_INET && s.family != linux.AF_INET6 { // States not implemented for this socket's family. return 0 @@ -3033,6 +3049,8 @@ func (s *SocketOperations) State() uint32 { } // Type implements socket.Socket.Type. -func (s *SocketOperations) Type() (family int, skType linux.SockType, protocol int) { +func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.skType, s.protocol } + +// LINT.ThenChange(./netstack_vfs2.go) diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go new file mode 100644 index 000000000..eec71035d --- /dev/null +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -0,0 +1,327 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SocketVFS2 encapsulates all the state needed to represent a network stack +// endpoint in the kernel context. +type SocketVFS2 struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + socketOpsCommon +} + +// NewVFS2 creates a new endpoint socket. +func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { + if skType == linux.SOCK_STREAM { + if err := endpoint.SetSockOptBool(tcpip.DelayOption, true); err != nil { + return nil, syserr.TranslateNetstackError(err) + } + } + + mnt := t.Kernel().SocketMount() + fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + + s := &SocketVFS2{ + socketOpsCommon: socketOpsCommon{ + Queue: queue, + family: family, + Endpoint: endpoint, + skType: skType, + protocol: protocol, + }, + } + vfsfd := &s.vfsfd + if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ + DenyPRead: true, + DenyPWrite: true, + UseDentryMetadata: true, + }); err != nil { + return nil, syserr.FromError(err) + } + return vfsfd, nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { + return s.socketOpsCommon.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + s.socketOpsCommon.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { + s.socketOpsCommon.EventUnregister(e) +} + +// PRead implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Read implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + if dst.NumBytes() == 0 { + return 0, nil + } + n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) + if err == syserr.ErrWouldBlock { + return int64(n), syserror.ErrWouldBlock + } + if err != nil { + return 0, err.ToError() + } + return int64(n), nil +} + +// PWrite implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // All flags other than RWF_NOWAIT should be ignored. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. + if opts.Flags != 0 { + return 0, syserror.EOPNOTSUPP + } + + f := &ioSequencePayload{ctx: ctx, src: src} + n, resCh, err := s.Endpoint.Write(f, tcpip.WriteOptions{}) + if err == tcpip.ErrWouldBlock { + return 0, syserror.ErrWouldBlock + } + + if resCh != nil { + t := kernel.TaskFromContext(ctx) + if err := t.Block(resCh); err != nil { + return 0, syserr.FromError(err).ToError() + } + + n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{}) + } + + if err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + + if int64(n) < src.NumBytes() { + return int64(n), syserror.ErrWouldBlock + } + + return int64(n), nil +} + +// Accept implements the linux syscall accept(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { + // Issue the accept request to get the new endpoint. + ep, wq, terr := s.Endpoint.Accept() + if terr != nil { + if terr != tcpip.ErrWouldBlock || !blocking { + return 0, nil, 0, syserr.TranslateNetstackError(terr) + } + + var err *syserr.Error + ep, wq, err = s.blockingAccept(t) + if err != nil { + return 0, nil, 0, err + } + } + + ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep) + if err != nil { + return 0, nil, 0, err + } + defer ns.DecRef() + + if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { + return 0, nil, 0, syserr.FromError(err) + } + + var addr linux.SockAddr + var addrLen uint32 + if peerRequested { + // Get address of the peer and write it to peer slice. + var err *syserr.Error + addr, addrLen, err = ns.Impl().(*SocketVFS2).GetPeerName(t) + if err != nil { + return 0, nil, 0, err + } + } + + fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ + CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, + }) + + t.Kernel().RecordSocketVFS2(ns) + + return fd, addr, addrLen, syserr.FromError(e) +} + +// Ioctl implements vfs.FileDescriptionImpl. +func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return s.socketOpsCommon.ioctl(ctx, uio, args) +} + +// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptTimestamp { + val = 1 + } + return val, nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if outLen < sizeOfInt32 { + return nil, syserr.ErrInvalidArgument + } + val := int32(0) + s.readMu.Lock() + defer s.readMu.Unlock() + if s.sockOptInq { + val = 1 + } + return val, nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_GET_INFO: + if outLen < linux.SizeOfIPTGetinfo { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr) + if err != nil { + return nil, err + } + return info, nil + + case linux.IPT_SO_GET_ENTRIES: + if outLen < linux.SizeOfIPTGetEntries { + return nil, syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return nil, syserr.ErrNoDevice + } + entries, err := netfilter.GetEntries(t, stack.(*Stack).Stack, outPtr, outLen) + if err != nil { + return nil, err + } + return entries, nil + + } + } + + return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen) +} + +// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by +// tcpip.Endpoint. +func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { + // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is + // implemented specifically for netstack.SocketVFS2 rather than + // commonEndpoint. commonEndpoint should be extended to support socket + // options where the implementation is not shared, as unix sockets need + // their own support for SO_TIMESTAMP. + if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptTimestamp = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + if level == linux.SOL_TCP && name == linux.TCP_INQ { + if len(optVal) < sizeOfInt32 { + return syserr.ErrInvalidArgument + } + s.readMu.Lock() + defer s.readMu.Unlock() + s.sockOptInq = usermem.ByteOrder.Uint32(optVal) != 0 + return nil + } + + if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP { + switch name { + case linux.IPT_SO_SET_REPLACE: + if len(optVal) < linux.SizeOfIPTReplace { + return syserr.ErrInvalidArgument + } + + stack := inet.StackFromContext(t) + if stack == nil { + return syserr.ErrNoDevice + } + // Stack must be a netstack stack. + return netfilter.SetEntries(stack.(*Stack).Stack, optVal) + + case linux.IPT_SO_SET_ADD_COUNTERS: + // TODO(gvisor.dev/issue/170): Counter support. + return nil + } + } + + return SetSockOpt(t, s, s.Endpoint, level, name, optVal) +} diff --git a/pkg/sentry/socket/netstack/provider.go b/pkg/sentry/socket/netstack/provider.go index c3f04b613..ead3b2b79 100644 --- a/pkg/sentry/socket/netstack/provider.go +++ b/pkg/sentry/socket/netstack/provider.go @@ -33,6 +33,8 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// LINT.IfChange + // provider is an inet socket provider. type provider struct { family int @@ -167,6 +169,8 @@ func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol return New(t, linux.AF_PACKET, stype, protocol, wq, ep) } +// LINT.ThenChange(./provider_vfs2.go) + // Pair just returns nil sockets (not supported). func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) { return nil, nil, nil diff --git a/pkg/sentry/socket/netstack/provider_vfs2.go b/pkg/sentry/socket/netstack/provider_vfs2.go new file mode 100644 index 000000000..2a01143f6 --- /dev/null +++ b/pkg/sentry/socket/netstack/provider_vfs2.go @@ -0,0 +1,141 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package netstack + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserr" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/waiter" +) + +// providerVFS2 is an inet socket provider. +type providerVFS2 struct { + family int + netProto tcpip.NetworkProtocolNumber +} + +// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET +// family. +func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Fail right away if we don't have a stack. + stack := t.NetworkContext() + if stack == nil { + // Don't propagate an error here. Instead, allow the socket + // code to continue searching for another provider. + return nil, nil + } + eps, ok := stack.(*Stack) + if !ok { + return nil, nil + } + + // Packet sockets are handled separately, since they are neither INET + // nor INET6 specific. + if p.family == linux.AF_PACKET { + return packetSocketVFS2(t, eps, stype, protocol) + } + + // Figure out the transport protocol. + transProto, associated, err := getTransportProtocol(t, stype, protocol) + if err != nil { + return nil, err + } + + // Create the endpoint. + var ep tcpip.Endpoint + var e *tcpip.Error + wq := &waiter.Queue{} + if stype == linux.SOCK_RAW { + ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated) + } else { + ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq) + + // Assign task to PacketOwner interface to get the UID and GID for + // iptables owner matching. + if e == nil { + ep.SetOwner(t) + } + } + if e != nil { + return nil, syserr.TranslateNetstackError(e) + } + + return NewVFS2(t, p.family, stype, int(transProto), wq, ep) +} + +func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { + // Packet sockets require CAP_NET_RAW. + creds := auth.CredentialsFromContext(t) + if !creds.HasCapability(linux.CAP_NET_RAW) { + return nil, syserr.ErrNotPermitted + } + + // "cooked" packets don't contain link layer information. + var cooked bool + switch stype { + case linux.SOCK_DGRAM: + cooked = true + case linux.SOCK_RAW: + cooked = false + default: + return nil, syserr.ErrProtocolNotSupported + } + + // protocol is passed in network byte order, but netstack wants it in + // host order. + netProto := tcpip.NetworkProtocolNumber(ntohs(uint16(protocol))) + + wq := &waiter.Queue{} + ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq) + if err != nil { + return nil, syserr.TranslateNetstackError(err) + } + + return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep) +} + +// Pair just returns nil sockets (not supported). +func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { + return nil, nil, nil +} + +// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET. +func init() { + // Providers backed by netstack. + p := []providerVFS2{ + { + family: linux.AF_INET, + netProto: ipv4.ProtocolNumber, + }, + { + family: linux.AF_INET6, + netProto: ipv6.ProtocolNumber, + }, + { + family: linux.AF_PACKET, + }, + } + + for i := range p { + socket.RegisterProviderVFS2(p[i].family, &p[i]) + } +} diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 23db93f33..5edc3cdf4 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -229,7 +229,7 @@ func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset i // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. - // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } @@ -254,7 +254,7 @@ func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. - // TODO(gvisor.dev/issue/1476): Support RWF_NOWAIT. + // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } -- cgit v1.2.3 From cbc5bef2a66ece1f9e63b213d4dfa616db488df8 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 May 2020 10:58:01 -0700 Subject: Add TTY support on VFS2 to runsc Updates #1623, #1487 PiperOrigin-RevId: 309777922 --- pkg/sentry/control/BUILD | 3 + pkg/sentry/control/proc.go | 98 +++++++----------- pkg/sentry/fdimport/BUILD | 19 ++++ pkg/sentry/fdimport/fdimport.go | 129 ++++++++++++++++++++++++ pkg/sentry/fsimpl/host/host.go | 2 +- pkg/sentry/fsimpl/host/tty.go | 26 ++--- pkg/sentry/vfs/context.go | 24 +++++ pkg/sentry/vfs/epoll.go | 1 + runsc/boot/BUILD | 3 +- runsc/boot/fds.go | 108 -------------------- runsc/boot/loader.go | 204 +++++++++++++++++++++++++------------- runsc/boot/vfs.go | 42 +++----- runsc/container/container_test.go | 107 ++++++++++---------- 13 files changed, 434 insertions(+), 332 deletions(-) create mode 100644 pkg/sentry/fdimport/BUILD create mode 100644 pkg/sentry/fdimport/fdimport.go delete mode 100644 runsc/boot/fds.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/control/BUILD b/pkg/sentry/control/BUILD index d16d78aa5..e74275d2d 100644 --- a/pkg/sentry/control/BUILD +++ b/pkg/sentry/control/BUILD @@ -20,9 +20,11 @@ go_library( "//pkg/fd", "//pkg/fspath", "//pkg/log", + "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/host", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/host", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/time", @@ -36,6 +38,7 @@ go_library( "//pkg/syserror", "//pkg/tcpip/link/sniffer", "//pkg/urpc", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index b51fb3959..2ed17ee09 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -24,13 +24,16 @@ import ( "text/tabwriter" "time" + "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fsbridge" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -84,15 +87,13 @@ type ExecArgs struct { // the root group if not set explicitly. KGID auth.KGID - // ExtraKGIDs is the list of additional groups to which the user - // belongs. + // ExtraKGIDs is the list of additional groups to which the user belongs. ExtraKGIDs []auth.KGID // Capabilities is the list of capabilities to give to the process. Capabilities *auth.TaskCapabilities - // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host - // pty FD. + // StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host pty FD. StdioIsPty bool // FilePayload determines the files to give to the new process. @@ -117,7 +118,7 @@ func (args ExecArgs) String() string { // Exec runs a new task. func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { - newTG, _, _, err := proc.execAsync(args) + newTG, _, _, _, err := proc.execAsync(args) if err != nil { return err } @@ -130,26 +131,18 @@ func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error { // ExecAsync runs a new task, but doesn't wait for it to finish. It is defined // as a function rather than a method to avoid exposing execAsync as an RPC. -func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { +func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { return proc.execAsync(args) } // execAsync runs a new task, but doesn't wait for it to finish. It returns the // newly created thread group and its PID. If the stdio FDs are TTYs, then a // TTYFileOperations that wraps the TTY is also returned. -func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) { +func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() defer fdTable.DecRef() - // No matter what happens, we should close all files in the FilePayload - // before returning. Any files that are imported will be duped. - defer func() { - for _, f := range args.FilePayload.Files { - f.Close() - } - }() - creds := auth.NewUserCredentials( args.KUID, args.KGID, @@ -202,7 +195,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI vfsObj := proc.Kernel.VFS() file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.File = fsbridge.NewVFSFile(file) } else { @@ -214,74 +207,57 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI // initArgs must hold a reference on MountNamespace, which will // be donated to the new process in CreateProcess. - initArgs.MountNamespaceVFS2.IncRef() + initArgs.MountNamespace.IncRef() } f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths) if err != nil { - return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) + return nil, 0, nil, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err) } initArgs.Filename = f } } - // TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2. - var ttyFile *fs.File - for appFD, hostFile := range args.FilePayload.Files { - var appFile *fs.File - - if args.StdioIsPty && appFD < 3 { - // Import the file as a host TTY file. - if ttyFile == nil { - var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), true /* isTTY */) - if err != nil { - return nil, 0, nil, err - } - defer appFile.DecRef() - - // Remember this in the TTY file, as we will - // use it for the other stdio FDs. - ttyFile = appFile - } else { - // Re-use the existing TTY file, as all three - // stdio FDs must point to the same fs.File in - // order to share TTY state, specifically the - // foreground process group id. - appFile = ttyFile - } - } else { - // Import the file as a regular host file. - var err error - appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), false /* isTTY */) + fds := make([]int, len(args.FilePayload.Files)) + for i, file := range args.FilePayload.Files { + if kernel.VFS2Enabled { + // Need to dup to remove ownership from os.File. + dup, err := unix.Dup(int(file.Fd())) if err != nil { - return nil, 0, nil, err + return nil, 0, nil, nil, fmt.Errorf("duplicating payload files: %w", err) } - defer appFile.DecRef() + fds[i] = dup + } else { + // VFS1 dups the file on import. + fds[i] = int(file.Fd()) } - - // Add the file to the FD map. - if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - return nil, 0, nil, err + } + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, args.StdioIsPty, fds) + if err != nil { + if kernel.VFS2Enabled { + for _, fd := range fds { + unix.Close(fd) + } } + return nil, 0, nil, nil, err } tg, tid, err := proc.Kernel.CreateProcess(initArgs) if err != nil { - return nil, 0, nil, err + return nil, 0, nil, nil, err } - var ttyFileOps *host.TTYFileOperations - if ttyFile != nil { - // Set the foreground process group on the TTY before starting - // the process. - ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations) - ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup()) + // Set the foreground process group on the TTY before starting the process. + switch { + case ttyFile != nil: + ttyFile.InitForegroundProcessGroup(tg.ProcessGroup()) + case ttyFileVFS2 != nil: + ttyFileVFS2.InitForegroundProcessGroup(tg.ProcessGroup()) } // Start the newly created process. proc.Kernel.StartProcess(tg) - return tg, tid, ttyFileOps, nil + return tg, tid, ttyFile, ttyFileVFS2, nil } // PsArgs is the set of arguments to ps. diff --git a/pkg/sentry/fdimport/BUILD b/pkg/sentry/fdimport/BUILD new file mode 100644 index 000000000..5e41ceb4e --- /dev/null +++ b/pkg/sentry/fdimport/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "fdimport", + srcs = [ + "fdimport.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/host", + "//pkg/sentry/fsimpl/host", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + ], +) diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go new file mode 100644 index 000000000..a4199f9e9 --- /dev/null +++ b/pkg/sentry/fdimport/fdimport.go @@ -0,0 +1,129 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdimport + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fs" + "gvisor.dev/gvisor/pkg/sentry/fs/host" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Import imports a slice of FDs into the given FDTable. If console is true, +// sets up TTY for the first 3 FDs in the slice representing stdin, stdout, +// stderr. Upon success, Import takes ownership of all FDs. +func Import(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if kernel.VFS2Enabled { + ttyFile, err := importVFS2(ctx, fdTable, console, fds) + return nil, ttyFile, err + } + ttyFile, err := importFS(ctx, fdTable, console, fds) + return ttyFile, nil, err +} + +func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds []int) (*host.TTYFileOperations, error) { + var ttyFile *fs.File + for appFD, hostFD := range fds { + var appFile *fs.File + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will + // use it for the other stdio FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three + // stdio FDs must point to the same fs.File in + // order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + // Import the file as a regular host file. + var err error + appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + } + + // Add the file to the FD map. + if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + return nil, err + } + } + + if ttyFile == nil { + return nil, nil + } + return ttyFile.FileOperations.(*host.TTYFileOperations), nil +} + +func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdioFDs []int) (*hostvfs2.TTYFileDescription, error) { + k := kernel.KernelFromContext(ctx) + + var ttyFile *vfs.FileDescription + for appFD, hostFD := range stdioFDs { + var appFile *vfs.FileDescription + + if console && appFD < 3 { + // Import the file as a host TTY file. + if ttyFile == nil { + var err error + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, true /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + + // Remember this in the TTY file, as we will use it for the other stdio + // FDs. + ttyFile = appFile + } else { + // Re-use the existing TTY file, as all three stdio FDs must point to + // the same fs.File in order to share TTY state, specifically the + // foreground process group id. + appFile = ttyFile + } + } else { + var err error + appFile, err = hostvfs2.ImportFD(ctx, k.HostMount(), hostFD, false /* isTTY */) + if err != nil { + return nil, err + } + defer appFile.DecRef() + } + + if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { + return nil, err + } + } + + if ttyFile == nil { + return nil, nil + } + return ttyFile.Impl().(*hostvfs2.TTYFileDescription), nil +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 34fbc69af..2be498afc 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -422,7 +422,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that // we don't allow importing arbitrary file types without proper support. if i.isTTY { - fd := &ttyFD{ + fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, termios: linux.DefaultSlaveTermios, } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 8936afb06..68af6e5af 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -26,15 +26,15 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// ttyFD implements vfs.FileDescriptionImpl for a host file descriptor -// that wraps a TTY FD. -type ttyFD struct { +// TTYFileDescription implements vfs.FileDescriptionImpl for a host file +// descriptor that wraps a TTY FD. +type TTYFileDescription struct { fileDescription // mu protects the fields below. mu sync.Mutex `state:"nosave"` - // session is the session attached to this ttyFD. + // session is the session attached to this TTYFileDescription. session *kernel.Session // fgProcessGroup is the foreground process group that is currently @@ -48,7 +48,7 @@ type ttyFD struct { // InitForegroundProcessGroup sets the foreground process group and session for // the TTY. This should only be called once, after the foreground process group // has been created, but before it has started running. -func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { +func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { t.mu.Lock() defer t.mu.Unlock() if t.fgProcessGroup != nil { @@ -59,14 +59,14 @@ func (t *ttyFD) InitForegroundProcessGroup(pg *kernel.ProcessGroup) { } // ForegroundProcessGroup returns the foreground process for the TTY. -func (t *ttyFD) ForegroundProcessGroup() *kernel.ProcessGroup { +func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { t.mu.Lock() defer t.mu.Unlock() return t.fgProcessGroup } // Release implements fs.FileOperations.Release. -func (t *ttyFD) Release() { +func (t *TTYFileDescription) Release() { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() @@ -78,7 +78,7 @@ func (t *ttyFD) Release() { // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. -func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { +func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -96,7 +96,7 @@ func (t *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, // // Reading from a TTY is only allowed for foreground process groups. Background // process groups will either get EIO or a SIGTTIN. -func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { +func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -111,7 +111,7 @@ func (t *ttyFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadO } // PWrite implements vfs.FileDescriptionImpl. -func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { +func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -126,7 +126,7 @@ func (t *ttyFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64 } // Write implements vfs.FileDescriptionImpl. -func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { +func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { t.mu.Lock() defer t.mu.Unlock() @@ -141,7 +141,7 @@ func (t *ttyFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.Writ } // Ioctl implements vfs.FileDescriptionImpl. -func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { +func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Ignore arg[0]. This is the real FD: fd := t.inode.hostFD ioctl := args[1].Uint64() @@ -321,7 +321,7 @@ func (t *ttyFD) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArgum // is a bit convoluted, but documented inline. // // Preconditions: t.mu must be held. -func (t *ttyFD) checkChange(ctx context.Context, sig linux.Signal) error { +func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error { task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index 82781e6d3..c9e724fef 100644 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go @@ -49,3 +49,27 @@ func RootFromContext(ctx context.Context) VirtualDentry { } return VirtualDentry{} } + +type rootContext struct { + context.Context + root VirtualDentry +} + +// WithRoot returns a copy of ctx with the given root. +func WithRoot(ctx context.Context, root VirtualDentry) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8e0b40841..8297f964b 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -192,6 +192,7 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin mask: mask, userData: event.Data, } + epi.waiter.Callback = epi ep.interest[key] = epi wmask := waiter.EventMaskFromLinux(mask) file.EventRegister(&epi.waiter, wmask) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index abcaf4206..0e71e800b 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -12,7 +12,6 @@ go_library( "controller.go", "debug.go", "events.go", - "fds.go", "fs.go", "limits.go", "loader.go", @@ -43,6 +42,7 @@ go_library( "//pkg/sentry/arch:registers_go_proto", "//pkg/sentry/control", "//pkg/sentry/devices/memdev", + "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", "//pkg/sentry/fs/gofer", @@ -53,6 +53,7 @@ go_library( "//pkg/sentry/fs/tmpfs", "//pkg/sentry/fs/tty", "//pkg/sentry/fs/user", + "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", diff --git a/runsc/boot/fds.go b/runsc/boot/fds.go deleted file mode 100644 index 7e7a31fbd..000000000 --- a/runsc/boot/fds.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package boot - -import ( - "fmt" - - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/host" - vfshost "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" - "gvisor.dev/gvisor/pkg/sentry/kernel" -) - -// createFDTable creates an FD table that contains stdin, stdout, and stderr. -// If console is true, then ioctl calls will be passed through to the host FD. -// Upon success, createFDMap dups then closes stdioFDs. -func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { - if len(stdioFDs) != 3 { - return nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) - } - - if kernel.VFS2Enabled { - return createFDTableVFS2(ctx, console, stdioFDs) - } - - k := kernel.KernelFromContext(ctx) - fdTable := k.NewFDTable() - defer fdTable.DecRef() - - var ttyFile *fs.File - for appFD, hostFD := range stdioFDs { - var appFile *fs.File - - if console && appFD < 3 { - // Import the file as a host TTY file. - if ttyFile == nil { - var err error - appFile, err = host.ImportFile(ctx, hostFD, true /* isTTY */) - if err != nil { - return nil, err - } - defer appFile.DecRef() - - // Remember this in the TTY file, as we will - // use it for the other stdio FDs. - ttyFile = appFile - } else { - // Re-use the existing TTY file, as all three - // stdio FDs must point to the same fs.File in - // order to share TTY state, specifically the - // foreground process group id. - appFile = ttyFile - } - } else { - // Import the file as a regular host file. - var err error - appFile, err = host.ImportFile(ctx, hostFD, false /* isTTY */) - if err != nil { - return nil, err - } - defer appFile.DecRef() - } - - // Add the file to the FD map. - if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - return nil, err - } - } - - fdTable.IncRef() - return fdTable, nil -} - -func createFDTableVFS2(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, error) { - k := kernel.KernelFromContext(ctx) - fdTable := k.NewFDTable() - defer fdTable.DecRef() - - for appFD, hostFD := range stdioFDs { - // TODO(gvisor.dev/issue/1482): Add TTY support. - appFile, err := vfshost.ImportFD(ctx, k.HostMount(), hostFD, false) - if err != nil { - return nil, err - } - - if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { - appFile.DecRef() - return nil, err - } - appFile.DecRef() - } - - fdTable.IncRef() - return fdTable, nil -} diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index f6ea4c102..79ef3a880 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -27,16 +27,18 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/memutil" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/control" + "gvisor.dev/gvisor/pkg/sentry/fdimport" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/host" "gvisor.dev/gvisor/pkg/sentry/fs/user" - vfs2host "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -143,6 +145,9 @@ type execProcess struct { // tty will be nil if the process is not attached to a terminal. tty *host.TTYFileOperations + // tty will be nil if the process is not attached to a terminal. + ttyVFS2 *hostvfs2.TTYFileDescription + // pidnsPath is the pid namespace path in spec pidnsPath string } @@ -333,7 +338,7 @@ func New(args Args) (*Loader, error) { if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. - hostFilesystem := vfs2host.NewFilesystem(k.VFS()) + hostFilesystem := hostvfs2.NewFilesystem(k.VFS()) defer hostFilesystem.DecRef() hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -528,6 +533,8 @@ func (l *Loader) run() error { // If we are restoring, we do not want to create a process. // l.restore is set by the container manager when a restore call is made. + var ttyFile *host.TTYFileOperations + var ttyFileVFS2 *hostvfs2.TTYFileDescription if !l.restore { if l.conf.ProfileEnable { pprof.Initialize() @@ -542,13 +549,14 @@ func (l *Loader) run() error { // Create the FD map, which will set stdin, stdout, and stderr. If console // is true, then ioctl calls will be passed through to the host fd. ctx := l.rootProcArgs.NewContext(l.k) - fdTable, err := createFDTable(ctx, l.console, l.stdioFDs) + var err error + + // CreateProcess takes a reference on FDMap if successful. We won't need + // ours either way. + l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } - // CreateProcess takes a reference on FDMap if successful. We won't need - // ours either way. - l.rootProcArgs.FDTable = fdTable // Setup the root container file system. l.startGoferMonitor(l.sandboxID, l.goferFDs) @@ -591,14 +599,16 @@ func (l *Loader) run() error { ep.pidnsPath = ns.Path } if l.console { - ttyFile, _ := l.rootProcArgs.FDTable.Get(0) - defer ttyFile.DecRef() - ep.tty = ttyFile.FileOperations.(*host.TTYFileOperations) - - // Set the foreground process group on the TTY to the global - // init process group, since that is what we are about to - // start running. - ep.tty.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + // Set the foreground process group on the TTY to the global init process + // group, since that is what we are about to start running. + switch { + case ttyFileVFS2 != nil: + ep.ttyVFS2 = ttyFileVFS2 + ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + case ttyFile != nil: + ep.tty = ttyFile + ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup()) + } } // Handle signals by forwarding them to the root container process @@ -719,7 +729,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file // Create the FD map, which will set stdin, stdout, and stderr. ctx := procArgs.NewContext(l.k) - fdTable, err := createFDTable(ctx, false, stdioFDs) + fdTable, _, _, err := createFDTable(ctx, false, stdioFDs) if err != nil { return fmt.Errorf("importing fds: %v", err) } @@ -804,14 +814,14 @@ func (l *Loader) destroyContainer(cid string) error { l.mu.Lock() defer l.mu.Unlock() - _, _, started, err := l.threadGroupFromIDLocked(execID{cid: cid}) + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid}) if err != nil { // Container doesn't exist. return err } - // The container exists, has it been started? - if started { + // The container exists, but has it been started? + if tg != nil { if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil { return fmt.Errorf("sending SIGKILL to all container processes: %v", err) } @@ -853,48 +863,65 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { l.mu.Lock() defer l.mu.Unlock() - tg, _, started, err := l.threadGroupFromIDLocked(execID{cid: args.ContainerID}) + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID}) if err != nil { return 0, err } - if !started { + if tg == nil { return 0, fmt.Errorf("container %q not started", args.ContainerID) } - // TODO(gvisor.dev/issue/1623): Add VFS2 support - // Get the container MountNamespace from the Task. - tg.Leader().WithMuLocked(func(t *kernel.Task) { + if kernel.VFS2Enabled { // task.MountNamespace() does not take a ref, so we must do so ourselves. - args.MountNamespace = t.MountNamespace() - args.MountNamespace.IncRef() - }) - if args.MountNamespace != nil { - defer args.MountNamespace.DecRef() + args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2() + args.MountNamespaceVFS2.IncRef() + } else { + tg.Leader().WithMuLocked(func(t *kernel.Task) { + // task.MountNamespace() does not take a ref, so we must do so ourselves. + args.MountNamespace = t.MountNamespace() + args.MountNamespace.IncRef() + }) } // Add the HOME environment variable if it is not already set. - root := args.MountNamespace.Root() - defer root.DecRef() - ctx := fs.WithRoot(l.k.SupervisorContext(), root) - envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) - if err != nil { - return 0, err + if kernel.VFS2Enabled { + defer args.MountNamespaceVFS2.DecRef() + + root := args.MountNamespaceVFS2.Root() + defer root.DecRef() + ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv + } else { + defer args.MountNamespace.DecRef() + + root := args.MountNamespace.Root() + defer root.DecRef() + ctx := fs.WithRoot(l.k.SupervisorContext(), root) + envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) + if err != nil { + return 0, err + } + args.Envv = envv } - args.Envv = envv // Start the process. proc := control.Proc{Kernel: l.k} args.PIDNamespace = tg.PIDNamespace() - newTG, tgid, ttyFile, err := control.ExecAsync(&proc, args) + newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args) if err != nil { return 0, err } eid := execID{cid: args.ContainerID, pid: tgid} l.processes[eid] = &execProcess{ - tg: newTG, - tty: ttyFile, + tg: newTG, + tty: ttyFile, + ttyVFS2: ttyFileVFS2, } log.Debugf("updated processes: %v", l.processes) @@ -905,7 +932,7 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { func (l *Loader) waitContainer(cid string, waitStatus *uint32) error { // Don't defer unlock, as doing so would make it impossible for // multiple clients to wait on the same container. - tg, _, err := l.threadGroupFromID(execID{cid: cid}) + tg, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("can't wait for container %q: %v", cid, err) } @@ -924,7 +951,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e // Try to find a process that was exec'd eid := execID{cid: cid, pid: tgid} - execTG, _, err := l.threadGroupFromID(eid) + execTG, err := l.threadGroupFromID(eid) if err == nil { ws := l.wait(execTG) *waitStatus = ws @@ -938,7 +965,7 @@ func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) e // The caller may be waiting on a process not started directly via exec. // In this case, find the process in the container's PID namespace. - initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + initTG, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("waiting for PID %d: %v", tgid, err) } @@ -1089,8 +1116,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid) } // Check that the container has actually started before signaling it. - _, _, err := l.threadGroupFromID(execID{cid: cid}) - if err != nil { + if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil { return err } if err := l.signalAllProcesses(cid, signo); err != nil { @@ -1104,7 +1130,7 @@ func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) e } func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error { - execTG, _, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) if err == nil { // Send signal directly to the identified process. return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo}) @@ -1113,7 +1139,7 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er // The caller may be signaling a process not started directly via exec. // In this case, find the process in the container's PID namespace and // signal it. - initTG, _, err := l.threadGroupFromID(execID{cid: cid}) + initTG, err := l.threadGroupFromID(execID{cid: cid}) if err != nil { return fmt.Errorf("no thread group found: %v", err) } @@ -1127,17 +1153,35 @@ func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) er return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}) } +// signalForegrondProcessGroup looks up foreground process group from the TTY +// for the given "tgid" inside container "cid", and send the signal to it. func (l *Loader) signalForegrondProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error { - // Lookup foreground process group from the TTY for the given process, - // and send the signal to it. - tg, tty, err := l.threadGroupFromID(execID{cid: cid, pid: tgid}) + l.mu.Lock() + tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid}) if err != nil { + l.mu.Unlock() return fmt.Errorf("no thread group found: %v", err) } - if tty == nil { + if tg == nil { + l.mu.Unlock() + return fmt.Errorf("container %q not started", cid) + } + + tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid}) + l.mu.Unlock() + if err != nil { + return fmt.Errorf("no thread group found: %v", err) + } + + var pg *kernel.ProcessGroup + switch { + case ttyVFS2 != nil: + pg = ttyVFS2.ForegroundProcessGroup() + case tty != nil: + pg = tty.ForegroundProcessGroup() + default: return fmt.Errorf("no TTY attached") } - pg := tty.ForegroundProcessGroup() if pg == nil { // No foreground process group has been set. Signal the // original thread group. @@ -1168,33 +1212,57 @@ func (l *Loader) signalAllProcesses(cid string, signo int32) error { return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo}) } -// threadGroupFromID same as threadGroupFromIDLocked except that it acquires -// mutex before calling it. -func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, error) { +// threadGroupFromID is similar to tryThreadGroupFromIDLocked except that it +// acquires mutex before calling it and fails in case container hasn't started +// yet. +func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) { l.mu.Lock() defer l.mu.Unlock() - tg, tty, ok, err := l.threadGroupFromIDLocked(key) + tg, err := l.tryThreadGroupFromIDLocked(key) if err != nil { - return nil, nil, err + return nil, err } - if !ok { - return nil, nil, fmt.Errorf("container %q not started", key.cid) + if tg == nil { + return nil, fmt.Errorf("container %q not started", key.cid) } - return tg, tty, nil + return tg, nil } -// threadGroupFromIDLocked returns the thread group and TTY for the given -// execution ID. TTY may be nil if the process is not attached to a terminal. -// Also returns a boolean indicating whether the container has already started. -// Returns error if execution ID is invalid or if the container cannot be -// found (maybe it has been deleted). Caller must hold 'mu'. -func (l *Loader) threadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, *host.TTYFileOperations, bool, error) { +// tryThreadGroupFromIDLocked returns the thread group for the given execution +// ID. It may return nil in case the container has not started yet. Returns +// error if execution ID is invalid or if the container cannot be found (maybe +// it has been deleted). Caller must hold 'mu'. +func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) { ep := l.processes[key] if ep == nil { - return nil, nil, false, fmt.Errorf("container %q not found", key.cid) + return nil, fmt.Errorf("container %q not found", key.cid) } - if ep.tg == nil { - return nil, nil, false, nil + return ep.tg, nil +} + +// ttyFromIDLocked returns the TTY files for the given execution ID. It may +// return nil in case the container has not started yet. Returns error if +// execution ID is invalid or if the container cannot be found (maybe it has +// been deleted). Caller must hold 'mu'. +func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + ep := l.processes[key] + if ep == nil { + return nil, nil, fmt.Errorf("container %q not found", key.cid) + } + return ep.tty, ep.ttyVFS2, nil +} + +func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { + if len(stdioFDs) != 3 { + return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs)) + } + + k := kernel.KernelFromContext(ctx) + fdTable := k.NewFDTable() + ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) + if err != nil { + fdTable.DecRef() + return nil, nil, nil, err } - return ep.tg, ep.tty, true, nil + return fdTable, ttyFile, ttyFileVFS2, nil } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 0b9b0b436..448fc4459 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" "gvisor.dev/gvisor/pkg/sentry/fs" + devpts2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" @@ -41,37 +42,28 @@ import ( ) func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { - - vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserList: true, - }) - - vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(devpts2.Name, &devpts2.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, + // TODO(b/29356795): Users may mount this once the terminals are in a + // usable state. + AllowUserMount: false, }) - - vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(devtmpfsimpl.Name, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - - vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - AllowUserList: true, - }) - vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - AllowUserList: true, + vfsObj.MustRegisterFilesystemType(goferimpl.Name, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(procimpl.Name, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(sysimpl.Name, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) - vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + vfsObj.MustRegisterFilesystemType(tmpfsimpl.Name, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, }) @@ -108,7 +100,6 @@ func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounte } func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { - exe := procArgs.Argv[0] // Absolute paths can be used directly. @@ -120,11 +111,9 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr // Paths with '/' in them should be joined to the working directory, or // to the root if working directory is not set. if strings.IndexByte(exe, '/') > 0 { - if !path.IsAbs(procArgs.WorkingDirectory) { return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) } - procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) return nil } @@ -144,21 +133,17 @@ func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessAr creds := procArgs.Credentials for _, p := range paths { - binPath := path.Join(p, exe) - pop := &vfs.PathOperation{ Root: root, Start: root, Path: fspath.Parse(binPath), FollowFinalSymlink: true, } - opts := &vfs.OpenOptions{ FileExec: true, Flags: linux.O_RDONLY, } - dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) if err == syserror.ENOENT || err == syserror.EACCES { // Didn't find it here. @@ -209,7 +194,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs } func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { - fd := c.fds.remove() opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") @@ -222,7 +206,6 @@ func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *C } func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { - c.prepareMountsVFS2() for _, submount := range c.mounts { @@ -256,7 +239,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } - if fsName == "" { // Filesystem is not supported (e.g. cgroup), just skip it. return nil @@ -277,7 +259,7 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay - if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index f607fe8af..1bea5db02 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -2189,63 +2189,70 @@ func TestTTYField(t *testing.T) { } for _, test := range testCases { - t.Run(test.name, func(t *testing.T) { - conf := testutil.TestConfig(t) - - // We will run /bin/sleep, possibly with an open TTY. - cmd := []string{"/bin/sleep", "10000"} - if test.useTTY { - // Run inside the "pty-runner". - cmd = append([]string{testApp, "pty-runner"}, cmd...) - } - - spec := testutil.NewSpecWithArgs(cmd...) - _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) - if err != nil { - t.Fatalf("error setting up container: %v", err) - } - defer cleanup() + for _, vfs2 := range []bool{false, true} { + name := test.name + if vfs2 { + name += "-vfs2" + } + t.Run(name, func(t *testing.T) { + conf := testutil.TestConfig(t) + conf.VFS2 = vfs2 + + // We will run /bin/sleep, possibly with an open TTY. + cmd := []string{"/bin/sleep", "10000"} + if test.useTTY { + // Run inside the "pty-runner". + cmd = append([]string{testApp, "pty-runner"}, cmd...) + } - // Create and start the container. - args := Args{ - ID: testutil.RandomContainerID(), - Spec: spec, - BundleDir: bundleDir, - } - c, err := New(conf, args) - if err != nil { - t.Fatalf("error creating container: %v", err) - } - defer c.Destroy() - if err := c.Start(conf); err != nil { - t.Fatalf("error starting container: %v", err) - } + spec := testutil.NewSpecWithArgs(cmd...) + _, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf) + if err != nil { + t.Fatalf("error setting up container: %v", err) + } + defer cleanup() - // Wait for sleep to be running, and check the TTY - // field. - var gotTTYField string - cb := func() error { - ps, err := c.Processes() + // Create and start the container. + args := Args{ + ID: testutil.RandomContainerID(), + Spec: spec, + BundleDir: bundleDir, + } + c, err := New(conf, args) if err != nil { - err = fmt.Errorf("error getting process data from container: %v", err) - return &backoff.PermanentError{Err: err} + t.Fatalf("error creating container: %v", err) + } + defer c.Destroy() + if err := c.Start(conf); err != nil { + t.Fatalf("error starting container: %v", err) } - for _, p := range ps { - if strings.Contains(p.Cmd, "sleep") { - gotTTYField = p.TTY - return nil + + // Wait for sleep to be running, and check the TTY + // field. + var gotTTYField string + cb := func() error { + ps, err := c.Processes() + if err != nil { + err = fmt.Errorf("error getting process data from container: %v", err) + return &backoff.PermanentError{Err: err} } + for _, p := range ps { + if strings.Contains(p.Cmd, "sleep") { + gotTTYField = p.TTY + return nil + } + } + return fmt.Errorf("sleep not running") + } + if err := testutil.Poll(cb, 30*time.Second); err != nil { + t.Fatalf("error waiting for sleep process: %v", err) } - return fmt.Errorf("sleep not running") - } - if err := testutil.Poll(cb, 30*time.Second); err != nil { - t.Fatalf("error waiting for sleep process: %v", err) - } - if gotTTYField != test.wantTTYField { - t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) - } - }) + if gotTTYField != test.wantTTYField { + t.Errorf("tty field got %q, want %q", gotTTYField, test.wantTTYField) + } + }) + } } } -- cgit v1.2.3 From 57dbd7f3624528a416664a618ce9edd4c9096d8d Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 4 May 2020 13:39:15 -0700 Subject: Remove kernfs.Filesystem cast from GenericDirectoryFD This allows for kerfs.Filesystem to be overridden by different implementations. Updates #1672 PiperOrigin-RevId: 309809321 --- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index dd5806301..8284e76a7 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -36,13 +37,20 @@ import ( // inode. // // Must be initialize with Init before first use. +// +// Lock ordering: mu => children.mu. type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl vfsfd vfs.FileDescription children *OrderedChildren - off int64 + + // mu protects the fields below. + mu sync.Mutex + + // off is the current directory offset. Protected by "mu". + off int64 } // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its @@ -115,17 +123,13 @@ func (fd *GenericDirectoryFD) inode() Inode { // IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { - vfsFS := fd.filesystem() - fs := vfsFS.Impl().(*Filesystem) - vfsd := fd.vfsfd.VirtualDentry().Dentry() - - fs.mu.Lock() - defer fs.mu.Unlock() + fd.mu.Lock() + defer fd.mu.Unlock() opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat, err := fd.inode().Stat(vfsFS, opts) + stat, err := fd.inode().Stat(fd.filesystem(), opts) if err != nil { return err } @@ -143,8 +147,9 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Handle "..". if fd.off == 1 { + vfsd := fd.vfsfd.VirtualDentry().Dentry() parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode - stat, err := parentInode.Stat(vfsFS, opts) + stat, err := parentInode.Stat(fd.filesystem(), opts) if err != nil { return err } @@ -168,7 +173,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(vfsFS, opts) + stat, err := inode.Stat(fd.filesystem(), opts) if err != nil { return err } @@ -192,9 +197,8 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent // Seek implements vfs.FileDecriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - fs := fd.filesystem().Impl().(*Filesystem) - fs.mu.Lock() - defer fs.mu.Unlock() + fd.mu.Lock() + defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: -- cgit v1.2.3 From 35951c3671f3d429399eb581ad9da3b56e2a5f5a Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 4 May 2020 18:00:56 -0700 Subject: Translate p9.NoUID/GID to OverflowUID/GID. p9.NoUID/GID (== uint32(-1) == auth.NoID) is not a valid auth.KUID/KGID; in particular, using it for file ownership causes capabilities to be ineffective since file capabilities require that the file's KUID and KGID are mapped into the capability holder's user namespace [1], and auth.NoID is not mapped into any user namespace. Map p9.NoUID/GID to a different, valid KUID/KGID; in the unlikely case that an application actually using the overflow KUID/KGID attempts an operation that is consequently permitted by client permission checks, the remote operation will still fail with EPERM. Since this changes the VFS2 gofer client to no longer ignore the invalid IDs entirely, this CL both permits and requires that we change synthetic mount point creation to use root credentials. [1] See fs.Inode.CheckCapability or vfs.GenericCheckPermissions. PiperOrigin-RevId: 309856455 --- pkg/sentry/fs/gofer/attr.go | 12 ++++++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 26 ++++++++++++++++++++------ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go index 6db4b762d..d481baf77 100644 --- a/pkg/sentry/fs/gofer/attr.go +++ b/pkg/sentry/fs/gofer/attr.go @@ -75,10 +75,18 @@ func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner // task's EUID/EGID. owner := mounter if valid.UID { - owner.UID = auth.KUID(pattr.UID) + if pattr.UID.Ok() { + owner.UID = auth.KUID(pattr.UID) + } else { + owner.UID = auth.KUID(auth.OverflowUID) + } } if valid.GID { - owner.GID = auth.KGID(pattr.GID) + if pattr.GID.Ok() { + owner.GID = auth.KGID(pattr.GID) + } else { + owner.GID = auth.KGID(auth.OverflowGID) + } } return owner } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 2881c7bdd..1d9caf127 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -663,11 +663,11 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma }, } d.pf.dentry = d - if mask.UID && attr.UID != auth.NoID { - d.uid = uint32(attr.UID) + if mask.UID { + d.uid = dentryUIDFromP9UID(attr.UID) } - if mask.GID && attr.GID != auth.NoID { - d.gid = uint32(attr.GID) + if mask.GID { + d.gid = dentryGIDFromP9GID(attr.GID) } if mask.Size { d.size = attr.Size @@ -718,10 +718,10 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.mode, uint32(attr.Mode)) } if mask.UID { - atomic.StoreUint32(&d.uid, uint32(attr.UID)) + atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID)) } if mask.GID { - atomic.StoreUint32(&d.gid, uint32(attr.GID)) + atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID)) } // There is no P9_GETATTR_* bit for I/O block size. if attr.BlockSize != 0 { @@ -939,6 +939,20 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func dentryUIDFromP9UID(uid p9.UID) uint32 { + if !uid.Ok() { + return uint32(auth.OverflowUID) + } + return uint32(uid) +} + +func dentryGIDFromP9GID(gid p9.GID) uint32 { + if !gid.Ok() { + return uint32(auth.OverflowGID) + } + return uint32(gid) +} + // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against -- cgit v1.2.3 From b3bd41434c17a95a87d67490f2b9bfd71e1ad705 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 5 May 2020 09:18:21 -0700 Subject: Return correct name for imported host files Implement PrependPath() in host.filesystem to correctly format name for host files. Updates #1672 PiperOrigin-RevId: 309959135 --- pkg/sentry/fsimpl/devpts/devpts.go | 2 +- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/host.go | 68 ++++++++++++++++++--------------- pkg/sentry/fsimpl/kernfs/kernfs.go | 12 +++--- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 2 +- pkg/sentry/fsimpl/pipefs/pipefs.go | 28 ++++++-------- pkg/sentry/fsimpl/sockfs/sockfs.go | 9 +---- pkg/sentry/fsimpl/sys/sys.go | 2 +- 8 files changed, 60 insertions(+), 64 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 181d765d3..94db8fe5c 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -59,7 +59,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // master inode. It returns the filesystem and root Dentry. func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { fs := &kernfs.Filesystem{} - fs.Init(vfsObj, fstype) + fs.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index e1c56d89b..39509f703 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fdnotifier", + "//pkg/fspath", "//pkg/log", "//pkg/refs", "//pkg/sentry/arch", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2be498afc..144e04905 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" @@ -39,37 +40,9 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// filesystemType implements vfs.FilesystemType. -type filesystemType struct{} - -// GetFilesystem implements FilesystemType.GetFilesystem. -func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - panic("host.filesystemType.GetFilesystem should never be called") -} - -// Name implements FilesystemType.Name. -func (filesystemType) Name() string { - return "none" -} - -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem -} - -// NewFilesystem sets up and returns a new hostfs filesystem. -// -// Note that there should only ever be one instance of host.filesystem, -// a global mount for host fds. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) - return fs.VFSFilesystem() -} - // ImportFD sets up and returns a vfs.FileDescription from a donated fd. func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { - fs, ok := mnt.Filesystem().Impl().(*kernfs.Filesystem) + fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) } @@ -119,12 +92,47 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs d := &kernfs.Dentry{} d.Init(i) + // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt) } +// filesystemType implements vfs.FilesystemType. +type filesystemType struct{} + +// GetFilesystem implements FilesystemType.GetFilesystem. +func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + panic("host.filesystemType.GetFilesystem should never be called") +} + +// Name implements FilesystemType.Name. +func (filesystemType) Name() string { + return "none" +} + +// NewFilesystem sets up and returns a new hostfs filesystem. +// +// Note that there should only ever be one instance of host.filesystem, +// a global mount for host fds. +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { + fs := &filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.VFSFilesystem() +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem +} + +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + d := vd.Dentry().Impl().(*kernfs.Dentry) + inode := d.Inode().(*inode) + b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} +} + // inode implements kernfs.Inode. type inode struct { kernfs.InodeNotDirectory diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 732837933..a83151ad3 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -132,13 +132,6 @@ func (fs *Filesystem) processDeferredDecRefsLocked() { fs.droppedDentriesMu.Unlock() } -// Init initializes a kernfs filesystem. This should be called from during -// vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding -// kernfs. -func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) { - fs.vfsfs.Init(vfsObj, fsType, fs) -} - // VFSFilesystem returns the generic vfs filesystem object. func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { return &fs.vfsfs @@ -261,6 +254,11 @@ func (d *Dentry) insertChildLocked(name string, child *Dentry) { d.children[name] = child } +// Inode returns the dentry's inode. +func (d *Dentry) Inode() Inode { + return d.inode +} + // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index a9f671bc8..1c5d3e7e7 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -195,7 +195,7 @@ func (fsType) Name() string { func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Init(vfsObj, &fst) + fs.VFSFilesystem().Init(vfsObj, &fst, fs) root := fst.rootFn(creds, fs) return fs.VFSFilesystem(), root.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index d6bd67467..5375e5e75 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -40,25 +40,19 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem - - // TODO(gvisor.dev/issue/1193): - // - // - kernfs does not provide a way to implement statfs, from which we - // should indicate PIPEFS_MAGIC. - // - // - kernfs does not provide a way to override names for - // vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic - // name fmt.Sprintf("pipe:[%d]", inode.ino). -} +// TODO(gvisor.dev/issue/1193): +// +// - kernfs does not provide a way to implement statfs, from which we +// should indicate PIPEFS_MAGIC. +// +// - kernfs does not provide a way to override names for +// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic +// name fmt.Sprintf("pipe:[%d]", inode.ino). -// NewFilesystem sets up and returns a new vfs.Filesystem implemented by -// pipefs. +// NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) + fs := &kernfs.Filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.VFSFilesystem() } diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index dac2389fc..3f085d3ca 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -41,18 +41,13 @@ func (filesystemType) Name() string { return "sockfs" } -// filesystem implements vfs.FilesystemImpl. -type filesystem struct { - kernfs.Filesystem -} - // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} - fs.Init(vfsObj, filesystemType{}) + fs := &kernfs.Filesystem{} + fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) return fs.VFSFilesystem() } diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index f8d25d35e..00f7d6214 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -47,7 +47,7 @@ func (FilesystemType) Name() string { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { fs := &filesystem{} - fs.Filesystem.Init(vfsObj, &fsType) + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() defaultSysDirMode := linux.FileMode(0755) -- cgit v1.2.3 From a6dbf9596de58f1a264c236bf5afb8dfcfe78174 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 5 May 2020 10:00:02 -0700 Subject: Update comments for synthetic gofer files in vfs2. PiperOrigin-RevId: 309966538 --- pkg/sentry/fsimpl/gofer/filesystem.go | 2 ++ pkg/sentry/fsimpl/gofer/gofer.go | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 4a8411371..4a32821bd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -686,6 +686,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error { creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // If the gofer does not allow creating a socket or pipe, create a + // synthetic one, i.e. one that is kept entirely in memory. if err == syserror.EPERM { switch opts.Mode.FileType() { case linux.S_IFSOCK: diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 1d9caf127..9ab8fdc65 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -496,10 +496,8 @@ type dentry struct { // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file - // that does not exist on the remote filesystem. As of this writing, this - // is only possible for a directory created with - // MkdirOptions.ForSyntheticMountpoint == true. - // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes). + // that does not exist on the remote filesystem. As of this writing, the + // only files that can be synthetic are sockets, pipes, and directories. file p9file // If deleted is non-zero, the file represented by this dentry has been -- cgit v1.2.3 From faf89dd31a44b8409b32919d7193834e194ecc56 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 5 May 2020 12:09:39 -0700 Subject: Update vfs2 socket TODOs. Three updates: - Mark all vfs2 socket syscalls as supported. - Use the same dev number and ino number generator for all types of sockets, unlike in VFS1. - Do not use host fd for hostinet metadata. Fixes #1476, #1478, #1484, 1485, #2017. PiperOrigin-RevId: 309994579 --- pkg/sentry/fsimpl/sockfs/sockfs.go | 9 +---- pkg/sentry/socket/hostinet/socket_vfs2.go | 7 ++-- pkg/sentry/socket/netstack/netstack_vfs2.go | 3 ++ pkg/sentry/socket/unix/unix_vfs2.go | 3 ++ .../syscalls/linux/vfs2/linux64_override_amd64.go | 40 ++++++++++------------ 5 files changed, 30 insertions(+), 32 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 3f085d3ca..239a9f4b4 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -53,9 +53,7 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { // inode implements kernfs.Inode. // -// TODO(gvisor.dev/issue/1476): Add device numbers to this inode (which are -// not included in InodeAttrs) to store the numbers of the appropriate -// socket device. Override InodeAttrs.Stat() accordingly. +// TODO(gvisor.dev/issue/1193): Device numbers. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -69,11 +67,6 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } // NewDentry constructs and returns a sockfs dentry. -// -// TODO(gvisor.dev/issue/1476): Currently, we are using -// sockfs.filesystem.NextIno() to get inode numbers. We should use -// device-specific numbers, so that we are not using the same generator for -// netstack, unix, etc. func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index b03ca2f26..a8278bffc 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -36,8 +36,11 @@ import ( type socketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl - // TODO(gvisor.dev/issue/1484): VFS1 stores internal metadata for hostinet. - // We should perhaps rely on the host, much like in hostfs. + + // We store metadata for hostinet sockets internally. Technically, we should + // access metadata (e.g. through stat, chmod) on the host for correctness, + // but this is not very useful for inet socket fds, which do not belong to a + // concrete file anyway. vfs.DentryMetadataFileDescriptionImpl socketOpsCommon diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index eec71035d..f7d9b2ff4 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" @@ -41,6 +42,8 @@ type SocketVFS2 struct { socketOpsCommon } +var _ = socket.SocketVFS2(&SocketVFS2{}) + // NewVFS2 creates a new endpoint socket. func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) { if skType == linux.SOCK_STREAM { diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 5edc3cdf4..06d838868 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -43,6 +44,8 @@ type SocketVFS2 struct { socketOpsCommon } +var _ = socket.SocketVFS2(&SocketVFS2{}) + // NewSockfsFile creates a new socket file in the global sockfs mount and // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 074f58e5d..47c5d18e7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -44,22 +44,21 @@ func Override(table map[uintptr]kernel.Syscall) { table[32] = syscalls.Supported("dup", Dup) table[33] = syscalls.Supported("dup2", Dup2) delete(table, 40) // sendfile - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[41] = syscalls.PartiallySupported("socket", Socket, "In process of porting socket syscalls to VFS2.", nil) - table[42] = syscalls.PartiallySupported("connect", Connect, "In process of porting socket syscalls to VFS2.", nil) - table[43] = syscalls.PartiallySupported("accept", Accept, "In process of porting socket syscalls to VFS2.", nil) - table[44] = syscalls.PartiallySupported("sendto", SendTo, "In process of porting socket syscalls to VFS2.", nil) - table[45] = syscalls.PartiallySupported("recvfrom", RecvFrom, "In process of porting socket syscalls to VFS2.", nil) - table[46] = syscalls.PartiallySupported("sendmsg", SendMsg, "In process of porting socket syscalls to VFS2.", nil) - table[47] = syscalls.PartiallySupported("recvmsg", RecvMsg, "In process of porting socket syscalls to VFS2.", nil) - table[48] = syscalls.PartiallySupported("shutdown", Shutdown, "In process of porting socket syscalls to VFS2.", nil) - table[49] = syscalls.PartiallySupported("bind", Bind, "In process of porting socket syscalls to VFS2.", nil) - table[50] = syscalls.PartiallySupported("listen", Listen, "In process of porting socket syscalls to VFS2.", nil) - table[51] = syscalls.PartiallySupported("getsockname", GetSockName, "In process of porting socket syscalls to VFS2.", nil) - table[52] = syscalls.PartiallySupported("getpeername", GetPeerName, "In process of porting socket syscalls to VFS2.", nil) - table[53] = syscalls.PartiallySupported("socketpair", SocketPair, "In process of porting socket syscalls to VFS2.", nil) - table[54] = syscalls.PartiallySupported("setsockopt", SetSockOpt, "In process of porting socket syscalls to VFS2.", nil) - table[55] = syscalls.PartiallySupported("getsockopt", GetSockOpt, "In process of porting socket syscalls to VFS2.", nil) + table[41] = syscalls.Supported("socket", Socket) + table[42] = syscalls.Supported("connect", Connect) + table[43] = syscalls.Supported("accept", Accept) + table[44] = syscalls.Supported("sendto", SendTo) + table[45] = syscalls.Supported("recvfrom", RecvFrom) + table[46] = syscalls.Supported("sendmsg", SendMsg) + table[47] = syscalls.Supported("recvmsg", RecvMsg) + table[48] = syscalls.Supported("shutdown", Shutdown) + table[49] = syscalls.Supported("bind", Bind) + table[50] = syscalls.Supported("listen", Listen) + table[51] = syscalls.Supported("getsockname", GetSockName) + table[52] = syscalls.Supported("getpeername", GetPeerName) + table[53] = syscalls.Supported("socketpair", SocketPair) + table[54] = syscalls.Supported("setsockopt", SetSockOpt) + table[55] = syscalls.Supported("getsockopt", GetSockOpt) table[59] = syscalls.Supported("execve", Execve) table[72] = syscalls.Supported("fcntl", Fcntl) delete(table, 73) // flock @@ -145,8 +144,7 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 285) // fallocate table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil) + table[288] = syscalls.Supported("accept4", Accept4) delete(table, 289) // signalfd4 table[290] = syscalls.Supported("eventfd2", Eventfd2) table[291] = syscalls.Supported("epoll_create1", EpollCreate1) @@ -155,11 +153,9 @@ func Override(table map[uintptr]kernel.Syscall) { delete(table, 294) // inotify_init1 table[295] = syscalls.Supported("preadv", Preadv) table[296] = syscalls.Supported("pwritev", Pwritev) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[299] = syscalls.PartiallySupported("recvmmsg", RecvMMsg, "In process of porting socket syscalls to VFS2.", nil) + table[299] = syscalls.Supported("recvmmsg", RecvMMsg) table[306] = syscalls.Supported("syncfs", Syncfs) - // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. - table[307] = syscalls.PartiallySupported("sendmmsg", SendMMsg, "In process of porting socket syscalls to VFS2.", nil) + table[307] = syscalls.Supported("sendmmsg", SendMMsg) table[316] = syscalls.Supported("renameat2", Renameat2) delete(table, 319) // memfd_create table[322] = syscalls.Supported("execveat", Execveat) -- cgit v1.2.3 From 591ff0e424e3b30eb143bce06618cb8656784b90 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 6 May 2020 10:28:50 -0700 Subject: Add maximum memory limit. PiperOrigin-RevId: 310179277 --- pkg/sentry/fs/proc/meminfo.go | 10 +++++++--- pkg/sentry/fsimpl/proc/tasks_files.go | 10 +++++++--- pkg/sentry/syscalls/linux/sys_sysinfo.go | 7 ++++++- pkg/sentry/usage/memory.go | 24 ++++++++++++++++-------- 4 files changed, 36 insertions(+), 15 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go index 465b47da9..91617267d 100644 --- a/pkg/sentry/fs/proc/meminfo.go +++ b/pkg/sentry/fs/proc/meminfo.go @@ -58,12 +58,16 @@ func (d *meminfoData) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) var buf bytes.Buffer fmt.Fprintf(&buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. - fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(&buf, "MemFree: %8d kB\n", memFree/1024) + fmt.Fprintf(&buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(&buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(&buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index 92007df81..e5f13b69e 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -272,12 +272,16 @@ func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { inactiveFile := file - activeFile fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) - memFree := (totalSize - totalUsage) / 1024 + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change. - fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree) - fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree) + fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree/1024) + fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index a65b560c8..297de052a 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -29,13 +29,18 @@ func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca mf.UpdateUsage() _, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) + memFree := totalSize - totalUsage + if memFree > totalSize { + // Underflow. + memFree = 0 + } // Only a subset of the fields in sysinfo_t make sense to return. si := linux.Sysinfo{ Procs: uint16(len(t.PIDNamespace().Tasks())), Uptime: t.Kernel().MonotonicClock().Now().Seconds(), TotalRAM: totalSize, - FreeRAM: totalSize - totalUsage, + FreeRAM: memFree, Unit: 1, } _, err := t.CopyOut(addr, si) diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index 4320ad17f..ab1d140d2 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -252,18 +252,23 @@ func (m *MemoryLocked) Copy() (MemoryStats, uint64) { return ms, m.totalLocked() } -// MinimumTotalMemoryBytes is the minimum reported total system memory. -// -// This can be configured through options provided to the Sentry at start. -// This number is purely synthetic. This is only set before the application -// starts executing, and must not be modified. -var MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB +// These options control how much total memory the is reported to the application. +// They may only be set before the application starts executing, and must not +// be modified. +var ( + // MinimumTotalMemoryBytes is the minimum reported total system memory. + MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB + + // MaximumTotalMemoryBytes is the maximum reported total system memory. + // The 0 value indicates no maximum. + MaximumTotalMemoryBytes uint64 +) // TotalMemory returns the "total usable memory" available. // // This number doesn't really have a true value so it's based on the following -// inputs and further bounded to be above some minimum guaranteed value (2GB), -// additionally ensuring that total memory reported is always less than used. +// inputs and further bounded to be above the MinumumTotalMemoryBytes and below +// MaximumTotalMemoryBytes. // // memSize should be the platform.Memory size reported by platform.Memory.TotalSize() // used is the total memory reported by MemoryLocked.Total() @@ -279,5 +284,8 @@ func TotalMemory(memSize, used uint64) uint64 { memSize = uint64(1) << (uint(msb) + 1) } } + if MaximumTotalMemoryBytes > 0 && memSize > MaximumTotalMemoryBytes { + memSize = MaximumTotalMemoryBytes + } return memSize } -- cgit v1.2.3 From 26c60d7d5d31fdf3ad380a0f09b4f33afae3d9d3 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 7 May 2020 11:40:01 -0700 Subject: Port signalfd to vfs2. PiperOrigin-RevId: 310404113 --- pkg/sentry/fsimpl/signalfd/BUILD | 20 +++++ pkg/sentry/fsimpl/signalfd/signalfd.go | 135 +++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/sigset.go | 8 +- pkg/sentry/syscalls/linux/sys_epoll.go | 2 +- pkg/sentry/syscalls/linux/sys_poll.go | 4 +- pkg/sentry/syscalls/linux/sys_signal.go | 6 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/signal.go | 100 +++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/BUILD | 1 + 10 files changed, 272 insertions(+), 10 deletions(-) create mode 100644 pkg/sentry/fsimpl/signalfd/BUILD create mode 100644 pkg/sentry/fsimpl/signalfd/signalfd.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/signal.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD new file mode 100644 index 000000000..067c1657f --- /dev/null +++ b/pkg/sentry/fsimpl/signalfd/BUILD @@ -0,0 +1,20 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "signalfd", + srcs = ["signalfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/binary", + "//pkg/context", + "//pkg/sentry/kernel", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go new file mode 100644 index 000000000..d29ef3f83 --- /dev/null +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package signalfd + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// SignalFileDescription implements FileDescriptionImpl for signal fds. +type SignalFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // target is the original signal target task. + // + // The semantics here are a bit broken. Linux will always use current + // for all reads, regardless of where the signalfd originated. We can't + // do exactly that because we need to plumb the context through + // EventRegister in order to support proper blocking behavior. This + // will undoubtedly become very complicated quickly. + target *kernel.Task + + // mu protects mask. + mu sync.Mutex + + // mask is the signal mask. Protected by mu. + mask linux.SignalSet +} + +var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) + +// New creates a new signal fd. +func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[signalfd]") + defer vd.DecRef() + sfd := &SignalFileDescription{ + target: target, + mask: mask, + } + if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &sfd.vfsfd, nil +} + +// Mask returns the signal mask. +func (sfd *SignalFileDescription) Mask() linux.SignalSet { + sfd.mu.Lock() + defer sfd.mu.Unlock() + return sfd.mask +} + +// SetMask sets the signal mask. +func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + sfd.mask = mask +} + +// Read implements FileDescriptionImpl.Read. +func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + // Attempt to dequeue relevant signals. + info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0) + if err != nil { + // There must be no signal available. + return 0, syserror.ErrWouldBlock + } + + // Copy out the signal info using the specified format. + var buf [128]byte + binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{ + Signo: uint32(info.Signo), + Errno: info.Errno, + Code: info.Code, + PID: uint32(info.Pid()), + UID: uint32(info.Uid()), + Status: info.Status(), + Overrun: uint32(info.Overrun()), + Addr: info.Addr(), + }) + n, err := dst.CopyOut(ctx, buf[:]) + return int64(n), err +} + +// Readiness implements waiter.Waitable.Readiness. +func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + sfd.mu.Lock() + defer sfd.mu.Unlock() + if mask&waiter.EventIn != 0 && sfd.target.PendingSignals()&sfd.mask != 0 { + return waiter.EventIn // Pending signals. + } + return 0 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) { + sfd.mu.Lock() + defer sfd.mu.Unlock() + // Register for the signal set; ignore the passed events. + sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { + // Unregister the original entry. + sfd.target.SignalUnregister(entry) +} + +// Release implements FileDescriptionImpl.Release() +func (sfd *SignalFileDescription) Release() {} diff --git a/pkg/sentry/syscalls/linux/sigset.go b/pkg/sentry/syscalls/linux/sigset.go index 2ddb2b146..434559b80 100644 --- a/pkg/sentry/syscalls/linux/sigset.go +++ b/pkg/sentry/syscalls/linux/sigset.go @@ -21,9 +21,13 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// copyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and +// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and // STOP are clear. -func copyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { +// +// TODO(gvisor.dev/issue/1624): This is only exported because +// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2 +// syscalls are moved into this package, then they can be unexported. +func CopyInSigSet(t *kernel.Task, sigSetAddr usermem.Addr, size uint) (linux.SignalSet, error) { if size != linux.SignalSetSize { return 0, syserror.EINVAL } diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go index 51bf205cf..7f460d30b 100644 --- a/pkg/sentry/syscalls/linux/sys_epoll.go +++ b/pkg/sentry/syscalls/linux/sys_epoll.go @@ -131,7 +131,7 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy maskSize := uint(args[5].Uint()) if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, maskSize) + mask, err := CopyInSigSet(t, maskAddr, maskSize) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index 4f8762d7d..f0198141c 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -443,7 +443,7 @@ func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, maskSize) + mask, err := CopyInSigSet(t, maskAddr, maskSize) if err != nil { return 0, nil, err } @@ -525,7 +525,7 @@ func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca } if maskAddr != 0 { - mask, err := copyInSigSet(t, maskAddr, size) + mask, err := CopyInSigSet(t, maskAddr, size) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index 582d37e03..d2b0012ae 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -295,7 +295,7 @@ func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel } oldmask := t.SignalMask() if setaddr != 0 { - mask, err := copyInSigSet(t, setaddr, sigsetsize) + mask, err := CopyInSigSet(t, setaddr, sigsetsize) if err != nil { return 0, nil, err } @@ -366,7 +366,7 @@ func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne timespec := args[2].Pointer() sigsetsize := args[3].SizeT() - mask, err := copyInSigSet(t, sigset, sigsetsize) + mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } @@ -518,7 +518,7 @@ func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne // sharedSignalfd is shared between the two calls. func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { // Copy in the signal mask. - mask, err := copyInSigSet(t, sigset, sigsetsize) + mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 4c7b8f819..14838aa2c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -19,6 +19,7 @@ go_library( "poll.go", "read_write.go", "setstat.go", + "signal.go", "socket.go", "stat.go", "stat_amd64.go", @@ -39,6 +40,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/pipefs", + "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go new file mode 100644 index 000000000..623992f6f --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -0,0 +1,100 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd" + "gvisor.dev/gvisor/pkg/sentry/kernel" + slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// sharedSignalfd is shared between the two calls. +func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { + // Copy in the signal mask. + mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize) + if err != nil { + return 0, nil, err + } + + // Always check for valid flags, even if not creating. + if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { + return 0, nil, syserror.EINVAL + } + + // Is this a change to an existing signalfd? + // + // The spec indicates that this should adjust the mask. + if fd != -1 { + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + // Is this a signalfd? + if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { + sfd.SetMask(mask) + return 0, nil, nil + } + + // Not a signalfd. + return 0, nil, syserror.EINVAL + } + + fileFlags := uint32(linux.O_RDWR) + if flags&linux.SFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + + // Create a new file. + vfsObj := t.Kernel().VFS() + file, err := signalfd.New(vfsObj, t, mask, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + + // Create a new descriptor. + fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.SFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + // Done. + return uintptr(fd), nil, nil +} + +// Signalfd implements the linux syscall signalfd(2). +func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + return sharedSignalfd(t, fd, sigset, sigsetsize, 0) +} + +// Signalfd4 implements the linux syscall signalfd4(2). +func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + sigset := args[1].Pointer() + sigsetsize := args[2].SizeT() + flags := args[3].Int() + return sharedSignalfd(t, fd, sigset, sigsetsize, flags) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index f1b697844..9c04677f1 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -139,14 +139,14 @@ func Override() { s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) s.Table[280] = syscalls.Supported("utimensat", Utimensat) s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) - delete(s.Table, 282) // signalfd + s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) delete(s.Table, 285) // fallocate s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) - delete(s.Table, 289) // signalfd4 + s.Table[289] = syscalls.Supported("signalfd4", Signalfd4) s.Table[290] = syscalls.Supported("eventfd2", Eventfd2) s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) s.Table[292] = syscalls.Supported("dup3", Dup3) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index c62505fe2..86046dd99 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -43,6 +43,7 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", + "//pkg/binary", "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", -- cgit v1.2.3 From d0b1d0233dc8a8ac837d534cd0664eabb9dd0a71 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 7 May 2020 12:42:46 -0700 Subject: Move pkg/sentry/vfs/{eventfd,timerfd} to new packages in pkg/sentry/fsimpl. They don't depend on anything in VFS2, so they should be their own packages. PiperOrigin-RevId: 310416807 --- pkg/sentry/fsimpl/eventfd/BUILD | 33 ++++ pkg/sentry/fsimpl/eventfd/eventfd.go | 284 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/eventfd/eventfd_test.go | 97 ++++++++++ pkg/sentry/fsimpl/timerfd/BUILD | 17 ++ pkg/sentry/fsimpl/timerfd/timerfd.go | 143 +++++++++++++++ pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 11 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4 +- pkg/sentry/syscalls/linux/vfs2/timerfd.go | 19 +- pkg/sentry/vfs/BUILD | 4 - pkg/sentry/vfs/eventfd.go | 282 ----------------------------- pkg/sentry/vfs/eventfd_test.go | 96 ---------- pkg/sentry/vfs/timerfd.go | 141 --------------- 14 files changed, 596 insertions(+), 538 deletions(-) create mode 100644 pkg/sentry/fsimpl/eventfd/BUILD create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd.go create mode 100644 pkg/sentry/fsimpl/eventfd/eventfd_test.go create mode 100644 pkg/sentry/fsimpl/timerfd/BUILD create mode 100644 pkg/sentry/fsimpl/timerfd/timerfd.go delete mode 100644 pkg/sentry/vfs/eventfd.go delete mode 100644 pkg/sentry/vfs/eventfd_test.go delete mode 100644 pkg/sentry/vfs/timerfd.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD new file mode 100644 index 000000000..ea167d38c --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/BUILD @@ -0,0 +1,33 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +licenses(["notice"]) + +go_library( + name = "eventfd", + srcs = ["eventfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fdnotifier", + "//pkg/log", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) + +go_test( + name = "eventfd_test", + size = "small", + srcs = ["eventfd_test.go"], + library = ":eventfd", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/contexttest", + "//pkg/sentry/vfs", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go new file mode 100644 index 000000000..c573d7935 --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -0,0 +1,284 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package eventfd implements event fds. +package eventfd + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. +type EventFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + // queue is used to notify interested parties when the event object + // becomes readable or writable. + queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd indicates whether this eventfd is passed through to the host. + hostfd int +} + +var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) + +// New creates a new event fd. +func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. +func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release() +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. +func (efd *EventFileDescription) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go new file mode 100644 index 000000000..20e3adffc --- /dev/null +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -0,0 +1,97 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package eventfd + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + // Register a callback for a write event. + w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("New() failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, vfs.StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD new file mode 100644 index 000000000..fbb02a271 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/BUILD @@ -0,0 +1,17 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "timerfd", + srcs = ["timerfd.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/context", + "//pkg/sentry/kernel/time", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go new file mode 100644 index 000000000..60c92d626 --- /dev/null +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements timer fds. +package timerfd + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. +type TimerFileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to PRead, or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// New returns a new timer fd. +func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { + vd := vfsObj.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. +func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. +func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (tfd *TimerFileDescription) Destroy() {} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index e47af66d6..8104f50f3 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -172,6 +172,7 @@ go_library( "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index c91b9dce2..271ea5faf 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -48,10 +48,11 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" + oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -1068,11 +1069,11 @@ func (k *Kernel) pauseTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.PauseTimer() } } @@ -1104,11 +1105,11 @@ func (k *Kernel) resumeTimeLocked() { if t.fdTable != nil { t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { - if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } } else { - if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { + if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.ResumeTimer() } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 14838aa2c..c32f942fb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -39,8 +39,10 @@ go_library( "//pkg/gohacks", "//pkg/sentry/arch", "//pkg/sentry/fsbridge", + "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", + "//pkg/sentry/fsimpl/timerfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index bd2194972..aff1a2070 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) @@ -31,12 +32,13 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, syserror.EINVAL } + vfsObj := t.Kernel().VFS() fileFlags := uint32(linux.O_RDWR) if flags&linux.EFD_NONBLOCK != 0 { fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 839a07db1..5ac79bc09 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -17,9 +17,9 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -32,9 +32,12 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } - var fileFlags uint32 + // Timerfds aren't writable per se (their implementation of Write just + // returns EINVAL), but they are "opened for writing", which is necessary + // to actually reach said implementation of Write. + fileFlags := uint32(linux.O_RDWR) if flags&linux.TFD_NONBLOCK != 0 { - fileFlags = linux.O_NONBLOCK + fileFlags |= linux.O_NONBLOCK } var clock ktime.Clock @@ -46,10 +49,8 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel default: return 0, nil, syserror.EINVAL } - // Timerfds aren't writable per se (their implementation of Write just - // returns EINVAL), but they are "opened for writing", which is necessary - // to actually reach said implementation of Write. - file, err := t.Kernel().VFS().NewTimerFD(clock, linux.O_RDWR|fileFlags) + vfsObj := t.Kernel().VFS() + file, err := timerfd.New(vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } @@ -80,7 +81,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } @@ -114,7 +115,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne } defer file.DecRef() - tfd, ok := file.Impl().(*vfs.TimerFileDescription) + tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 86046dd99..94d69c1cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,7 +25,6 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", - "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -37,7 +36,6 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", - "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], @@ -71,7 +69,6 @@ go_test( name = "vfs_test", size = "small", srcs = [ - "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -83,6 +80,5 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/vfs/eventfd.go deleted file mode 100644 index f39dacacf..000000000 --- a/pkg/sentry/vfs/eventfd.go +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "math" - "sync" - "syscall" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fdnotifier" - "gvisor.dev/gvisor/pkg/log" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// EventFileDescription implements FileDescriptionImpl for file-based event -// notification (eventfd). Eventfds are usually internal to the Sentry but in -// certain situations they may be converted into a host-backed eventfd. -type EventFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - // queue is used to notify interested parties when the event object - // becomes readable or writable. - queue waiter.Queue `state:"zerovalue"` - - // mu protects the fields below. - mu sync.Mutex `state:"nosave"` - - // val is the current value of the event counter. - val uint64 - - // semMode specifies whether the event is in "semaphore" mode. - semMode bool - - // hostfd indicates whether this eventfd is passed through to the host. - hostfd int -} - -var _ FileDescriptionImpl = (*EventFileDescription)(nil) - -// NewEventFD creates a new event fd. -func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() - efd := &EventFileDescription{ - val: initVal, - semMode: semMode, - hostfd: -1, - } - if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &efd.vfsfd, nil -} - -// HostFD returns the host eventfd associated with this event. -func (efd *EventFileDescription) HostFD() (int, error) { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - return efd.hostfd, nil - } - - flags := linux.EFD_NONBLOCK - if efd.semMode { - flags |= linux.EFD_SEMAPHORE - } - - fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) - if errno != 0 { - return -1, errno - } - - if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { - if closeErr := syscall.Close(int(fd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) - } - return -1, err - } - - efd.hostfd = int(fd) - return efd.hostfd, nil -} - -// Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.RemoveFD(int32(efd.hostfd)) - if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { - log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) - } - efd.hostfd = -1 - } -} - -// Read implements FileDescriptionImpl.Read. -func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { - if dst.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.read(ctx, dst); err != nil { - return 0, err - } - return 8, nil -} - -// Write implements FileDescriptionImpl.Write. -func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { - if src.NumBytes() < 8 { - return 0, syscall.EINVAL - } - if err := efd.write(ctx, src); err != nil { - return 0, err - } - return 8, nil -} - -// Preconditions: Must be called with efd.mu locked. -func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { - var buf [8]byte - if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err - } - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { - efd.mu.Lock() - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostReadLocked(ctx, dst) - } - - // We can't complete the read if the value is currently zero. - if efd.val == 0 { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - // Update the value based on the mode the event is operating in. - var val uint64 - if efd.semMode { - val = 1 - // Consistent with Linux, this is done even if writing to memory fails. - efd.val-- - } else { - val = efd.val - efd.val = 0 - } - - efd.mu.Unlock() - - // Notify writers. We do this even if we were already writable because - // it is possible that a writer is waiting to write the maximum value - // to the event. - efd.queue.Notify(waiter.EventOut) - - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := dst.CopyOut(ctx, buf[:]) - return err -} - -// Preconditions: Must be called with efd.mu locked. -func (efd *EventFileDescription) hostWriteLocked(val uint64) error { - var buf [8]byte - usermem.ByteOrder.PutUint64(buf[:], val) - _, err := syscall.Write(efd.hostfd, buf[:]) - if err == syscall.EWOULDBLOCK { - return syserror.ErrWouldBlock - } - return err -} - -func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { - var buf [8]byte - if _, err := src.CopyIn(ctx, buf[:]); err != nil { - return err - } - val := usermem.ByteOrder.Uint64(buf[:]) - - return efd.Signal(val) -} - -// Signal is an internal function to signal the event fd. -func (efd *EventFileDescription) Signal(val uint64) error { - if val == math.MaxUint64 { - return syscall.EINVAL - } - - efd.mu.Lock() - - if efd.hostfd >= 0 { - defer efd.mu.Unlock() - return efd.hostWriteLocked(val) - } - - // We only allow writes that won't cause the value to go over the max - // uint64 minus 1. - if val > math.MaxUint64-1-efd.val { - efd.mu.Unlock() - return syserror.ErrWouldBlock - } - - efd.val += val - efd.mu.Unlock() - - // Always trigger a notification. - efd.queue.Notify(waiter.EventIn) - - return nil -} - -// Readiness implements waiter.Waitable.Readiness. -func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - efd.mu.Lock() - defer efd.mu.Unlock() - - if efd.hostfd >= 0 { - return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) - } - - ready := waiter.EventMask(0) - if efd.val > 0 { - ready |= waiter.EventIn - } - - if efd.val < math.MaxUint64-1 { - ready |= waiter.EventOut - } - - return mask & ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { - efd.queue.EventRegister(entry, mask) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { - efd.queue.EventUnregister(entry) - - efd.mu.Lock() - defer efd.mu.Unlock() - if efd.hostfd >= 0 { - fdnotifier.UpdateFD(int32(efd.hostfd)) - } -} diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/vfs/eventfd_test.go deleted file mode 100644 index 2dff2d10b..000000000 --- a/pkg/sentry/vfs/eventfd_test.go +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2018 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -func TestEventFD(t *testing.T) { - initVals := []uint64{ - 0, - // Using a non-zero initial value verifies that writing to an - // eventfd signals when the eventfd's counter was already - // non-zero. - 343, - } - - for _, initVal := range initVals { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - // Register a callback for a write event. - w, ch := waiter.NewChannelEntry(nil) - eventfd.EventRegister(&w, waiter.EventIn) - defer eventfd.EventUnregister(&w) - - data := []byte("00000124") - // Create and submit a write request. - n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) - if err != nil { - t.Fatal(err) - } - if n != 8 { - t.Errorf("eventfd.write wrote %d bytes, not full int64", n) - } - - // Check if the callback fired due to the write event. - select { - case <-ch: - default: - t.Errorf("Didn't get notified of EventIn after write") - } - } -} - -func TestEventFDStat(t *testing.T) { - ctx := contexttest.Context(t) - vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - t.Fatalf("VFS init: %v", err) - } - - // Make a new eventfd that is writable. - eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) - if err != nil { - t.Fatalf("NewEventFD failed: %v", err) - } - defer eventfd.DecRef() - - statx, err := eventfd.Stat(ctx, StatOptions{ - Mask: linux.STATX_BASIC_STATS, - }) - if err != nil { - t.Fatalf("eventfd.Stat failed: %v", err) - } - if statx.Size != 0 { - t.Errorf("eventfd size should be 0") - } -} diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go deleted file mode 100644 index cc536ceaf..000000000 --- a/pkg/sentry/vfs/timerfd.go +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "sync/atomic" - - "gvisor.dev/gvisor/pkg/context" - ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" -) - -// TimerFileDescription implements FileDescriptionImpl for timer fds. It also -// implements ktime.TimerListener. -type TimerFileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl - DentryMetadataFileDescriptionImpl - - events waiter.Queue - timer *ktime.Timer - - // val is the number of timer expirations since the last successful - // call to PRead, or SetTime. val must be accessed using atomic memory - // operations. - val uint64 -} - -var _ FileDescriptionImpl = (*TimerFileDescription)(nil) -var _ ktime.TimerListener = (*TimerFileDescription)(nil) - -// NewTimerFD returns a new timer fd. -func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { - vd := vfs.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() - tfd := &TimerFileDescription{} - tfd.timer = ktime.NewTimer(clock, tfd) - if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ - UseDentryMetadata: true, - DenyPRead: true, - DenyPWrite: true, - }); err != nil { - return nil, err - } - return &tfd.vfsfd, nil -} - -// Read implements FileDescriptionImpl.Read. -func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { - const sizeofUint64 = 8 - if dst.NumBytes() < sizeofUint64 { - return 0, syserror.EINVAL - } - if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { - var buf [sizeofUint64]byte - usermem.ByteOrder.PutUint64(buf[:], val) - if _, err := dst.CopyOut(ctx, buf[:]); err != nil { - // Linux does not undo consuming the number of - // expirations even if writing to userspace fails. - return 0, err - } - return sizeofUint64, nil - } - return 0, syserror.ErrWouldBlock -} - -// Clock returns the timer fd's Clock. -func (tfd *TimerFileDescription) Clock() ktime.Clock { - return tfd.timer.Clock() -} - -// GetTime returns the associated Timer's setting and the time at which it was -// observed. -func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { - return tfd.timer.Get() -} - -// SetTime atomically changes the associated Timer's setting, resets the number -// of expirations to 0, and returns the previous setting and the time at which -// it was observed. -func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { - return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) -} - -// Readiness implements waiter.Waitable.Readiness. -func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { - var ready waiter.EventMask - if atomic.LoadUint64(&tfd.val) != 0 { - ready |= waiter.EventIn - } - return ready -} - -// EventRegister implements waiter.Waitable.EventRegister. -func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - tfd.events.EventRegister(e, mask) -} - -// EventUnregister implements waiter.Waitable.EventUnregister. -func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { - tfd.events.EventUnregister(e) -} - -// PauseTimer pauses the associated Timer. -func (tfd *TimerFileDescription) PauseTimer() { - tfd.timer.Pause() -} - -// ResumeTimer resumes the associated Timer. -func (tfd *TimerFileDescription) ResumeTimer() { - tfd.timer.Resume() -} - -// Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { - tfd.timer.Destroy() -} - -// Notify implements ktime.TimerListener.Notify. -func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { - atomic.AddUint64(&tfd.val, exp) - tfd.events.Notify(waiter.EventIn) - return ktime.Setting{}, false -} - -// Destroy implements ktime.TimerListener.Destroy. -func (tfd *TimerFileDescription) Destroy() {} -- cgit v1.2.3 From 9115f26851b6f00ae01e9c130e3b5b342495c9e5 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 7 May 2020 14:00:36 -0700 Subject: Allocate device numbers for VFS2 filesystems. Updates #1197, #1198, #1672 PiperOrigin-RevId: 310432006 --- pkg/abi/linux/dev.go | 4 + pkg/sentry/fsimpl/devpts/devpts.go | 40 +++++-- pkg/sentry/fsimpl/ext/ext.go | 16 ++- pkg/sentry/fsimpl/ext/filesystem.go | 8 +- pkg/sentry/fsimpl/ext/inode.go | 2 + pkg/sentry/fsimpl/gofer/gofer.go | 19 +++- pkg/sentry/fsimpl/host/host.go | 143 +++++++++++++------------ pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 36 +++++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 6 +- pkg/sentry/fsimpl/kernfs/symlink.go | 8 +- pkg/sentry/fsimpl/pipefs/BUILD | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 79 +++++++++----- pkg/sentry/fsimpl/proc/filesystem.go | 29 +++-- pkg/sentry/fsimpl/proc/subtasks.go | 12 +-- pkg/sentry/fsimpl/proc/task.go | 64 +++++------ pkg/sentry/fsimpl/proc/task_fds.go | 28 ++--- pkg/sentry/fsimpl/proc/task_files.go | 16 +-- pkg/sentry/fsimpl/proc/task_net.go | 38 +++---- pkg/sentry/fsimpl/proc/tasks.go | 47 ++++---- pkg/sentry/fsimpl/proc/tasks_files.go | 8 +- pkg/sentry/fsimpl/proc/tasks_sys.go | 108 +++++++++---------- pkg/sentry/fsimpl/sockfs/BUILD | 1 + pkg/sentry/fsimpl/sockfs/sockfs.go | 46 ++++++-- pkg/sentry/fsimpl/sys/sys.go | 21 +++- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 39 ++++--- pkg/sentry/kernel/kernel.go | 10 +- pkg/sentry/socket/hostinet/BUILD | 1 - pkg/sentry/socket/hostinet/socket_vfs2.go | 4 +- pkg/sentry/socket/netlink/BUILD | 1 - pkg/sentry/socket/netlink/provider_vfs2.go | 4 +- pkg/sentry/socket/netstack/BUILD | 1 - pkg/sentry/socket/netstack/netstack_vfs2.go | 4 +- pkg/sentry/socket/unix/BUILD | 1 - pkg/sentry/socket/unix/unix_vfs2.go | 4 +- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/device.go | 2 +- runsc/boot/loader.go | 5 +- 38 files changed, 517 insertions(+), 345 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/dev.go b/pkg/abi/linux/dev.go index 89f9a793f..fa3ae5f18 100644 --- a/pkg/abi/linux/dev.go +++ b/pkg/abi/linux/dev.go @@ -36,6 +36,10 @@ func DecodeDeviceID(rdev uint32) (uint16, uint32) { // // See Documentations/devices.txt and uapi/linux/major.h. const ( + // UNNAMED_MAJOR is the major device number for "unnamed" devices, whose + // minor numbers are dynamically allocated by the kernel. + UNNAMED_MAJOR = 0 + // MEM_MAJOR is the major device number for "memory" character devices. MEM_MAJOR = 1 diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 94db8fe5c..c03c65445 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -51,21 +51,37 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return nil, nil, syserror.EINVAL } - fs, root := fstype.newFilesystem(vfsObj, creds) - return fs.VFSFilesystem(), root.VFSDentry(), nil + fs, root, err := fstype.newFilesystem(vfsObj, creds) + if err != nil { + return nil, nil, err + } + return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil +} + +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 } // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. -func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*kernfs.Filesystem, *kernfs.Dentry) { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, fstype, fs) +func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ slaves: make(map[uint32]*slaveInode), } - root.InodeAttrs.Init(creds, 1, linux.ModeDirectory|0555) + root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.dentry.Init(root) @@ -74,7 +90,7 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds master := &masterInode{ root: root, } - master.InodeAttrs.Init(creds, 2, linux.ModeCharacterDevice|0666) + master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) master.dentry.Init(master) // Add the master as a child of the root. @@ -83,7 +99,13 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds }) root.IncLinks(links) - return fs, &root.dentry + return fs, &root.dentry, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } // rootInode is the root directory inode for the devpts mounts. @@ -140,7 +162,7 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). - slave.InodeAttrs.Init(creds, uint64(idx+3), linux.ModeCharacterDevice|0600) + slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) slave.dentry.Init(slave) i.slaves[idx] = slave diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index 7176af6d1..dac6effbf 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -105,36 +105,50 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // EACCESS should be returned according to mount(2). Filesystem independent // flags (like readonly) are currently not available in pkg/sentry/vfs. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + dev, err := getDeviceFd(source, opts) if err != nil { return nil, nil, err } - fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)} + fs := filesystem{ + dev: dev, + inodeCache: make(map[uint32]*inode), + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { + fs.vfsfs.DecRef() return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { + fs.vfsfs.DecRef() return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 77b644275..557963e03 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -64,6 +64,10 @@ type filesystem struct { // bgs represents all the block group descriptors for the filesystem. // Immutable after initialization. bgs []disklayout.BlockGroup + + // devMinor is this filesystem's device minor number. Immutable after + // initialization. + devMinor uint32 } // Compiles only if filesystem implements vfs.FilesystemImpl. @@ -366,7 +370,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() {} +func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) +} // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index a98512350..485f86f4b 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -204,6 +204,8 @@ func (in *inode) statTo(stat *linux.Statx) { stat.Atime = in.diskInode.AccessTime().StatxTimestamp() stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp() stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp() + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = in.fs.devMinor // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks // (including metadata blocks) required to represent this file. } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 9ab8fdc65..e68e37ebc 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -81,6 +81,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // uid and gid are the effective KUID and KGID of the filesystem's creator, // and are used as the owner and group for files that don't specify one. // uid and gid are immutable. @@ -399,14 +402,21 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Construct the filesystem object. + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + attachFile.close(ctx) + client.Close() + return nil, nil, err + } fs := &filesystem{ mfp: mfp, opts: fsopts, iopts: iopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, client: client, clock: ktime.RealtimeClockFromContext(ctx), + devMinor: devMinor, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -464,6 +474,8 @@ func (fs *filesystem) Release() { // Close the connection to the server. This implicitly clunks all fids. fs.client.Close() } + + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -796,7 +808,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime)) stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime)) stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime)) - // TODO(gvisor.dev/issue/1198): device number + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = d.fs.devMinor } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 144e04905..55de9c438 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -115,15 +115,28 @@ func (filesystemType) Name() string { // // Note that there should only ever be one instance of host.filesystem, // a global mount for host fds. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &filesystem{} +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() + return fs.VFSFilesystem(), nil } // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 +} + +func (fs *filesystem) Release() { + fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -219,7 +232,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode. -func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL } @@ -227,73 +240,73 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro return linux.Statx{}, syserror.EINVAL } + fs := vfsfs.Impl().(*filesystem) + // Limit our host call only to known flags. mask := opts.Mask & linux.STATX_ALL var s unix.Statx_t err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s) - // Fallback to fstat(2), if statx(2) is not supported on the host. - // - // TODO(b/151263641): Remove fallback. if err == syserror.ENOSYS { - return i.fstat(opts) - } else if err != nil { + // Fallback to fstat(2), if statx(2) is not supported on the host. + // + // TODO(b/151263641): Remove fallback. + return i.fstat(fs) + } + if err != nil { return linux.Statx{}, err } - ls := linux.Statx{Mask: mask} - // Unconditionally fill blksize, attributes, and device numbers, as indicated - // by /include/uapi/linux/stat.h. - // - // RdevMajor/RdevMinor are left as zero, so as not to expose host device - // numbers. - // - // TODO(gvisor.dev/issue/1672): Use kernfs-specific, internally defined - // device numbers. If we use the device number from the host, it may collide - // with another sentry-internal device number. We handle device/inode - // numbers without relying on the host to prevent collisions. - ls.Blksize = s.Blksize - ls.Attributes = s.Attributes - ls.AttributesMask = s.Attributes_mask - - if mask&linux.STATX_TYPE != 0 { + // Unconditionally fill blksize, attributes, and device numbers, as + // indicated by /include/uapi/linux/stat.h. Inode number is always + // available, since we use our own rather than the host's. + ls := linux.Statx{ + Mask: linux.STATX_INO, + Blksize: s.Blksize, + Attributes: s.Attributes, + Ino: i.ino, + AttributesMask: s.Attributes_mask, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + } + + // Copy other fields that were returned by the host. RdevMajor/RdevMinor + // are never copied (and therefore left as zero), so as not to expose host + // device numbers. + ls.Mask |= s.Mask & linux.STATX_ALL + if s.Mask&linux.STATX_TYPE != 0 { ls.Mode |= s.Mode & linux.S_IFMT } - if mask&linux.STATX_MODE != 0 { + if s.Mask&linux.STATX_MODE != 0 { ls.Mode |= s.Mode &^ linux.S_IFMT } - if mask&linux.STATX_NLINK != 0 { + if s.Mask&linux.STATX_NLINK != 0 { ls.Nlink = s.Nlink } - if mask&linux.STATX_UID != 0 { + if s.Mask&linux.STATX_UID != 0 { ls.UID = s.Uid } - if mask&linux.STATX_GID != 0 { + if s.Mask&linux.STATX_GID != 0 { ls.GID = s.Gid } - if mask&linux.STATX_ATIME != 0 { + if s.Mask&linux.STATX_ATIME != 0 { ls.Atime = unixToLinuxStatxTimestamp(s.Atime) } - if mask&linux.STATX_BTIME != 0 { + if s.Mask&linux.STATX_BTIME != 0 { ls.Btime = unixToLinuxStatxTimestamp(s.Btime) } - if mask&linux.STATX_CTIME != 0 { + if s.Mask&linux.STATX_CTIME != 0 { ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime) } - if mask&linux.STATX_MTIME != 0 { + if s.Mask&linux.STATX_MTIME != 0 { ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime) } - if mask&linux.STATX_SIZE != 0 { + if s.Mask&linux.STATX_SIZE != 0 { ls.Size = s.Size } - if mask&linux.STATX_BLOCKS != 0 { + if s.Mask&linux.STATX_BLOCKS != 0 { ls.Blocks = s.Blocks } - // Use our own internal inode number. - if mask&linux.STATX_INO != 0 { - ls.Ino = i.ino - } - return ls, nil } @@ -305,36 +318,30 @@ func (i *inode) Stat(_ *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, erro // of a mask or sync flags. fstat(2) does not provide any metadata // equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so // those fields remain empty. -func (i *inode) fstat(opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { var s unix.Stat_t if err := unix.Fstat(i.hostFD, &s); err != nil { return linux.Statx{}, err } - // Note that rdev numbers are left as 0; do not expose host device numbers. - ls := linux.Statx{ - Mask: linux.STATX_BASIC_STATS, - Blksize: uint32(s.Blksize), - Nlink: uint32(s.Nlink), - UID: s.Uid, - GID: s.Gid, - Mode: uint16(s.Mode), - Size: uint64(s.Size), - Blocks: uint64(s.Blocks), - Atime: timespecToStatxTimestamp(s.Atim), - Ctime: timespecToStatxTimestamp(s.Ctim), - Mtime: timespecToStatxTimestamp(s.Mtim), - } - - // Use our own internal inode number. - // - // TODO(gvisor.dev/issue/1672): Use a kernfs-specific device number as well. - // If we use the device number from the host, it may collide with another - // sentry-internal device number. We handle device/inode numbers without - // relying on the host to prevent collisions. - ls.Ino = i.ino - - return ls, nil + // As with inode.Stat(), we always use internal device and inode numbers, + // and never expose the host's represented device numbers. + return linux.Statx{ + Mask: linux.STATX_BASIC_STATS, + Blksize: uint32(s.Blksize), + Nlink: uint32(s.Nlink), + UID: s.Uid, + GID: s.Gid, + Mode: uint16(s.Mode), + Ino: i.ino, + Size: uint64(s.Size), + Blocks: uint64(s.Blocks), + Atime: timespecToStatxTimestamp(s.Atim), + Ctime: timespecToStatxTimestamp(s.Ctim), + Mtime: timespecToStatxTimestamp(s.Mtim), + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: fs.devMinor, + }, nil } // SetStat implements kernfs.Inode. @@ -453,8 +460,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.F } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. -// -// TODO(gvisor.dev/issue/1672): Implement Waitable interface. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -471,12 +476,12 @@ type fileDescription struct { // SetStat implements vfs.FileDescriptionImpl. func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return f.inode.SetStat(ctx, nil, creds, opts) + return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts) } // Stat implements vfs.FileDescriptionImpl. func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { - return f.inode.Stat(nil, opts) + return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) } // Release implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c7779fc11..1568a9d49 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -44,11 +44,11 @@ type DynamicBytesFile struct { var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. -func (f *DynamicBytesFile) Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { +func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - f.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 615592d5f..982daa2e6 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -192,15 +192,17 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, // // Must be initialized by Init prior to first use. type InodeAttrs struct { - ino uint64 - mode uint32 - uid uint32 - gid uint32 - nlink uint32 + devMajor uint32 + devMinor uint32 + ino uint64 + mode uint32 + uid uint32 + gid uint32 + nlink uint32 } // Init initializes this InodeAttrs. -func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMode) { +func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } @@ -209,6 +211,8 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo if mode.FileType() == linux.ModeDirectory { nlink = 2 } + a.devMajor = devMajor + a.devMinor = devMinor atomic.StoreUint64(&a.ino, ino) atomic.StoreUint32(&a.mode, uint32(mode)) atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) @@ -216,6 +220,16 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, ino uint64, mode linux.FileMo atomic.StoreUint32(&a.nlink, nlink) } +// DevMajor returns the device major number. +func (a *InodeAttrs) DevMajor() uint32 { + return a.devMajor +} + +// DevMinor returns the device minor number. +func (a *InodeAttrs) DevMinor() uint32 { + return a.devMinor +} + // Ino returns the inode id. func (a *InodeAttrs) Ino() uint64 { return atomic.LoadUint64(&a.ino) @@ -232,6 +246,8 @@ func (a *InodeAttrs) Mode() linux.FileMode { func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK + stat.DevMajor = a.devMajor + stat.DevMinor = a.devMinor stat.Ino = atomic.LoadUint64(&a.ino) stat.Mode = uint16(a.Mode()) stat.UID = atomic.LoadUint32(&a.uid) @@ -544,9 +560,9 @@ type StaticDirectory struct { var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its dentry. -func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { +func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry { inode := &StaticDirectory{} - inode.Init(creds, ino, perm) + inode.Init(creds, devMajor, devMinor, ino, perm) dentry := &Dentry{} dentry.Init(inode) @@ -559,11 +575,11 @@ func NewStaticDir(creds *auth.Credentials, ino uint64, perm linux.FileMode, chil } // Init initializes StaticDirectory. -func (s *StaticDirectory) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - s.InodeAttrs.Init(creds, ino, linux.ModeDirectory|perm) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 1c5d3e7e7..412cf6ac9 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -75,7 +75,7 @@ type file struct { func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry { f := &file{} f.content = content - f.DynamicBytesFile.Init(creds, fs.NextIno(), f, 0777) + f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777) d := &kernfs.Dentry{} d.Init(f) @@ -107,7 +107,7 @@ type readonlyDir struct { func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &readonlyDir{} - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) dir.dentry.Init(dir) @@ -137,7 +137,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &dir{} dir.fs = fs - dir.attrs.Init(creds, fs.NextIno(), linux.ModeDirectory|mode) + dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode) dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true}) dir.dentry.Init(dir) diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go index 0aa6dc979..2ab3f53fd 100644 --- a/pkg/sentry/fsimpl/kernfs/symlink.go +++ b/pkg/sentry/fsimpl/kernfs/symlink.go @@ -35,9 +35,9 @@ type StaticSymlink struct { var _ Inode = (*StaticSymlink)(nil) // NewStaticSymlink creates a new symlink file pointing to 'target'. -func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentry { +func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry { inode := &StaticSymlink{} - inode.Init(creds, ino, target) + inode.Init(creds, devMajor, devMinor, ino, target) d := &Dentry{} d.Init(inode) @@ -45,9 +45,9 @@ func NewStaticSymlink(creds *auth.Credentials, ino uint64, target string) *Dentr } // Init initializes the instance. -func (s *StaticSymlink) Init(creds *auth.Credentials, ino uint64, target string) { +func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) { s.target = target - s.InodeAttrs.Init(creds, ino, linux.ModeSymlink|0777) + s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777) } // Readlink implements Inode. diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 0d411606f..5950a2d59 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 5375e5e75..cab771211 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -17,8 +17,11 @@ package pipefs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" @@ -40,20 +43,36 @@ func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFile panic("pipefs.filesystemType.GetFilesystem should never be called") } -// TODO(gvisor.dev/issue/1193): -// -// - kernfs does not provide a way to implement statfs, from which we -// should indicate PIPEFS_MAGIC. -// -// - kernfs does not provide a way to override names for -// vfs.FilesystemImpl.PrependPath(); pipefs inodes should use synthetic -// name fmt.Sprintf("pipe:[%d]", inode.ino). +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} // NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino)) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. @@ -71,11 +90,11 @@ type inode struct { ctime ktime.Time } -func newInode(ctx context.Context, fs *kernfs.Filesystem) *inode { +func newInode(ctx context.Context, fs *filesystem) *inode { creds := auth.CredentialsFromContext(ctx) return &inode{ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize), - ino: fs.NextIno(), + ino: fs.Filesystem.NextIno(), uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, ctime: ktime.NowFromContext(ctx), @@ -98,19 +117,20 @@ func (i *inode) Mode() linux.FileMode { func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ - Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, - Blksize: usermem.PageSize, - Nlink: 1, - UID: uint32(i.uid), - GID: uint32(i.gid), - Mode: pipeMode, - Ino: i.ino, - Size: 0, - Blocks: 0, - Atime: ts, - Ctime: ts, - Mtime: ts, - // TODO(gvisor.dev/issue/1197): Device number. + Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, + Blksize: usermem.PageSize, + Nlink: 1, + UID: uint32(i.uid), + GID: uint32(i.gid), + Mode: pipeMode, + Ino: i.ino, + Size: 0, + Blocks: 0, + Atime: ts, + Ctime: ts, + Mtime: ts, + DevMajor: linux.UNNAMED_MAJOR, + DevMinor: vfsfs.Impl().(*filesystem).devMinor, }, nil } @@ -122,6 +142,9 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. return syserror.EPERM } +// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement +// statfs, from which we should indicate PIPEFS_MAGIC. + // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) @@ -132,7 +155,7 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) + fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 104fc9030..609210253 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -41,6 +41,12 @@ func (FilesystemType) Name() string { return Name } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { k := kernel.KernelFromContext(ctx) @@ -51,8 +57,13 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF if pidns == nil { return nil, nil, fmt.Errorf("procfs requires a PID namespace") } - - procfs := &kernfs.Filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + procfs := &filesystem{ + devMinor: devMinor, + } procfs.VFSFilesystem().Init(vfsObj, &ft, procfs) var cgroups map[string]string @@ -61,21 +72,27 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF cgroups = data.Cgroups } - _, dentry := newTasksInode(procfs, k, pidns, cgroups) + _, dentry := procfs.newTasksInode(k, pidns, cgroups) return procfs.VFSFilesystem(), dentry.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dynamicInode is an overfitted interface for common Inodes with // dynamicByteSource types used in procfs. type dynamicInode interface { kernfs.Inode vfs.DynamicBytesSource - Init(creds *auth.Credentials, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) + Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) } -func newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { - inode.Init(creds, ino, inode, perm) +func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index a5cfa8333..36a911db4 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -37,23 +37,23 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace - inoGen InoGenerator cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) -func newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, inoGen InoGenerator, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry { subInode := &subtasksInode{ + fs: fs, task: task, pidns: pidns, - inoGen: inoGen, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. - subInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) inode := &taskOwnedInode{Inode: subInode, owner: task} @@ -78,7 +78,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e return nil, syserror.ENOENT } - subTaskDentry := newTaskInode(i.inoGen, subTask, i.pidns, false, i.cgroupControllers) + subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers) return subTaskDentry.VFSDentry(), nil } @@ -102,7 +102,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 66419d91b..482055db1 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -43,45 +43,45 @@ type taskInode struct { var _ kernfs.Inode = (*taskInode)(nil) -func newTaskInode(inoGen InoGenerator, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { +func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry { // TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited. contents := map[string]*kernfs.Dentry{ - "auxv": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &auxvData{task: task}), - "cmdline": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), - "comm": newComm(task, inoGen.NextIno(), 0444), - "environ": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), - "exe": newExeSymlink(task, inoGen.NextIno()), - "fd": newFDDirInode(task, inoGen), - "fdinfo": newFDInfoDirInode(task, inoGen), - "gid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: true}), - "io": newTaskOwnedFile(task, inoGen.NextIno(), 0400, newIO(task, isThreadGroup)), - "maps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mapsData{task: task}), - "mountinfo": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountInfoData{task: task}), - "mounts": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &mountsData{task: task}), - "net": newTaskNetDir(task, inoGen), - "ns": newTaskOwnedDir(task, inoGen.NextIno(), 0511, map[string]*kernfs.Dentry{ - "net": newNamespaceSymlink(task, inoGen.NextIno(), "net"), - "pid": newNamespaceSymlink(task, inoGen.NextIno(), "pid"), - "user": newNamespaceSymlink(task, inoGen.NextIno(), "user"), + "auxv": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}), + "cmdline": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), + "comm": fs.newComm(task, fs.NextIno(), 0444), + "environ": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), + "exe": fs.newExeSymlink(task, fs.NextIno()), + "fd": fs.newFDDirInode(task), + "fdinfo": fs.newFDInfoDirInode(task), + "gid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), + "io": fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), + "maps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}), + "mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}), + "mounts": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}), + "net": fs.newTaskNetDir(task), + "ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{ + "net": fs.newNamespaceSymlink(task, fs.NextIno(), "net"), + "pid": fs.newNamespaceSymlink(task, fs.NextIno(), "pid"), + "user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"), }), - "oom_score": newTaskOwnedFile(task, inoGen.NextIno(), 0444, newStaticFile("0\n")), - "oom_score_adj": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &oomScoreAdj{task: task}), - "smaps": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &smapsData{task: task}), - "stat": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), - "statm": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statmData{task: task}), - "status": newTaskOwnedFile(task, inoGen.NextIno(), 0444, &statusData{task: task, pidns: pidns}), - "uid_map": newTaskOwnedFile(task, inoGen.NextIno(), 0644, &idMapData{task: task, gids: false}), + "oom_score": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")), + "oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), + "smaps": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}), + "stat": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), + "statm": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}), + "status": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), + "uid_map": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { - contents["task"] = newSubtasks(task, pidns, inoGen, cgroupControllers) + contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers) } if len(cgroupControllers) > 0 { - contents["cgroup"] = newTaskOwnedFile(task, inoGen.NextIno(), 0444, newCgroupData(cgroupControllers)) + contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers)) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. - taskInode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode := &taskOwnedInode{Inode: taskInode, owner: task} dentry := &kernfs.Dentry{} @@ -126,9 +126,9 @@ type taskOwnedInode struct { var _ kernfs.Inode = (*taskOwnedInode)(nil) -func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry { // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, inode, perm) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -136,11 +136,11 @@ func newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode return d } -func newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { +func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry { dir := &kernfs.StaticDirectory{} // Note: credentials are overridden by taskOwnedInode. - dir.Init(task.Credentials(), ino, perm) + dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm) inode := &taskOwnedInode{Inode: dir, owner: task} d := &kernfs.Dentry{} diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 8ad976073..44ccc9e4a 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -53,8 +53,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { - inoGen InoGenerator - task *kernel.Task + fs *filesystem + task *kernel.Task // When produceSymlinks is set, dirents produces for the FDs are reported // as symlink. Otherwise, they are reported as regular files. @@ -85,7 +85,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(fd), 10), Type: typ, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -110,15 +110,15 @@ type fdDirInode struct { var _ kernfs.Inode = (*fdDirInode)(nil) -func newFDDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdDirInode{ fdDir: fdDir{ - inoGen: inoGen, + fs: fs, task: task, produceSymlink: true, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -137,7 +137,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro if !taskFDExists(i.task, fd) { return nil, syserror.ENOENT } - taskDentry := newFDSymlink(i.task, fd, i.inoGen.NextIno()) + taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) return taskDentry.VFSDentry(), nil } @@ -186,12 +186,12 @@ type fdSymlink struct { var _ kernfs.Inode = (*fdSymlink)(nil) -func newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry { inode := &fdSymlink{ task: task, fd: fd, } - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -234,14 +234,14 @@ type fdInfoDirInode struct { var _ kernfs.Inode = (*fdInfoDirInode)(nil) -func newFDInfoDirInode(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry { inode := &fdInfoDirInode{ fdDir: fdDir{ - inoGen: inoGen, - task: task, + fs: fs, + task: task, }, } - inode.InodeAttrs.Init(task.Credentials(), inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -264,7 +264,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, task: i.task, fd: fd, } - dentry := newTaskOwnedFile(i.task, i.inoGen.NextIno(), 0444, data) + dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data) return dentry.VFSDentry(), nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 515f25327..2f297e48a 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -241,9 +241,9 @@ type commInode struct { task *kernel.Task } -func newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { +func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry { inode := &commInode{task: task} - inode.DynamicBytesFile.Init(task.Credentials(), ino, &commData{task: task}, perm) + inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm) d := &kernfs.Dentry{} d.Init(inode) @@ -596,9 +596,9 @@ type exeSymlink struct { var _ kernfs.Inode = (*exeSymlink)(nil) -func newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { +func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry { inode := &exeSymlink{task: task} - inode.Init(task.Credentials(), ino, linux.ModeSymlink|0777) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -729,7 +729,7 @@ type namespaceSymlink struct { task *kernel.Task } -func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { +func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry { // Namespace symlinks should contain the namespace name and the inode number // for the namespace instance, so for example user:[123456]. We currently fake // the inode number by sticking the symlink inode in its place. @@ -737,7 +737,7 @@ func newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentr inode := &namespaceSymlink{task: task} // Note: credentials are overridden by taskOwnedInode. - inode.Init(task.Credentials(), ino, target) + inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target) taskInode := &taskOwnedInode{Inode: inode, owner: task} d := &kernfs.Dentry{} @@ -780,11 +780,11 @@ type namespaceInode struct { var _ kernfs.Inode = (*namespaceInode)(nil) // Init initializes a namespace inode. -func (i *namespaceInode) Init(creds *auth.Credentials, ino uint64, perm linux.FileMode) { +func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } - i.InodeAttrs.Init(creds, ino, linux.ModeRegular|perm) + i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm) } // Open implements Inode.Open. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 9c329341a..6bde27376 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -37,7 +37,7 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { +func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry { k := task.Kernel() pidns := task.PIDNamespace() root := auth.NewRootCredentials(pidns.UserNamespace()) @@ -57,37 +57,37 @@ func newTaskNetDir(task *kernel.Task, inoGen InoGenerator) *kernfs.Dentry { // TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task // network namespace. contents = map[string]*kernfs.Dentry{ - "dev": newDentry(root, inoGen.NextIno(), 0444, &netDevData{stack: stack}), - "snmp": newDentry(root, inoGen.NextIno(), 0444, &netSnmpData{stack: stack}), + "dev": fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}), + "snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, if the file contains a header the stub is just the header // otherwise it is an empty file. - "arp": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(arp)), - "netlink": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(netlink)), - "netstat": newDentry(root, inoGen.NextIno(), 0444, &netStatData{}), - "packet": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(packet)), - "protocols": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(protocols)), + "arp": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)), + "netlink": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)), + "netstat": fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}), + "packet": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)), + "protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)), // Linux sets psched values to: nsec per usec, psched tick in ns, 1000000, // high res timer ticks per sec (ClockGetres returns 1ns resolution). - "psched": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(psched)), - "ptype": newDentry(root, inoGen.NextIno(), 0444, newStaticFile(ptype)), - "route": newDentry(root, inoGen.NextIno(), 0444, &netRouteData{stack: stack}), - "tcp": newDentry(root, inoGen.NextIno(), 0444, &netTCPData{kernel: k}), - "udp": newDentry(root, inoGen.NextIno(), 0444, &netUDPData{kernel: k}), - "unix": newDentry(root, inoGen.NextIno(), 0444, &netUnixData{kernel: k}), + "psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)), + "ptype": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)), + "route": fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}), + "tcp": fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}), + "udp": fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}), + "unix": fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { - contents["if_inet6"] = newDentry(root, inoGen.NextIno(), 0444, &ifinet6{stack: stack}) - contents["ipv6_route"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")) - contents["tcp6"] = newDentry(root, inoGen.NextIno(), 0444, &netTCP6Data{kernel: k}) - contents["udp6"] = newDentry(root, inoGen.NextIno(), 0444, newStaticFile(upd6)) + contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack}) + contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")) + contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k}) + contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6)) } } - return newTaskOwnedDir(task, inoGen.NextIno(), 0555, contents) + return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 5aeda8c9b..b51d43954 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -33,11 +33,6 @@ const ( threadSelfName = "thread-self" ) -// InoGenerator generates unique inode numbers for a given filesystem. -type InoGenerator interface { - NextIno() uint64 -} - // tasksInode represents the inode for /proc/ directory. // // +stateify savable @@ -48,8 +43,8 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - inoGen InoGenerator - pidns *kernel.PIDNamespace + fs *filesystem + pidns *kernel.PIDNamespace // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux. So handle them outside of OrderedChildren. @@ -64,29 +59,29 @@ type tasksInode struct { var _ kernfs.Inode = (*tasksInode)(nil) -func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { +func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]*kernfs.Dentry{ - "cpuinfo": newDentry(root, inoGen.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), - "filesystems": newDentry(root, inoGen.NextIno(), 0444, &filesystemsData{}), - "loadavg": newDentry(root, inoGen.NextIno(), 0444, &loadavgData{}), - "sys": newSysDir(root, inoGen, k), - "meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}), - "mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"), - "net": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/net"), - "stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}), - "uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}), - "version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}), + "cpuinfo": fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))), + "filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}), + "loadavg": fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}), + "sys": fs.newSysDir(root, k), + "meminfo": fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}), + "mounts": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), + "net": kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), + "stat": fs.newDentry(root, fs.NextIno(), 0444, &statData{}), + "uptime": fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}), + "version": fs.newDentry(root, fs.NextIno(), 0444, &versionData{}), } inode := &tasksInode{ pidns: pidns, - inoGen: inoGen, - selfSymlink: newSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), - threadSelfSymlink: newThreadSelfSymlink(root, inoGen.NextIno(), pidns).VFSDentry(), + fs: fs, + selfSymlink: fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), + threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(), cgroupControllers: cgroupControllers, } - inode.InodeAttrs.Init(root, inoGen.NextIno(), linux.ModeDirectory|0555) + inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) dentry := &kernfs.Dentry{} dentry.Init(inode) @@ -118,7 +113,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } - taskDentry := newTaskInode(i.inoGen, task, i.pidns, true, i.cgroupControllers) + taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers) return taskDentry.VFSDentry(), nil } @@ -144,7 +139,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: selfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -156,7 +151,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: threadSelfName, Type: linux.DT_LNK, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { @@ -189,7 +184,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, - Ino: i.inoGen.NextIno(), + Ino: i.fs.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if err := cb.Handle(dirent); err != nil { diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go index e5f13b69e..7d8983aa5 100644 --- a/pkg/sentry/fsimpl/proc/tasks_files.go +++ b/pkg/sentry/fsimpl/proc/tasks_files.go @@ -41,9 +41,9 @@ type selfSymlink struct { var _ kernfs.Inode = (*selfSymlink)(nil) -func newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &selfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) @@ -83,9 +83,9 @@ type threadSelfSymlink struct { var _ kernfs.Inode = (*threadSelfSymlink)(nil) -func newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { +func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry { inode := &threadSelfSymlink{pidns: pidns} - inode.Init(creds, ino, linux.ModeSymlink|0777) + inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) d := &kernfs.Dentry{} d.Init(inode) diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 0e90e02fe..6dac2afa4 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -30,89 +30,89 @@ import ( ) // newSysDir returns the dentry corresponding to /proc/sys directory. -func newSysDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "kernel": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "hostname": newDentry(root, inoGen.NextIno(), 0444, &hostnameData{}), - "shmall": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMALL)), - "shmmax": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMAX)), - "shmmni": newDentry(root, inoGen.NextIno(), 0444, shmData(linux.SHMMNI)), +func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}), + "shmall": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)), + "shmmax": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)), + "shmmni": fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)), }), - "vm": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "mmap_min_addr": newDentry(root, inoGen.NextIno(), 0444, &mmapMinAddrData{k: k}), - "overcommit_memory": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0\n")), + "vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "mmap_min_addr": fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}), + "overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")), }), - "net": newSysNetDir(root, inoGen, k), + "net": fs.newSysNetDir(root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. -func newSysNetDir(root *auth.Credentials, inoGen InoGenerator, k *kernel.Kernel) *kernfs.Dentry { +func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry { var contents map[string]*kernfs.Dentry // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ - "ipv4": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": newDentry(root, inoGen.NextIno(), 0644, &tcpSackData{stack: stack}), + "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the // value closest to the actual netstack behavior or any empty file, all // of these files will have mode 0444 (read-only for all users). - "ip_local_port_range": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("16000 65535")), - "ip_local_reserved_ports": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "ipfrag_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("30")), - "ip_nonlocal_bind": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "ip_no_pmtu_disc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "ip_local_port_range": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000 65535")), + "ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "ipfrag_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")), + "ip_nonlocal_bind": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "ip_no_pmtu_disc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), // tcp_allowed_congestion_control tell the user what they are able to // do as an unprivledged process so we leave it empty. - "tcp_allowed_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_available_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), - "tcp_congestion_control": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("reno")), + "tcp_allowed_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), + "tcp_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled. - "tcp_base_mss": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1280")), - "tcp_dsack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_early_retrans": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fack": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_fastopen_key": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("")), - "tcp_invalid_ratelimit": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_intvl": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_probes": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_keepalive_time": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("7200")), - "tcp_mtu_probing": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_no_metrics_save": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_probe_interval": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_probe_threshold": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "tcp_retries1": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_retries2": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("15")), - "tcp_rfc1337": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_slow_start_after_idle": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), - "tcp_synack_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "tcp_syn_retries": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("3")), - "tcp_timestamps": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("1")), + "tcp_base_mss": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")), + "tcp_dsack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_early_retrans": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fack": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_fastopen_key": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")), + "tcp_invalid_ratelimit": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_intvl": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_probes": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_keepalive_time": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")), + "tcp_mtu_probing": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_no_metrics_save": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_probe_interval": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_probe_threshold": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "tcp_retries1": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_retries2": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")), + "tcp_rfc1337": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), + "tcp_synack_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "tcp_syn_retries": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")), + "tcp_timestamps": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")), }), - "core": kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, map[string]*kernfs.Dentry{ - "default_qdisc": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("pfifo_fast")), - "message_burst": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("10")), - "message_cost": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("5")), - "optmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("0")), - "rmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "rmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "somaxconn": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("128")), - "wmem_default": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), - "wmem_max": newDentry(root, inoGen.NextIno(), 0444, newStaticFile("212992")), + "core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ + "default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")), + "message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")), + "message_cost": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")), + "optmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")), + "rmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "rmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "somaxconn": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")), + "wmem_default": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), + "wmem_max": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")), }), } } - return kernfs.NewStaticDir(root, inoGen.NextIno(), 0555, contents) + return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD index 52084ddb5..9453277b8 100644 --- a/pkg/sentry/fsimpl/sockfs/BUILD +++ b/pkg/sentry/fsimpl/sockfs/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/fspath", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index 239a9f4b4..ee0828a15 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -16,8 +16,11 @@ package sockfs import ( + "fmt" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -41,19 +44,42 @@ func (filesystemType) Name() string { return "sockfs" } +type filesystem struct { + kernfs.Filesystem + + devMinor uint32 +} + // NewFilesystem sets up and returns a new sockfs filesystem. // // Note that there should only ever be one instance of sockfs.Filesystem, // backing a global socket mount. -func NewFilesystem(vfsObj *vfs.VirtualFilesystem) *vfs.Filesystem { - fs := &kernfs.Filesystem{} - fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) - return fs.VFSFilesystem() +func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, err + } + fs := &filesystem{ + devMinor: devMinor, + } + fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs) + return fs.Filesystem.VFSFilesystem(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode) + b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino())) + return vfs.PrependPathSyntheticError{} } // inode implements kernfs.Inode. -// -// TODO(gvisor.dev/issue/1193): Device numbers. type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink @@ -67,11 +93,15 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } // NewDentry constructs and returns a sockfs dentry. -func NewDentry(creds *auth.Credentials, ino uint64) *vfs.Dentry { +// +// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). +func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry { + fs := mnt.Filesystem().Impl().(*filesystem) + // File mode matches net/socket.c:sock_alloc. filemode := linux.FileMode(linux.S_IFSOCK | 0600) i := &inode{} - i.Init(creds, ino, filemode) + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode) d := &kernfs.Dentry{} d.Init(i) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 00f7d6214..0af373604 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -37,6 +37,8 @@ type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. type filesystem struct { kernfs.Filesystem + + devMinor uint32 } // Name implements vfs.FilesystemType.Name. @@ -46,7 +48,14 @@ func (FilesystemType) Name() string { // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { - fs := &filesystem{} + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + fs := &filesystem{ + devMinor: devMinor, + } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() @@ -77,6 +86,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + // dir implements kernfs.Inode. type dir struct { kernfs.InodeAttrs @@ -90,7 +105,7 @@ type dir struct { func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { d := &dir{} - d.InodeAttrs.Init(creds, fs.NextIno(), linux.ModeDirectory|0755) + d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.dentry.Init(d) @@ -127,7 +142,7 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry { c := &cpuFile{maxCores: maxCores} - c.DynamicBytesFile.Init(creds, fs.NextIno(), c, mode) + c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) d := &kernfs.Dentry{} d.Init(c) return d diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index efc931468..405928bd0 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -63,6 +63,9 @@ type filesystem struct { // clock is a realtime clock used to set timestamps in file operations. clock time.Clock + // devMinor is the filesystem's minor device number. devMinor is immutable. + devMinor uint32 + // mu serializes changes to the Dentry tree. mu sync.RWMutex @@ -96,11 +99,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } - clock := time.RealtimeClockFromContext(ctx) - fs := filesystem{ - memFile: memFileProvider.MemoryFile(), - clock: clock, - } rootFileType := uint16(linux.S_IFDIR) newFSType := vfs.FilesystemType(&fstype) @@ -114,6 +112,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + clock := time.RealtimeClockFromContext(ctx) + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + clock: clock, + devMinor: devMinor, + } fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *dentry @@ -125,6 +133,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(creds, 01777).dentry default: + fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -132,6 +141,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { + fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // dentry implements vfs.DentryImpl. @@ -188,8 +198,8 @@ func (d *dentry) DecRef() { // inode represents a filesystem object. type inode struct { - // clock is a realtime clock used to set timestamps in file operations. - clock time.Clock + // fs is the owning filesystem. fs is immutable. + fs *filesystem // refs is a reference count. refs is accessed using atomic memory // operations. @@ -232,7 +242,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, if mode.FileType() == 0 { panic("file type is required in FileMode") } - i.clock = fs.clock + i.fs = fs i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) @@ -327,7 +337,8 @@ func (i *inode) statTo(stat *linux.Statx) { stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) - // TODO(gvisor.dev/issue/1197): Device number. + stat.DevMajor = linux.UNNAMED_MAJOR + stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS @@ -401,7 +412,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EINVAL } } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.atime, now) @@ -518,7 +529,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { if err := mnt.CheckBeginWrite(); err != nil { return } - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.atime, now) i.mu.Unlock() @@ -527,7 +538,7 @@ func (i *inode) touchAtime(mnt *vfs.Mount) { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() @@ -535,7 +546,7 @@ func (i *inode) touchCtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCMtime() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) @@ -545,7 +556,7 @@ func (i *inode) touchCMtime() { // Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds // inode.mu. func (i *inode) touchCMtimeLocked() { - now := i.clock.Now().Nanoseconds() + now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 271ea5faf..3617da8c6 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -373,7 +373,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("failed to initialize VFS: %v", err) } - pipeFilesystem := pipefs.NewFilesystem(&k.vfs) + pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create pipefs filesystem: %v", err) + } defer pipeFilesystem.DecRef() pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { @@ -381,7 +384,10 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount - socketFilesystem := sockfs.NewFilesystem(&k.vfs) + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) + if err != nil { + return fmt.Errorf("failed to create sockfs filesystem: %v", err) + } defer socketFilesystem.DecRef() socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index deedd35f7..e82d6cd1e 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,7 +26,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index a8278bffc..677743113 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&socketVFS2{}) func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &socketVFS2{ socketOpsCommon: socketOpsCommon{ diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 09ca00a4a..7212d8644 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/socket/netlink/provider_vfs2.go b/pkg/sentry/socket/netlink/provider_vfs2.go index dcd92b5cd..bb205be0d 100644 --- a/pkg/sentry/socket/netlink/provider_vfs2.go +++ b/pkg/sentry/socket/netlink/provider_vfs2.go @@ -16,7 +16,6 @@ package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -52,8 +51,7 @@ func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol vfsfd := &s.vfsfd mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index ccf9fcf5c..6129fb83d 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -27,7 +27,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index f7d9b2ff4..191970d41 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -18,7 +18,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -53,8 +52,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu } mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) s := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 941a91097..de2cc4bdf 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,7 +21,6 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", - "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 06d838868..45e109361 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -50,8 +49,7 @@ var _ = socket.SocketVFS2(&SocketVFS2{}) // returns a corresponding file description. func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) { mnt := t.Kernel().SocketMount() - fs := mnt.Filesystem().Impl().(*kernfs.Filesystem) - d := sockfs.NewDentry(t.Credentials(), fs.NextIno()) + d := sockfs.NewDentry(t.Credentials(), mnt) fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) if err != nil { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index adebaeefb..caf770fd5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -202,7 +202,7 @@ func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts St Ino: 1, Size: 0, Blocks: 0, - DevMajor: 0, + DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index bda5576fa..1e9dffc8f 100644 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go @@ -103,7 +103,7 @@ func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mo } // GetAnonBlockDevMinor allocates and returns an unused minor device number for -// an "anonymous" block device with major number 0. +// an "anonymous" block device with major number UNNAMED_MAJOR. func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) { vfs.anonBlockDevMinorMu.Lock() defer vfs.anonBlockDevMinorMu.Unlock() diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 8c8bad11c..f802bc9fb 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -334,7 +334,10 @@ func New(args Args) (*Loader, error) { if kernel.VFS2Enabled { // Set up host mount that will be used for imported fds. - hostFilesystem := hostvfs2.NewFilesystem(k.VFS()) + hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS()) + if err != nil { + return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) + } defer hostFilesystem.DecRef() hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { -- cgit v1.2.3 From 21b71395a6aa2eafbc4c59222574d56c2db2e23b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 8 May 2020 11:34:15 -0700 Subject: Pass flags to fsimpl/host.inode.open(). This has two effects: It makes flags passed to open("/proc/[pid]/fd/[hostfd]") effective, and it prevents imported pipes/sockets/character devices from being opened with O_NONBLOCK unconditionally (because the underlying host FD was set to non-blocking in ImportFD()). PiperOrigin-RevId: 310596062 --- pkg/sentry/fsimpl/host/host.go | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 55de9c438..36bceeaa4 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -53,13 +53,19 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs return nil, err } + // Get flags for the imported FD. + flags, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) + if err != nil { + return nil, err + } + fileMode := linux.FileMode(s.Mode) fileType := fileMode.FileType() // Determine if hostFD is seekable. If not, this syscall will return ESPIPE // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character // devices. - _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) + _, err = unix.Seek(hostFD, 0, linux.SEEK_CUR) seekable := err != syserror.ESPIPE i := &inode{ @@ -95,7 +101,7 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt) + return i.open(ctx, d.VFSDentry(), mnt, uint32(flags)) } // filesystemType implements vfs.FilesystemType. @@ -198,19 +204,6 @@ type inode struct { queue waiter.Queue } -// Note that these flags may become out of date, since they can be modified -// on the host, e.g. with fcntl. -func fileFlagsFromHostFD(fd int) (uint32, error) { - flags, err := unix.FcntlInt(uintptr(fd), syscall.F_GETFL, 0) - if err != nil { - log.Warningf("Failed to get file flags for donated FD %d: %v", fd, err) - return 0, err - } - // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. - flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND - return uint32(flags), nil -} - // CheckPermissions implements kernfs.Inode. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { var s syscall.Stat_t @@ -406,19 +399,19 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr if i.Mode().FileType() == linux.S_IFSOCK { return nil, syserror.ENXIO } - return i.open(ctx, vfsd, rp.Mount()) + return i.open(ctx, vfsd, rp.Mount(), opts.Flags) } -func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount) (*vfs.FileDescription, error) { +func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) { var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { return nil, err } fileType := s.Mode & linux.FileTypeMask - flags, err := fileFlagsFromHostFD(i.hostFD) - if err != nil { - return nil, err - } + + // Constrain flags to a subset we can handle. + // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. + flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND if fileType == syscall.S_IFSOCK { if i.isTTY { -- cgit v1.2.3 From 15de8cc9e0e7789c3d55595171b3272ec726931f Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 11 May 2020 16:13:14 -0700 Subject: Add fsimpl/gofer.InternalFilesystemOptions.OpenSocketsByConnecting. PiperOrigin-RevId: 311014995 --- pkg/sentry/fsimpl/gofer/filesystem.go | 27 +++++++++++++++++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 4 ++++ pkg/sentry/fsimpl/host/host.go | 41 +++++++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 4a32821bd..7ca0cf3f4 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -21,6 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/host" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" @@ -835,6 +837,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if d.isSynthetic() { return nil, syserror.ENXIO } + if d.fs.iopts.OpenSocketsByConnecting { + return d.connectSocketLocked(ctx, opts) + } case linux.S_IFIFO: if d.isSynthetic() { return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) @@ -843,10 +848,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return d.openSpecialFileLocked(ctx, mnt, opts) } +func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, err + } + fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ + HaveFlags: true, + Flags: opts.Flags, + }) + if err != nil { + fdObj.Close() + return nil, err + } + fdObj.Release() + return fd, nil +} + func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) - // Treat as a special file. This is done for non-synthetic pipes as well as - // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true. if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index e68e37ebc..1da8d5d82 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -221,6 +221,10 @@ type InternalFilesystemOptions struct { // which servers can handle only a single client and report failure if that // client disconnects. LeakConnection bool + + // If OpenSocketsByConnecting is true, silently translate attempts to open + // files identifying as sockets to connect RPCs. + OpenSocketsByConnecting bool } // Name implements vfs.FilesystemType.Name. diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 36bceeaa4..8caf55a1b 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -40,8 +40,20 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// ImportFD sets up and returns a vfs.FileDescription from a donated fd. -func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { +// NewFDOptions contains options to NewFD. +type NewFDOptions struct { + // If IsTTY is true, the file descriptor is a TTY. + IsTTY bool + + // If HaveFlags is true, use Flags for the new file description. Otherwise, + // the new file description will inherit flags from hostFD. + HaveFlags bool + Flags uint32 +} + +// NewFD returns a vfs.FileDescription representing the given host file +// descriptor. mnt must be Kernel.HostMount(). +func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) { fs, ok := mnt.Filesystem().Impl().(*filesystem) if !ok { return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl()) @@ -53,10 +65,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs return nil, err } - // Get flags for the imported FD. - flags, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) - if err != nil { - return nil, err + flags := opts.Flags + if !opts.HaveFlags { + // Get flags for the imported FD. + flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0) + if err != nil { + return nil, err + } + flags = uint32(flagsInt) } fileMode := linux.FileMode(s.Mode) @@ -65,13 +81,13 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // Determine if hostFD is seekable. If not, this syscall will return ESPIPE // (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character // devices. - _, err = unix.Seek(hostFD, 0, linux.SEEK_CUR) + _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR) seekable := err != syserror.ESPIPE i := &inode{ hostFD: hostFD, seekable: seekable, - isTTY: isTTY, + isTTY: opts.IsTTY, canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), ino: fs.NextIno(), @@ -101,7 +117,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs // i.open will take a reference on d. defer d.DecRef() - return i.open(ctx, d.VFSDentry(), mnt, uint32(flags)) + return i.open(ctx, d.VFSDentry(), mnt, flags) +} + +// ImportFD sets up and returns a vfs.FileDescription from a donated fd. +func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) { + return NewFD(ctx, mnt, hostFD, &NewFDOptions{ + IsTTY: isTTY, + }) } // filesystemType implements vfs.FilesystemType. -- cgit v1.2.3 From 94251aedb4abcf2498a4f6bc71f32d2847ac0f3e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 11 May 2020 20:02:09 -0700 Subject: Internal change. PiperOrigin-RevId: 311046755 --- pkg/sentry/fsimpl/gofer/filesystem.go | 35 ++++++++++++++++++--------------- pkg/sentry/fsimpl/gofer/special_file.go | 28 ++++++++++++++++++-------- 2 files changed, 39 insertions(+), 24 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 7ca0cf3f4..7f2181216 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -877,10 +877,15 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts if err != nil { return nil, err } + seekable := d.fileType() == linux.S_IFREG fd := &specialFileFD{ - handle: h, + handle: h, + seekable: seekable, } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { h.close(ctx) return nil, err } @@ -911,7 +916,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) + // Filter file creation flags and O_LARGEFILE out; the create RPC already + // has the semantics of O_CREAT|O_EXCL, while some servers will choke on + // O_LARGEFILE. + createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) return nil, err @@ -919,7 +928,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Then we need to walk to the file we just created to get a non-open fid // representing it, and to get its metadata. This must use d.file since, as // explained above, dirfile was invalidated by dirfile.Create(). - walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) + _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) if err != nil { openFile.close(ctx) if fdobj != nil { @@ -927,17 +936,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } - // Sanity-check that we walked to the file we created. - if createQID.Path != walkQID.Path { - // Probably due to concurrent remote filesystem mutation? - ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop) - nonOpenFile.close(ctx) - openFile.close(ctx) - if fdobj != nil { - fdobj.Close() - } - return nil, syserror.EAGAIN - } // Construct the new dentry. child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) @@ -983,16 +981,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { + seekable := child.fileType() == linux.S_IFREG fd := &specialFileFD{ handle: handle{ file: openFile, fd: -1, }, + seekable: seekable, } if fdobj != nil { fd.handle.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { fd.handle.close(ctx) return nil, err } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 507e0e276..a464e6a94 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -33,13 +33,14 @@ import ( type specialFileFD struct { fileDescription - // handle is immutable. + // handle is used for file I/O. handle is immutable. handle handle - // off is the file offset. off is protected by mu. (POSIX 2.9.7 only - // requires operations using the file offset to be atomic for regular files - // and symlinks; however, since specialFileFD may be used for regular - // files, we apply this atomicity unconditionally.) + // seekable is true if this file description represents a file for which + // file offset is significant, i.e. a regular file. seekable is immutable. + seekable bool + + // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex off int64 } @@ -63,7 +64,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { @@ -91,6 +92,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Read implements vfs.FileDescriptionImpl.Read. func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if !fd.seekable { + return fd.PRead(ctx, dst, -1, opts) + } + fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n @@ -100,14 +105,14 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - if offset < 0 { + if fd.seekable && offset < 0 { return 0, syserror.EINVAL } if opts.Flags != 0 { return 0, syserror.EOPNOTSUPP } - if fd.dentry().fileType() == linux.S_IFREG { + if fd.seekable { limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err @@ -130,6 +135,10 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + if !fd.seekable { + return fd.PWrite(ctx, src, -1, opts) + } + fd.mu.Lock() n, err := fd.PWrite(ctx, src, fd.off, opts) fd.off += n @@ -139,6 +148,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if !fd.seekable { + return 0, syserror.ESPIPE + } fd.mu.Lock() defer fd.mu.Unlock() switch whence { -- cgit v1.2.3 From d84607762889d999f93b89e64e09d2a4dda63bf7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Wed, 13 May 2020 10:51:50 -0700 Subject: Enable overlayfs_stale_read by default for runsc. Linux 4.18 and later make reads and writes coherent between pre-copy-up and post-copy-up FDs representing the same file on an overlay filesystem. However, memory mappings remain incoherent: - Documentation/filesystems/overlayfs.rst, "Non-standard behavior": "If a file residing on a lower layer is opened for read-only and then memory mapped with MAP_SHARED, then subsequent changes to the file are not reflected in the memory mapping." - fs/overlay/file.c:ovl_mmap() passes through to the underlying FD without any management of coherence in the overlay. - Experimentally on Linux 5.2: ``` $ cat mmap_cat_page.c #include #include #include #include #include #include int main(int argc, char **argv) { if (argc < 2) { errx(1, "syntax: %s [FILE]", argv[0]); } const int fd = open(argv[1], O_RDONLY); if (fd < 0) { err(1, "open(%s)", argv[1]); } const size_t page_size = sysconf(_SC_PAGE_SIZE); void* page = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0); if (page == MAP_FAILED) { err(1, "mmap"); } for (;;) { write(1, page, strnlen(page, page_size)); if (getc(stdin) == EOF) { break; } } return 0; } $ gcc -O2 -o mmap_cat_page mmap_cat_page.c $ mkdir lowerdir upperdir workdir overlaydir $ echo old > lowerdir/file $ sudo mount -t overlay -o "lowerdir=lowerdir,upperdir=upperdir,workdir=workdir" none overlaydir $ ./mmap_cat_page overlaydir/file old ^Z [1]+ Stopped ./mmap_cat_page overlaydir/file $ echo new > overlaydir/file $ cat overlaydir/file new $ fg ./mmap_cat_page overlaydir/file old ``` Therefore, while the VFS1 gofer client's behavior of reopening read FDs is only necessary pre-4.18, replacing existing memory mappings (in both sentry and application address spaces) with mappings of the new FD is required regardless of kernel version, and this latter behavior is common to both VFS1 and VFS2. Re-document accordingly, and change the runsc flag to enabled by default. New test: - Before this CL: https://source.cloud.google.com/results/invocations/5b222d2c-e918-4bae-afc4-407f5bac509b - After this CL: https://source.cloud.google.com/results/invocations/f28c747e-d89c-4d8c-a461-602b33e71aab PiperOrigin-RevId: 311361267 --- images/hostoverlaytest/Dockerfile | 7 +++ images/hostoverlaytest/test.c | 88 +++++++++++++++++++++++++++++++++++++ images/hostoverlaytest/testfile.txt | 1 + pkg/sentry/fs/gofer/fs.go | 3 +- pkg/sentry/fsimpl/gofer/gofer.go | 9 ++-- runsc/boot/config.go | 6 ++- runsc/container/container_test.go | 37 ---------------- runsc/main.go | 2 +- test/e2e/integration_test.go | 15 +++++++ 9 files changed, 123 insertions(+), 45 deletions(-) create mode 100644 images/hostoverlaytest/Dockerfile create mode 100644 images/hostoverlaytest/test.c create mode 100644 images/hostoverlaytest/testfile.txt (limited to 'pkg/sentry/fsimpl') diff --git a/images/hostoverlaytest/Dockerfile b/images/hostoverlaytest/Dockerfile new file mode 100644 index 000000000..d83439e9c --- /dev/null +++ b/images/hostoverlaytest/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:bionic + +WORKDIR /root +COPY . . + +RUN apt-get update && apt-get install -y gcc +RUN gcc -O2 -o test test.c diff --git a/images/hostoverlaytest/test.c b/images/hostoverlaytest/test.c new file mode 100644 index 000000000..088f90746 --- /dev/null +++ b/images/hostoverlaytest/test.c @@ -0,0 +1,88 @@ +#include +#include +#include +#include +#include +#include + +int main(int argc, char** argv) { + const char kTestFilePath[] = "testfile.txt"; + const char kOldFileData[] = "old data\n"; + const char kNewFileData[] = "new data\n"; + const size_t kPageSize = sysconf(_SC_PAGE_SIZE); + + // Open a file that already exists in a host overlayfs lower layer. + const int fd_rdonly = open(kTestFilePath, O_RDONLY); + if (fd_rdonly < 0) { + err(1, "open(%s, O_RDONLY)", kTestFilePath); + } + + // Check that the file's initial contents are what we expect when read via + // syscall. + char oldbuf[sizeof(kOldFileData)] = {}; + ssize_t n = pread(fd_rdonly, oldbuf, sizeof(oldbuf), 0); + if (n < 0) { + err(1, "initial pread"); + } + if (n != strlen(kOldFileData)) { + errx(1, "short initial pread (%ld/%lu bytes)", n, strlen(kOldFileData)); + } + if (strcmp(oldbuf, kOldFileData) != 0) { + errx(1, "initial pread returned wrong data: %s", oldbuf); + } + + // Check that the file's initial contents are what we expect when read via + // memory mapping. + void* page = mmap(NULL, kPageSize, PROT_READ, MAP_SHARED, fd_rdonly, 0); + if (page == MAP_FAILED) { + err(1, "mmap"); + } + if (strcmp(page, kOldFileData) != 0) { + errx(1, "mapping contains wrong initial data: %s", (const char*)page); + } + + // Open the same file writably, causing host overlayfs to copy it up, and + // replace its contents. + const int fd_rdwr = open(kTestFilePath, O_RDWR); + if (fd_rdwr < 0) { + err(1, "open(%s, O_RDWR)", kTestFilePath); + } + n = write(fd_rdwr, kNewFileData, strlen(kNewFileData)); + if (n < 0) { + err(1, "write"); + } + if (n != strlen(kNewFileData)) { + errx(1, "short write (%ld/%lu bytes)", n, strlen(kNewFileData)); + } + if (ftruncate(fd_rdwr, strlen(kNewFileData)) < 0) { + err(1, "truncate"); + } + + int failed = 0; + + // Check that syscalls on the old FD return updated contents. (Before Linux + // 4.18, this requires that runsc use a post-copy-up FD to service the read.) + char newbuf[sizeof(kNewFileData)] = {}; + n = pread(fd_rdonly, newbuf, sizeof(newbuf), 0); + if (n < 0) { + err(1, "final pread"); + } + if (n != strlen(kNewFileData)) { + warnx("short final pread (%ld/%lu bytes)", n, strlen(kNewFileData)); + failed = 1; + } else if (strcmp(newbuf, kNewFileData) != 0) { + warnx("final pread returned wrong data: %s", newbuf); + failed = 1; + } + + // Check that the memory mapping of the old FD has been updated. (Linux + // overlayfs does not do this, so regardless of kernel version this requires + // that runsc replace existing memory mappings with mappings of a + // post-copy-up FD.) + if (strcmp(page, kNewFileData) != 0) { + warnx("mapping contains wrong final data: %s", (const char*)page); + failed = 1; + } + + return failed; +} diff --git a/images/hostoverlaytest/testfile.txt b/images/hostoverlaytest/testfile.txt new file mode 100644 index 000000000..e4188c841 --- /dev/null +++ b/images/hostoverlaytest/testfile.txt @@ -0,0 +1 @@ +old data diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go index 9d41fcbdb..8ae2d78d7 100644 --- a/pkg/sentry/fs/gofer/fs.go +++ b/pkg/sentry/fs/gofer/fs.go @@ -60,8 +60,7 @@ const ( limitHostFDTranslationKey = "limit_host_fd_translation" // overlayfsStaleRead if present closes cached readonly file after the first - // write. This is done to workaround a limitation of overlayfs in kernels - // before 4.19 where open FDs are not updated after the file is copied up. + // write. This is done to workaround a limitation of Linux overlayfs. overlayfsStaleRead = "overlayfs_stale_read" ) diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 1da8d5d82..353e2cf5b 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -143,9 +143,12 @@ type filesystemOptions struct { // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote // filesystem may not be coherent with writable host FDs opened later, so - // mappings of the former must be replaced by mappings of the latter. This - // is usually only the case when the remote filesystem is an overlayfs - // mount on Linux < 4.19. + // all uses of the former must be replaced by uses of the latter. This is + // usually only the case when the remote filesystem is a Linux overlayfs + // mount. (Prior to Linux 4.18, patch series centered on commit + // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were + // incoherent between pre-copy-up and post-copy-up FDs; after that patch + // series, only memory mappings are incoherent.) overlayfsStaleRead bool // If regularFilesUseSpecialFileFD is true, application FDs representing diff --git a/runsc/boot/config.go b/runsc/boot/config.go index 6d6a705f8..bcec7e4db 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -241,8 +241,10 @@ type Config struct { // ReferenceLeakMode sets reference leak check mode ReferenceLeakMode refs.LeakMode - // OverlayfsStaleRead causes cached FDs to reopen after a file is opened for - // write to workaround overlayfs limitation on kernels before 4.19. + // OverlayfsStaleRead instructs the sandbox to assume that the root mount + // is on a Linux overlayfs mount, which does not necessarily preserve + // coherence between read-only and subsequent writable file descriptors + // representing the "same" file. OverlayfsStaleRead bool // TestOnlyAllowRunAsCurrentUserWithoutChroot should only be used in diff --git a/runsc/container/container_test.go b/runsc/container/container_test.go index acf988aa0..7ba301331 100644 --- a/runsc/container/container_test.go +++ b/runsc/container/container_test.go @@ -2126,43 +2126,6 @@ func TestNetRaw(t *testing.T) { } } -// TestOverlayfsStaleRead most basic test that '--overlayfs-stale-read' works. -func TestOverlayfsStaleRead(t *testing.T) { - conf := testutil.TestConfig(t) - conf.OverlayfsStaleRead = true - - in, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.in") - if err != nil { - t.Fatalf("ioutil.TempFile() failed: %v", err) - } - defer in.Close() - if _, err := in.WriteString("stale data"); err != nil { - t.Fatalf("in.Write() failed: %v", err) - } - - out, err := ioutil.TempFile(testutil.TmpDir(), "stale-read.out") - if err != nil { - t.Fatalf("ioutil.TempFile() failed: %v", err) - } - defer out.Close() - - const want = "foobar" - cmd := fmt.Sprintf("cat %q >&2 && echo %q> %q && cp %q %q", in.Name(), want, in.Name(), in.Name(), out.Name()) - spec := testutil.NewSpecWithArgs("/bin/bash", "-c", cmd) - if err := run(spec, conf); err != nil { - t.Fatalf("Error running container: %v", err) - } - - gotBytes, err := ioutil.ReadAll(out) - if err != nil { - t.Fatalf("out.Read() failed: %v", err) - } - got := strings.TrimSpace(string(gotBytes)) - if want != got { - t.Errorf("Wrong content in out file, got: %q. want: %q", got, want) - } -} - // TestTTYField checks TTY field returned by container.Processes(). func TestTTYField(t *testing.T) { stop := testutil.StartReaper() diff --git a/runsc/main.go b/runsc/main.go index 0625a06e0..920ed84a5 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -76,7 +76,7 @@ var ( fileAccess = flag.String("file-access", "exclusive", "specifies which filesystem to use for the root mount: exclusive (default), shared. Volume mounts are always shared.") fsGoferHostUDS = flag.Bool("fsgofer-host-uds", false, "allow the gofer to mount Unix Domain Sockets.") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable overlay. All modifications are stored in memory inside the sandbox.") - overlayfsStaleRead = flag.Bool("overlayfs-stale-read", false, "reopen cached FDs after a file is opened for write to workaround overlayfs limitation on kernels before 4.19.") + overlayfsStaleRead = flag.Bool("overlayfs-stale-read", true, "assume root mount is an overlay filesystem") watchdogAction = flag.String("watchdog-action", "log", "sets what action the watchdog takes when triggered: log (default), panic.") panicSignal = flag.Int("panic-signal", -1, "register signal handling that panics. Usually set to SIGUSR2(12) to troubleshoot hangs. -1 disables it.") profile = flag.Bool("profile", false, "prepares the sandbox to use Golang profiler. Note that enabling profiler loosens the seccomp protection added to the sandbox (DO NOT USE IN PRODUCTION).") diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 404e37689..ff856883a 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -361,6 +361,21 @@ func TestTmpFile(t *testing.T) { } } +// TestHostOverlayfsCopyUp tests that the --overlayfs-stale-read option causes +// runsc to hide the incoherence of FDs opened before and after overlayfs +// copy-up on the host. +func TestHostOverlayfsCopyUp(t *testing.T) { + d := dockerutil.MakeDocker(t) + defer d.CleanUp() + + if _, err := d.Run(dockerutil.RunOpts{ + Image: "hostoverlaytest", + WorkDir: "/root", + }, "./test"); err != nil { + t.Fatalf("docker run failed: %v", err) + } +} + func TestMain(m *testing.M) { dockerutil.EnsureSupportedDockerVersion() flag.Parse() -- cgit v1.2.3 From db655f020ea556524f0e341e538e81c16d4f95e7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 13 May 2020 17:35:04 -0700 Subject: Resolve remaining TODOs for tmpfs. Closes #1197 PiperOrigin-RevId: 311438223 --- pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 25 +++-- pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 136 ----------------------- pkg/sentry/fsimpl/tmpfs/stat_test.go | 2 - pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 156 +++++++++++++++++++++++++++ test/syscalls/linux/BUILD | 1 + test/syscalls/linux/socket.cc | 9 +- test/syscalls/linux/symlink.cc | 25 +++++ 8 files changed, 208 insertions(+), 147 deletions(-) create mode 100644 pkg/sentry/fsimpl/tmpfs/tmpfs_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index a2d9649e7..9a076ad71 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -96,6 +96,7 @@ go_test( "pipe_test.go", "regular_file_test.go", "stat_test.go", + "tmpfs_test.go", ], library = ":tmpfs", deps = [ diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 36ffcb592..e0ad82769 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -16,6 +16,7 @@ package tmpfs import ( "fmt" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -24,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) // Sync implements vfs.FilesystemImpl.Sync. @@ -76,8 +78,8 @@ afterSymlink: return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. + atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -361,8 +363,8 @@ afterTrailingSymlink: } // Do we need to resolve a trailing symlink? if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { - // TODO(gvisor.dev/issue/1197): Symlink traversals updates - // access time. + // Symlink traversal updates access time. + atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -636,12 +638,19 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - _, err := resolveLocked(rp) - if err != nil { + if _, err := resolveLocked(rp); err != nil { return linux.Statfs{}, err } - // TODO(gvisor.dev/issue/1197): Actually implement statfs. - return linux.Statfs{}, syserror.ENOSYS + statfs := linux.Statfs{ + Type: linux.TMPFS_MAGIC, + BlockSize: usermem.PageSize, + FragmentSize: usermem.PageSize, + NameLength: linux.NAME_MAX, + // TODO(b/29637826): Allow configuring a tmpfs size and enforce it. + Blocks: 0, + BlocksFree: 0, + } + return statfs, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 0399725cf..f2bc96d51 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -18,152 +18,16 @@ import ( "bytes" "fmt" "io" - "sync/atomic" "testing" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// nextFileID is used to generate unique file names. -var nextFileID int64 - -// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error -// is not nil, then cleanup should be called when the root is no longer needed. -func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { - creds := auth.CredentialsFromContext(ctx) - - vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) - } - - vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ - AllowUserMount: true, - }) - mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) - if err != nil { - return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) - } - root := mntns.Root() - return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() - }, nil -} - -// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If -// the returned err is not nil, then cleanup should be called when the FD is no -// longer needed. -func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the file that will be write/read. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(filename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, - Mode: linux.ModeRegular | mode, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) - } - - return fd, cleanup, nil -} - -// newDirFD is like newFileFD, but for directories. -func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the dir. - if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.MkdirOptions{ - Mode: linux.ModeDirectory | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) - } - - // Open the dir and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(dirname), - }, &vfs.OpenOptions{ - Flags: linux.O_RDONLY | linux.O_DIRECTORY, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) - } - - return fd, cleanup, nil -} - -// newPipeFD is like newFileFD, but for pipes. -func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { - creds := auth.CredentialsFromContext(ctx) - vfsObj, root, cleanup, err := newTmpfsRoot(ctx) - if err != nil { - return nil, nil, err - } - - pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1)) - - // Create the pipe. - if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.MknodOptions{ - Mode: linux.ModeNamedPipe | mode, - }); err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err) - } - - // Open the pipe and return it. - fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ - Root: root, - Start: root, - Path: fspath.Parse(pipename), - }, &vfs.OpenOptions{ - Flags: linux.O_RDWR, - }) - if err != nil { - cleanup() - return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err) - } - - return fd, cleanup, nil -} - // Test that we can write some data to a file and read it back.` func TestSimpleWriteRead(t *testing.T) { ctx := contexttest.Context(t) diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index 60c2c980e..f52755092 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -29,7 +29,6 @@ func TestStatAfterCreate(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( @@ -175,7 +174,6 @@ func TestSetStat(t *testing.T) { mode := linux.FileMode(0644) // Run with different file types. - // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets. for _, typ := range []string{"file", "dir", "pipe"} { t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) { var ( diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go new file mode 100644 index 000000000..a240fb276 --- /dev/null +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -0,0 +1,156 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// nextFileID is used to generate unique file names. +var nextFileID int64 + +// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error +// is not nil, then cleanup should be called when the root is no longer needed. +func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) { + creds := auth.CredentialsFromContext(ctx) + + vfsObj := &vfs.VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) + } + + vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + }) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) + if err != nil { + return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err) + } + root := mntns.Root() + return vfsObj, root, func() { + root.DecRef() + mntns.DecRef() + }, nil +} + +// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If +// the returned err is not nil, then cleanup should be called when the FD is no +// longer needed. +func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the file that will be write/read. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(filename), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: linux.ModeRegular | mode, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err) + } + + return fd, cleanup, nil +} + +// newDirFD is like newFileFD, but for directories. +func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1)) + + // Create the dir. + if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.MkdirOptions{ + Mode: linux.ModeDirectory | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err) + } + + // Open the dir and return it. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(dirname), + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err) + } + + return fd, cleanup, nil +} + +// newPipeFD is like newFileFD, but for pipes. +func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) { + creds := auth.CredentialsFromContext(ctx) + vfsObj, root, cleanup, err := newTmpfsRoot(ctx) + if err != nil { + return nil, nil, err + } + + name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1)) + + if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.MknodOptions{ + Mode: linux.ModeNamedPipe | mode, + }); err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err) + } + + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(name), + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR, + }) + if err != nil { + cleanup() + return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err) + } + + return fd, cleanup, nil +} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index adf259bba..e6bc63474 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -3288,6 +3288,7 @@ cc_binary( "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", + "@com_google_absl//absl/time", gtest, "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/socket.cc b/test/syscalls/linux/socket.cc index 703d594a2..afa59c1da 100644 --- a/test/syscalls/linux/socket.cc +++ b/test/syscalls/linux/socket.cc @@ -61,7 +61,7 @@ TEST(SocketTest, ProtocolInet) { } } -TEST(SocketTest, UnixSocketFileMode) { +TEST(SocketTest, UnixSocketStat) { // TODO(gvisor.dev/issue/1624): Re-enable this test once VFS1 is deleted. It // should pass in VFS2. SKIP_IF(IsRunningOnGvisor()); @@ -83,7 +83,14 @@ TEST(SocketTest, UnixSocketFileMode) { struct stat statbuf = {}; ASSERT_THAT(stat(addr.sun_path, &statbuf), SyscallSucceeds()); + + // Mode should be S_IFSOCK. EXPECT_EQ(statbuf.st_mode, S_IFSOCK | sock_perm & ~mask); + + // Timestamps should be equal and non-zero. + EXPECT_NE(statbuf.st_atime, 0); + EXPECT_EQ(statbuf.st_atime, statbuf.st_mtime); + EXPECT_EQ(statbuf.st_atime, statbuf.st_ctime); } TEST(SocketTest, UnixConnectNeedsWritePerm) { diff --git a/test/syscalls/linux/symlink.cc b/test/syscalls/linux/symlink.cc index 03ee1250d..a17ff62e9 100644 --- a/test/syscalls/linux/symlink.cc +++ b/test/syscalls/linux/symlink.cc @@ -20,6 +20,7 @@ #include #include "gtest/gtest.h" +#include "absl/time/clock.h" #include "test/util/capability_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" @@ -272,6 +273,30 @@ TEST(SymlinkTest, ChmodSymlink) { EXPECT_EQ(FilePermission(newpath), 0777); } +// Test that following a symlink updates the atime on the symlink. +TEST(SymlinkTest, FollowUpdatesATime) { + const auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string link = NewTempAbsPath(); + EXPECT_THAT(symlink(file.path().c_str(), link.c_str()), SyscallSucceeds()); + + // Lstat the symlink. + struct stat st_before_follow; + ASSERT_THAT(lstat(link.c_str(), &st_before_follow), SyscallSucceeds()); + + // Let the clock advance. + absl::SleepFor(absl::Seconds(1)); + + // Open the file via the symlink. + int fd; + ASSERT_THAT(fd = open(link.c_str(), O_RDWR, 0666), SyscallSucceeds()); + FileDescriptor fd_closer(fd); + + // Lstat the symlink again, and check that atime is updated. + struct stat st_after_follow; + ASSERT_THAT(lstat(link.c_str(), &st_after_follow), SyscallSucceeds()); + EXPECT_LT(st_before_follow.st_atime, st_after_follow.st_atime); +} + class ParamSymlinkTest : public ::testing::TestWithParam {}; // Test that creating an existing symlink with creat will create the target. -- cgit v1.2.3 From 47dfba76616a69887f0d5a4be6eb82b5dc5d0f52 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Thu, 14 May 2020 09:34:21 -0700 Subject: Port memfd_create to vfs2 and finish implementation of file seals. Closes #2612. PiperOrigin-RevId: 311548074 --- pkg/sentry/fsimpl/tmpfs/BUILD | 2 - pkg/sentry/fsimpl/tmpfs/filesystem.go | 21 +++++++++- pkg/sentry/fsimpl/tmpfs/regular_file.go | 42 +++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/stat_test.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 38 ++++++++++++++++- pkg/sentry/kernel/BUILD | 1 + pkg/sentry/kernel/kernel.go | 25 +++++++++++ pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/fd.go | 10 +++++ pkg/sentry/syscalls/linux/vfs2/memfd.go | 63 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- 12 files changed, 203 insertions(+), 7 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/memfd.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 9a076ad71..007be1572 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -52,7 +52,6 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", @@ -106,7 +105,6 @@ go_test( "//pkg/sentry/contexttest", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", - "//pkg/sentry/kernel/contexttest", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e0ad82769..80fa7b29d 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -772,5 +772,24 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.mu.RLock() defer fs.mu.RUnlock() - return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + if d.name != "" { + // This must be an anonymous memfd file. + b.PrependComponent("/" + d.name) + return vfs.PrependPathSyntheticError{} + } + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 57e5e28ec..3f433d666 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -88,6 +88,7 @@ type regularFile struct { func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { file := ®ularFile{ memFile: fs.memFile, + seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, creds, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory @@ -577,3 +578,44 @@ exitLoop: return done, retErr } + +// GetSeals returns the current set of seals on a memfd inode. +func GetSeals(fd *vfs.FileDescription) (uint32, error) { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return 0, syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + return rf.seals, nil +} + +// AddSeals adds new file seals to a memfd inode. +func AddSeals(fd *vfs.FileDescription, val uint32) error { + f, ok := fd.Impl().(*regularFileFD) + if !ok { + return syserror.EINVAL + } + rf := f.inode().impl.(*regularFile) + rf.mapsMu.Lock() + defer rf.mapsMu.Unlock() + rf.dataMu.RLock() + defer rf.dataMu.RUnlock() + + if rf.seals&linux.F_SEAL_SEAL != 0 { + // Seal applied which prevents addition of any new seals. + return syserror.EPERM + } + + // F_SEAL_WRITE can only be added if there are no active writable maps. + if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { + if rf.writableMappingPages > 0 { + return syserror.EBUSY + } + } + + // Seals can only be added, never removed. + rf.seals |= val + return nil +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index f2bc96d51..64e1c40ad 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -21,8 +21,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go index f52755092..f7ee4aab2 100644 --- a/pkg/sentry/fsimpl/tmpfs/stat_test.go +++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go @@ -19,8 +19,8 @@ import ( "testing" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" - "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest" "gvisor.dev/gvisor/pkg/sentry/vfs" ) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 405928bd0..1e781aecd 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -94,7 +94,7 @@ type FilesystemOpts struct { } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. -func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") @@ -139,6 +139,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return &fs.vfsfs, &root.vfsd, nil } +// NewFilesystem returns a new tmpfs filesystem. +func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { + return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) @@ -658,3 +663,34 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) } + +// NewMemfd creates a new tmpfs regular file and file description that can back +// an anonymous fd created by memfd_create. +func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { + fs, ok := mount.Filesystem().Impl().(*filesystem) + if !ok { + panic("NewMemfd() called with non-tmpfs mount") + } + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with + // S_IRWXUGO. + mode := linux.FileMode(0777) + inode := fs.newRegularFile(creds, mode) + rf := inode.impl.(*regularFile) + if allowSeals { + rf.seals = 0 + } + + d := fs.newDentry(inode) + defer d.DecRef() + d.name = name + + // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with + // FMODE_READ | FMODE_WRITE. + var fd regularFileFD + flags := uint32(linux.O_RDWR) + if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 8104f50f3..a28eab8b8 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -173,6 +173,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/hostcpu", "//pkg/sentry/inet", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 3617da8c6..5efeb3767 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -53,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -259,6 +260,10 @@ type Kernel struct { // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount + // shmMount is the Mount used for anonymous files created by the + // memfd_create() syscalls. It is analagous to Linux's shm_mnt. + shmMount *vfs.Mount + // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. There are several cases where a socket dentry will // not be contained in socketMount: @@ -330,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error { if args.Timekeeper == nil { return fmt.Errorf("Timekeeper is nil") } + if args.Timekeeper.clocks == nil { + return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") } @@ -384,6 +392,18 @@ func (k *Kernel) Init(args InitKernelArgs) error { } k.pipeMount = pipeMount + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + if err != nil { + return fmt.Errorf("failed to create tmpfs filesystem: %v", err) + } + defer tmpfsFilesystem.DecRef() + defer tmpfsRoot.DecRef() + shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) + if err != nil { + return fmt.Errorf("failed to create tmpfs mount: %v", err) + } + k.shmMount = shmMount + socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) @@ -1656,6 +1676,11 @@ func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } +// ShmMount returns the tmpfs mount. +func (k *Kernel) ShmMount() *vfs.Mount { + return k.shmMount +} + // SocketMount returns the sockfs mount. func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c32f942fb..f882ef840 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -13,6 +13,7 @@ go_library( "fscontext.go", "getdents.go", "ioctl.go", + "memfd.go", "mmap.go", "path.go", "pipe.go", @@ -43,6 +44,7 @@ go_library( "//pkg/sentry/fsimpl/pipefs", "//pkg/sentry/fsimpl/signalfd", "//pkg/sentry/fsimpl/timerfd", + "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/kernel/pipe", diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 8181d80f4..ca0f7fd1e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,6 +17,7 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" @@ -157,6 +158,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, syserror.EBADF } return uintptr(pipefile.PipeSize()), nil, nil + case linux.F_GET_SEALS: + val, err := tmpfs.GetSeals(file) + return uintptr(val), nil, err + case linux.F_ADD_SEALS: + if !file.IsWritable() { + return 0, nil, syserror.EPERM + } + err := tmpfs.AddSeals(file, args[2].Uint()) + return 0, nil, err default: // TODO(gvisor.dev/issue/1623): Everything else is not yet supported. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go new file mode 100644 index 000000000..bbe248d17 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -0,0 +1,63 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +const ( + memfdPrefix = "memfd:" + memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix) + memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING) +) + +// MemfdCreate implements the linux syscall memfd_create(2). +func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Uint() + + if flags&^memfdAllFlags != 0 { + // Unknown bits in flags. + return 0, nil, syserror.EINVAL + } + + allowSeals := flags&linux.MFD_ALLOW_SEALING != 0 + cloExec := flags&linux.MFD_CLOEXEC != 0 + + name, err := t.CopyInString(addr, memfdMaxNameLen) + if err != nil { + return 0, nil, err + } + + shmMount := t.Kernel().ShmMount() + file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + if err != nil { + return 0, nil, err + } + + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: cloExec, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9c04677f1..ec8da7f06 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -158,7 +158,7 @@ func Override() { s.Table[306] = syscalls.Supported("syncfs", Syncfs) s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg) s.Table[316] = syscalls.Supported("renameat2", Renameat2) - delete(s.Table, 319) // memfd_create + s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate) s.Table[322] = syscalls.Supported("execveat", Execveat) s.Table[327] = syscalls.Supported("preadv2", Preadv2) s.Table[328] = syscalls.Supported("pwritev2", Pwritev2) -- cgit v1.2.3 From fb7e5f1676ad9e6e7e83312e09fe55f91ade41bc Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 14 May 2020 20:08:25 -0700 Subject: Make utimes_test pass on VFS2. PiperOrigin-RevId: 311657502 --- pkg/sentry/fsimpl/gofer/gofer.go | 4 +- pkg/sentry/syscalls/linux/vfs2/setstat.go | 123 +++++++++++++++++++----------- pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- test/syscalls/linux/utimes.cc | 33 ++++---- 4 files changed, 101 insertions(+), 61 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 353e2cf5b..ebf063a58 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -869,8 +869,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin Size: stat.Mask&linux.STATX_SIZE != 0, ATime: stat.Mask&linux.STATX_ATIME != 0, MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, }, p9.SetAttr{ Permissions: p9.FileMode(stat.Mode), UID: p9.UID(stat.UID), diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 4e61f1452..09ecfed26 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -246,73 +246,104 @@ func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - opts := vfs.SetStatOptions{ - Stat: linux.Statx{ - Mask: linux.STATX_ATIME | linux.STATX_MTIME, - }, - } - if timesAddr == 0 { - opts.Stat.Atime.Nsec = linux.UTIME_NOW - opts.Stat.Mtime.Nsec = linux.UTIME_NOW - } else { - var times [2]linux.Timeval - if _, err := t.CopyIn(timesAddr, ×); err != nil { - return 0, nil, err - } - opts.Stat.Atime = linux.StatxTimestamp{ - Sec: times[0].Sec, - Nsec: uint32(times[0].Usec * 1000), - } - opts.Stat.Mtime = linux.StatxTimestamp{ - Sec: times[1].Sec, - Nsec: uint32(times[1].Usec * 1000), - } + var opts vfs.SetStatOptions + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { + return 0, nil, err } return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) } -// Utimensat implements Linux syscall utimensat(2). -func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { +// Futimesat implements Linux syscall futimesat(2). +func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() - flags := args[3].Int() - if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { - return 0, nil, syserror.EINVAL - } - - path, err := copyInPath(t, pathAddr) - if err != nil { - return 0, nil, err + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath } var opts vfs.SetStatOptions - if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { + if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } - return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts) + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts) } -// Futimens implements Linux syscall futimens(2). -func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { - fd := args[0].Int() - timesAddr := args[1].Pointer() - - file := t.GetFileVFS2(fd) - if file == nil { - return 0, nil, syserror.EBADF +func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { + if timesAddr == 0 { + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime.Nsec = linux.UTIME_NOW + opts.Stat.Mtime.Nsec = linux.UTIME_NOW + return nil } - defer file.DecRef() + var times [2]linux.Timeval + if _, err := t.CopyIn(timesAddr, ×); err != nil { + return err + } + if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { + return syserror.EINVAL + } + opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME + opts.Stat.Atime = linux.StatxTimestamp{ + Sec: times[0].Sec, + Nsec: uint32(times[0].Usec * 1000), + } + opts.Stat.Mtime = linux.StatxTimestamp{ + Sec: times[1].Sec, + Nsec: uint32(times[1].Usec * 1000), + } + return nil +} +// Utimensat implements Linux syscall utimensat(2). +func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + dirfd := args[0].Int() + pathAddr := args[1].Pointer() + timesAddr := args[2].Pointer() + flags := args[3].Int() + + // Linux requires that the UTIME_OMIT check occur before checking path or + // flags. var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { return 0, nil, err } + if opts.Stat.Mask == 0 { + return 0, nil, nil + } - return 0, nil, file.SetStat(t, opts) + if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { + return 0, nil, syserror.EINVAL + } + + // "If filename is NULL and dfd refers to an open file, then operate on the + // file. Otherwise look up filename, possibly using dfd as a starting + // point." - fs/utimes.c + var path fspath.Path + shouldAllowEmptyPath := allowEmptyPath + if dirfd == linux.AT_FDCWD || pathAddr != 0 { + var err error + path, err = copyInPath(t, pathAddr) + if err != nil { + return 0, nil, err + } + shouldAllowEmptyPath = disallowEmptyPath + } + + return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error { @@ -327,6 +358,9 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op return err } if times[0].Nsec != linux.UTIME_OMIT { + if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { + return syserror.EINVAL + } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, @@ -334,6 +368,9 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op } } if times[1].Nsec != linux.UTIME_OMIT { + if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { + return syserror.EINVAL + } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index ec8da7f06..a332d01bd 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -123,7 +123,7 @@ func Override() { s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) s.Table[259] = syscalls.Supported("mknodat", Mknodat) s.Table[260] = syscalls.Supported("fchownat", Fchownat) - s.Table[261] = syscalls.Supported("futimens", Futimens) + s.Table[261] = syscalls.Supported("futimesat", Futimesat) s.Table[262] = syscalls.Supported("newfstatat", Newfstatat) s.Table[263] = syscalls.Supported("unlinkat", Unlinkat) s.Table[264] = syscalls.Supported("renameat", Renameat) diff --git a/test/syscalls/linux/utimes.cc b/test/syscalls/linux/utimes.cc index 22e6d1a85..e647d2896 100644 --- a/test/syscalls/linux/utimes.cc +++ b/test/syscalls/linux/utimes.cc @@ -48,12 +48,15 @@ void TimeBoxed(absl::Time* before, absl::Time* after, // filesystems set it to 1, so we don't do any truncation. struct timespec ts; EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); - *before = absl::TimeFromTimespec(ts); + // FIXME(b/132819225): gVisor filesystem timestamps inconsistently use the + // internal or host clock, which may diverge slightly. Allow some slack on + // times to account for the difference. + *before = absl::TimeFromTimespec(ts) - absl::Seconds(1); fn(); EXPECT_THAT(clock_gettime(CLOCK_REALTIME_COARSE, &ts), SyscallSucceeds()); - *after = absl::TimeFromTimespec(ts); + *after = absl::TimeFromTimespec(ts) + absl::Seconds(1); if (*after < *before) { // Clock jumped backwards; retry. @@ -68,11 +71,11 @@ void TimeBoxed(absl::Time* before, absl::Time* after, void TestUtimesOnPath(std::string const& path) { struct stat statbuf; - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimes(path.c_str(), times), SyscallSucceeds()); EXPECT_THAT(stat(path.c_str(), &statbuf), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); absl::Time before; absl::Time after; @@ -103,18 +106,18 @@ TEST(UtimesTest, OnDir) { TEST(UtimesTest, MissingPath) { auto path = NewTempAbsPath(); - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimes(path.c_str(), times), SyscallFailsWithErrno(ENOENT)); } void TestFutimesat(int dirFd, std::string const& path) { struct stat statbuf; - struct timeval times[2] = {{1, 0}, {2, 0}}; + struct timeval times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(futimesat(dirFd, path.c_str(), times), SyscallSucceeds()); EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); absl::Time before; absl::Time after; @@ -175,11 +178,11 @@ TEST(FutimesatTest, InvalidNsec) { void TestUtimensat(int dirFd, std::string const& path) { struct stat statbuf; - const struct timespec times[2] = {{1, 0}, {2, 0}}; + const struct timespec times[2] = {{10, 0}, {20, 0}}; EXPECT_THAT(utimensat(dirFd, path.c_str(), times, 0), SyscallSucceeds()); EXPECT_THAT(fstatat(dirFd, path.c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); // Test setting with UTIME_NOW and UTIME_OMIT. struct stat statbuf2; @@ -301,13 +304,13 @@ TEST(Utimensat, NullPath) { auto f = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(f.path(), O_RDWR)); struct stat statbuf; - const struct timespec times[2] = {{1, 0}, {2, 0}}; + const struct timespec times[2] = {{10, 0}, {20, 0}}; // Call syscall directly. EXPECT_THAT(syscall(SYS_utimensat, fd.get(), NULL, times, 0), SyscallSucceeds()); EXPECT_THAT(fstatat(0, f.path().c_str(), &statbuf, 0), SyscallSucceeds()); - EXPECT_EQ(1, statbuf.st_atime); - EXPECT_EQ(2, statbuf.st_mtime); + EXPECT_EQ(10, statbuf.st_atime); + EXPECT_EQ(20, statbuf.st_mtime); } } // namespace -- cgit v1.2.3 From 20e6efd302746554c485028f9fc1f2fbf88b234e Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 18 May 2020 14:24:47 -0700 Subject: Remove IfChange/ThenChange lint from VFS2 As new functionality is added to VFS2, corresponding files in VFS1 don't need to be changed. PiperOrigin-RevId: 312153799 --- pkg/sentry/fsimpl/devpts/line_discipline.go | 4 ---- pkg/sentry/fsimpl/devpts/master.go | 4 ---- pkg/sentry/fsimpl/devpts/queue.go | 4 ---- pkg/sentry/fsimpl/devpts/slave.go | 4 ---- pkg/sentry/fsimpl/devpts/terminal.go | 4 ---- 5 files changed, 20 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go index e201801d6..f7bc325d1 100644 --- a/pkg/sentry/fsimpl/devpts/line_discipline.go +++ b/pkg/sentry/fsimpl/devpts/line_discipline.go @@ -27,8 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - const ( // canonMaxBytes is the number of bytes that fit into a single line of // terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE @@ -445,5 +443,3 @@ func (l *lineDiscipline) peek(b []byte) int { } return size } - -// LINT.ThenChange(../../fs/tty/line_discipline.go) diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 04a292927..7a7ce5d81 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -27,8 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // masterInode is the inode for the master end of the Terminal. type masterInode struct { kernfs.InodeAttrs @@ -222,5 +220,3 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { unimpl.EmitUnimplementedEvent(ctx) } } - -// LINT.ThenChange(../../fs/tty/master.go) diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go index 29a6be858..dffb4232c 100644 --- a/pkg/sentry/fsimpl/devpts/queue.go +++ b/pkg/sentry/fsimpl/devpts/queue.go @@ -25,8 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 @@ -236,5 +234,3 @@ func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) } - -// LINT.ThenChange(../../fs/tty/queue.go) diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 0a98dc896..526cd406c 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -26,8 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) -// LINT.IfChange - // slaveInode is the inode for the slave end of the Terminal. type slaveInode struct { kernfs.InodeAttrs @@ -182,5 +180,3 @@ func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() return sfd.inode.Stat(fs, opts) } - -// LINT.ThenChange(../../fs/tty/slave.go) diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go index b44e673d8..7d2781c54 100644 --- a/pkg/sentry/fsimpl/devpts/terminal.go +++ b/pkg/sentry/fsimpl/devpts/terminal.go @@ -22,8 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) -// LINT.IfChanges - // Terminal is a pseudoterminal. // // +stateify savable @@ -120,5 +118,3 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY { } return tm.slaveKTTY } - -// LINT.ThenChange(../../fs/tty/terminal.go) -- cgit v1.2.3 From 05c89af6edde6158844d4debfe68bc598fec4418 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 19 May 2020 13:45:23 -0700 Subject: Implement mmap for host fs in vfs2. In VFS1, both fs/host and fs/gofer used the same utils for host file mappings. Refactor parts of fsimpl/gofer to create similar utils to share with fsimpl/host (memory accounting code moved to fsutil, page rounding arithmetic moved to usermem). Updates #1476. PiperOrigin-RevId: 312345090 --- pkg/sentry/fs/fsutil/frame_ref_set.go | 40 ++++++++++ pkg/sentry/fsimpl/gofer/BUILD | 1 - pkg/sentry/fsimpl/gofer/gofer.go | 4 +- pkg/sentry/fsimpl/gofer/pagemath.go | 31 -------- pkg/sentry/fsimpl/gofer/regular_file.go | 47 +++--------- pkg/sentry/fsimpl/host/BUILD | 4 + pkg/sentry/fsimpl/host/host.go | 70 ++++++++++++----- pkg/sentry/fsimpl/host/mmap.go | 132 ++++++++++++++++++++++++++++++++ pkg/usermem/addr.go | 17 ++++ 9 files changed, 254 insertions(+), 92 deletions(-) delete mode 100644 pkg/sentry/fsimpl/gofer/pagemath.go create mode 100644 pkg/sentry/fsimpl/host/mmap.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 6564fd0c6..dd6f5aba6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" ) // FrameRefSetFunctions implements segment.Functions for FrameRefSet. @@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } + +// IncRefAndAccount adds a reference on the range fr. All newly inserted segments +// are accounted as host page cache memory mappings. +func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { + seg, gap := refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + refs.MergeAdjacent(fr) + return + } + } +} + +// DecRefAndAccount removes a reference on the range fr and untracks segments +// that are removed from memory accounting. +func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { + seg := refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 5ce82b793..67e916525 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -36,7 +36,6 @@ go_library( "gofer.go", "handle.go", "p9file.go", - "pagemath.go", "regular_file.go", "socket.go", "special_file.go", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index ebf063a58..6295f6b54 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -928,8 +928,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // so we can't race with Write or another truncate.) d.dataMu.Unlock() if d.size < oldSize { - oldpgend := pageRoundUp(oldSize) - newpgend := pageRoundUp(d.size) + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ diff --git a/pkg/sentry/fsimpl/gofer/pagemath.go b/pkg/sentry/fsimpl/gofer/pagemath.go deleted file mode 100644 index 847cb0784..000000000 --- a/pkg/sentry/fsimpl/gofer/pagemath.go +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package gofer - -import ( - "gvisor.dev/gvisor/pkg/usermem" -) - -// This are equivalent to usermem.Addr.RoundDown/Up, but without the -// potentially truncating conversion to usermem.Addr. This is necessary because -// there is no way to define generic "PageRoundDown/Up" functions in Go. - -func pageRoundDown(x uint64) uint64 { - return x &^ (usermem.PageSize - 1) -} - -func pageRoundUp(x uint64) uint64 { - return pageRoundDown(x + usermem.PageSize - 1) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 857f7c74e..0d10cf7ac 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } // Remove touched pages from the cache. - pgstart := pageRoundDown(uint64(offset)) - pgend := pageRoundUp(uint64(offset + src.NumBytes())) - if pgend < pgstart { + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} @@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. + gapEnd, _ := usermem.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ - Start: pageRoundDown(gapMR.Start), - End: pageRoundUp(gapMR.End), + Start: usermem.PageRoundDown(gapMR.Start), + End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) @@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. - pgend := pageRoundUp(d.size) + pgend, _ := usermem.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { @@ -818,43 +819,15 @@ type dentryPlatformFile struct { // IncRef implements platform.File.IncRef. func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { d.dataMu.Lock() - seg, gap := d.fdRefs.Find(fr.Start) - for { - switch { - case seg.Ok() && seg.Start() < fr.End: - seg = d.fdRefs.Isolate(seg, fr) - seg.SetValue(seg.Value() + 1) - seg, gap = seg.NextNonEmpty() - case gap.Ok() && gap.Start() < fr.End: - newRange := gap.Range().Intersect(fr) - usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) - seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() - default: - d.fdRefs.MergeAdjacent(fr) - d.dataMu.Unlock() - return - } - } + d.fdRefs.IncRefAndAccount(fr) + d.dataMu.Unlock() } // DecRef implements platform.File.DecRef. func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { d.dataMu.Lock() - seg := d.fdRefs.FindSegment(fr.Start) - - for seg.Ok() && seg.Start() < fr.End { - seg = d.fdRefs.Isolate(seg, fr) - if old := seg.Value(); old == 1 { - usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) - seg = d.fdRefs.Remove(seg).NextSegment() - } else { - seg.SetValue(old - 1) - seg = seg.NextSegment() - } - } - d.fdRefs.MergeAdjacent(fr) + d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() - } // MapInternal implements platform.File.MapInternal. diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 39509f703..ca0fe6d2b 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -8,6 +8,7 @@ go_library( "control.go", "host.go", "ioctl_unsafe.go", + "mmap.go", "socket.go", "socket_iovec.go", "socket_unsafe.go", @@ -23,12 +24,15 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs/fsutil", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", + "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 8caf55a1b..65981197d 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -86,15 +86,16 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) i := &inode{ hostFD: hostFD, - seekable: seekable, + ino: fs.NextIno(), isTTY: opts.IsTTY, - canMap: canMap(uint32(fileType)), wouldBlock: wouldBlock(uint32(fileType)), - ino: fs.NextIno(), + seekable: seekable, // For simplicity, set offset to 0. Technically, we should use the existing // offset on the host if the file is seekable. offset: 0, + canMap: canMap(uint32(fileType)), } + i.pf.inode = i // Non-seekable files can't be memory mapped, assert this. if !i.seekable && i.canMap { @@ -189,11 +190,15 @@ type inode struct { // This field is initialized at creation time and is immutable. hostFD int - // wouldBlock is true if the host FD would return EWOULDBLOCK for - // operations that would block. + // ino is an inode number unique within this filesystem. // // This field is initialized at creation time and is immutable. - wouldBlock bool + ino uint64 + + // isTTY is true if this file represents a TTY. + // + // This field is initialized at creation time and is immutable. + isTTY bool // seekable is false if the host fd points to a file representing a stream, // e.g. a socket or a pipe. Such files are not seekable and can return @@ -202,29 +207,36 @@ type inode struct { // This field is initialized at creation time and is immutable. seekable bool - // isTTY is true if this file represents a TTY. + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // seekable is true. + offset int64 + + // wouldBlock is true if the host FD would return EWOULDBLOCK for + // operations that would block. // // This field is initialized at creation time and is immutable. - isTTY bool + wouldBlock bool + + // Event queue for blocking operations. + queue waiter.Queue // canMap specifies whether we allow the file to be memory mapped. // // This field is initialized at creation time and is immutable. canMap bool - // ino is an inode number unique within this filesystem. - // - // This field is initialized at creation time and is immutable. - ino uint64 - - // offsetMu protects offset. - offsetMu sync.Mutex + // mapsMu protects mappings. + mapsMu sync.Mutex - // offset specifies the current file offset. - offset int64 + // If canMap is true, mappings tracks mappings of hostFD into + // memmap.MappingSpaces. + mappings memmap.MappingSet - // Event queue for blocking operations. - queue waiter.Queue + // pf implements platform.File for mappings of hostFD. + pf inodePlatformFile } // CheckPermissions implements kernfs.Inode. @@ -388,6 +400,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } + oldSize := uint64(hostStat.Size) + if s.Size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(s.Size) + if oldpgend != newpgend { + i.mapsMu.Lock() + i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + i.mapsMu.Unlock() + } + } } if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 { ts := [2]syscall.Timespec{ @@ -666,8 +693,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts if !f.inode.canMap { return syserror.ENODEV } - // TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface. - return syserror.ENODEV + i := f.inode + i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init) + return vfs.GenericConfigureMMap(&f.vfsfd, i, opts) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go new file mode 100644 index 000000000..8545a82f0 --- /dev/null +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -0,0 +1,132 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +// inodePlatformFile implements platform.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// +// inodePlatformFile should only be used if inode.canMap is true. +type inodePlatformFile struct { + *inode + + // fdRefsMu protects fdRefs. + fdRefsMu sync.Mutex + + // fdRefs counts references on platform.File offsets. It is used solely for + // memory accounting. + fdRefs fsutil.FrameRefSet + + // fileMapper caches mappings of the host file represented by this inode. + fileMapper fsutil.HostFileMapper + + // fileMapperInitOnce is used to lazily initialize fileMapper. + fileMapperInitOnce sync.Once +} + +// IncRef implements platform.File.IncRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) IncRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.IncRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// DecRef implements platform.File.DecRef. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) DecRef(fr platform.FileRange) { + i.fdRefsMu.Lock() + i.fdRefs.DecRefAndAccount(fr) + i.fdRefsMu.Unlock() +} + +// MapInternal implements platform.File.MapInternal. +// +// Precondition: i.inode.canMap must be true. +func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) +} + +// FD implements platform.File.FD. +func (i *inodePlatformFile) FD() int { + return i.hostFD +} + +// AddMapping implements memmap.Mappable.AddMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error { + i.mapsMu.Lock() + mapped := i.mappings.AddMapping(ms, ar, offset, writable) + for _, r := range mapped { + i.pf.fileMapper.IncRefOn(r) + } + i.mapsMu.Unlock() + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) { + i.mapsMu.Lock() + unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable) + for _, r := range unmapped { + i.pf.fileMapper.DecRefOn(r) + } + i.mapsMu.Unlock() +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error { + return i.AddMapping(ctx, ms, dstAR, offset, writable) +} + +// Translate implements memmap.Mappable.Translate. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + mr := optional + return []memmap.Translation{ + { + Source: mr, + File: &i.pf, + Offset: mr.Start, + Perms: usermem.AnyAccess, + }, + }, nil +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +// +// Precondition: i.inode.canMap must be true. +func (i *inode) InvalidateUnsavable(ctx context.Context) error { + // We expect the same host fd across save/restore, so all translations + // should be valid. + return nil +} diff --git a/pkg/usermem/addr.go b/pkg/usermem/addr.go index e79210804..c4100481e 100644 --- a/pkg/usermem/addr.go +++ b/pkg/usermem/addr.go @@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool { func (ar AddrRange) String() string { return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) } + +// PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the +// potentially truncating conversion from uint64 to Addr. This is necessary +// because there is no way to define generic "PageRoundDown/Up" functions in Go. + +// PageRoundDown returns x rounded down to the nearest page boundary. +func PageRoundDown(x uint64) uint64 { + return x &^ (PageSize - 1) +} + +// PageRoundUp returns x rounded up to the nearest page boundary. +// ok is true iff rounding up did not wrap around. +func PageRoundUp(x uint64) (addr uint64, ok bool) { + addr = PageRoundDown(x + PageSize - 1) + ok = addr >= x + return +} -- cgit v1.2.3 From 76369b6480b69bb7bf680db36ce8787fb324911c Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 20 May 2020 14:49:40 -0700 Subject: Move fsimpl/host file offset from inode to fileDescription. PiperOrigin-RevId: 312559861 --- pkg/sentry/fsimpl/host/host.go | 78 ++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 40 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 65981197d..18b127521 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -90,10 +90,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) isTTY: opts.IsTTY, wouldBlock: wouldBlock(uint32(fileType)), seekable: seekable, - // For simplicity, set offset to 0. Technically, we should use the existing - // offset on the host if the file is seekable. - offset: 0, - canMap: canMap(uint32(fileType)), + canMap: canMap(uint32(fileType)), } i.pf.inode = i @@ -118,6 +115,10 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) // i.open will take a reference on d. defer d.DecRef() + + // For simplicity, fileDescription.offset is set to 0. Technically, we + // should only set to 0 on files that are not seekable (sockets, pipes, + // etc.), and use the offset from the host fd otherwise when importing. return i.open(ctx, d.VFSDentry(), mnt, flags) } @@ -207,13 +208,6 @@ type inode struct { // This field is initialized at creation time and is immutable. seekable bool - // offsetMu protects offset. - offsetMu sync.Mutex - - // offset specifies the current file offset. It is only meaningful when - // seekable is true. - offset int64 - // wouldBlock is true if the host FD would return EWOULDBLOCK for // operations that would block. // @@ -491,9 +485,6 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return vfsfd, nil } - // For simplicity, set offset to 0. Technically, we should - // only set to 0 on files that are not seekable (sockets, pipes, etc.), - // and use the offset from the host fd otherwise. fd := &fileDescription{inode: i} vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -514,6 +505,13 @@ type fileDescription struct { // // inode is immutable after fileDescription creation. inode *inode + + // offsetMu protects offset. + offsetMu sync.Mutex + + // offset specifies the current file offset. It is only meaningful when + // inode.seekable is true. + offset int64 } // SetStat implements vfs.FileDescriptionImpl. @@ -559,10 +557,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts return n, err } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - i.offsetMu.Lock() - n, err := readFromHostFD(ctx, i.hostFD, dst, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -599,10 +597,10 @@ func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opt } // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. - i.offsetMu.Lock() - n, err := writeToHostFD(ctx, i.hostFD, src, i.offset, opts.Flags) - i.offset += n - i.offsetMu.Unlock() + f.offsetMu.Lock() + n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags) + f.offset += n + f.offsetMu.Unlock() return n, err } @@ -627,41 +625,41 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i return 0, syserror.ESPIPE } - i.offsetMu.Lock() - defer i.offsetMu.Unlock() + f.offsetMu.Lock() + defer f.offsetMu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = offset + f.offset = offset case linux.SEEK_CUR: - // Check for overflow. Note that underflow cannot occur, since i.offset >= 0. - if offset > math.MaxInt64-i.offset { - return i.offset, syserror.EOVERFLOW + // Check for overflow. Note that underflow cannot occur, since f.offset >= 0. + if offset > math.MaxInt64-f.offset { + return f.offset, syserror.EOVERFLOW } - if i.offset+offset < 0 { - return i.offset, syserror.EINVAL + if f.offset+offset < 0 { + return f.offset, syserror.EINVAL } - i.offset += offset + f.offset += offset case linux.SEEK_END: var s syscall.Stat_t if err := syscall.Fstat(i.hostFD, &s); err != nil { - return i.offset, err + return f.offset, err } size := s.Size // Check for overflow. Note that underflow cannot occur, since size >= 0. if offset > math.MaxInt64-size { - return i.offset, syserror.EOVERFLOW + return f.offset, syserror.EOVERFLOW } if size+offset < 0 { - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - i.offset = size + offset + f.offset = size + offset case linux.SEEK_DATA, linux.SEEK_HOLE: // Modifying the offset in the host file table should not matter, since @@ -670,16 +668,16 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // For reading and writing, we always rely on our internal offset. n, err := unix.Seek(i.hostFD, offset, int(whence)) if err != nil { - return i.offset, err + return f.offset, err } - i.offset = n + f.offset = n default: // Invalid whence. - return i.offset, syserror.EINVAL + return f.offset, syserror.EINVAL } - return i.offset, nil + return f.offset, nil } // Sync implements FileDescriptionImpl. -- cgit v1.2.3 From af3121a52383fb60579d769994be5d91bd788015 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 May 2020 21:42:07 -0700 Subject: Implement splice(2) and tee(2) for VFS2. Updates #138 PiperOrigin-RevId: 313326354 --- pkg/buffer/safemem.go | 82 ++++----- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/kernel/pipe/BUILD | 2 + pkg/sentry/kernel/pipe/pipe.go | 6 + pkg/sentry/kernel/pipe/pipe_unsafe.go | 35 ++++ pkg/sentry/kernel/pipe/vfs.go | 219 ++++++++++++++++++++++- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/splice.go | 286 +++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/file_description.go | 5 + test/syscalls/linux/splice.cc | 49 ++++++ 11 files changed, 648 insertions(+), 43 deletions(-) create mode 100644 pkg/sentry/kernel/pipe/pipe_unsafe.go create mode 100644 pkg/sentry/syscalls/linux/vfs2/splice.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/buffer/safemem.go b/pkg/buffer/safemem.go index 0e5b86344..b789e56e9 100644 --- a/pkg/buffer/safemem.go +++ b/pkg/buffer/safemem.go @@ -28,12 +28,11 @@ func (b *buffer) ReadBlock() safemem.Block { return safemem.BlockFromSafeSlice(b.ReadSlice()) } -// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. -// -// This will advance the write index. -func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { - need := int(srcs.NumBytes()) - if need == 0 { +// WriteFromSafememReader writes up to count bytes from r to v and advances the +// write index by the number of bytes written. It calls r.ReadToBlocks() at +// most once. +func (v *View) WriteFromSafememReader(r safemem.Reader, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -50,32 +49,33 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { } // Does the last block have sufficient capacity alone? - if l := firstBuf.WriteSize(); l >= need { - dst = safemem.BlockSeqOf(firstBuf.WriteBlock()) + if l := uint64(firstBuf.WriteSize()); l >= count { + dst = safemem.BlockSeqOf(firstBuf.WriteBlock().TakeFirst64(count)) } else { // Append blocks until sufficient. - need -= l + count -= l blocks = append(blocks, firstBuf.WriteBlock()) - for need > 0 { + for count > 0 { emptyBuf := bufferPool.Get().(*buffer) v.data.PushBack(emptyBuf) - need -= emptyBuf.WriteSize() - blocks = append(blocks, emptyBuf.WriteBlock()) + block := emptyBuf.WriteBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } dst = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dst, srcs) + // Perform I/O. + n, err := r.ReadToBlocks(dst) v.size += int64(n) // Update all indices. - for left := int(n); left > 0; firstBuf = firstBuf.Next() { - if l := firstBuf.WriteSize(); left >= l { + for left := n; left > 0; firstBuf = firstBuf.Next() { + if l := firstBuf.WriteSize(); left >= uint64(l) { firstBuf.WriteMove(l) // Whole block. - left -= l + left -= uint64(l) } else { - firstBuf.WriteMove(left) // Partial block. + firstBuf.WriteMove(int(left)) // Partial block. left = 0 } } @@ -83,14 +83,16 @@ func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { return n, err } -// ReadToBlocks implements safemem.Reader.ReadToBlocks. -// -// This will not advance the read index; the caller should follow -// this call with a call to TrimFront in order to remove the read -// data from the buffer. This is done to support pipe sematics. -func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { - need := int(dsts.NumBytes()) - if need == 0 { +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It advances the +// write index by the number of bytes written. +func (v *View) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + return v.WriteFromSafememReader(&safemem.BlockSeqReader{srcs}, srcs.NumBytes()) +} + +// ReadToSafememWriter reads up to count bytes from v to w. It does not advance +// the read index. It calls w.WriteFromBlocks() at most once. +func (v *View) ReadToSafememWriter(w safemem.Writer, count uint64) (uint64, error) { + if count == 0 { return 0, nil } @@ -105,25 +107,27 @@ func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { } // Is all the data in a single block? - if l := firstBuf.ReadSize(); l >= need { - src = safemem.BlockSeqOf(firstBuf.ReadBlock()) + if l := uint64(firstBuf.ReadSize()); l >= count { + src = safemem.BlockSeqOf(firstBuf.ReadBlock().TakeFirst64(count)) } else { // Build a list of all the buffers. - need -= l + count -= l blocks = append(blocks, firstBuf.ReadBlock()) - for buf := firstBuf.Next(); buf != nil && need > 0; buf = buf.Next() { - need -= buf.ReadSize() - blocks = append(blocks, buf.ReadBlock()) + for buf := firstBuf.Next(); buf != nil && count > 0; buf = buf.Next() { + block := buf.ReadBlock().TakeFirst64(count) + count -= uint64(block.Len()) + blocks = append(blocks, block) } src = safemem.BlockSeqFromSlice(blocks) } - // Perform the copy. - n, err := safemem.CopySeq(dsts, src) - - // See above: we would normally advance the read index here, but we - // don't do that in order to support pipe semantics. We rely on a - // separate call to TrimFront() in this case. + // Perform I/O. As documented, we don't advance the read index. + return w.WriteFromBlocks(src) +} - return n, err +// ReadToBlocks implements safemem.Reader.ReadToBlocks. It does not advance the +// read index by the number of bytes read, such that it's only safe to call if +// the caller guarantees that ReadToBlocks will only be called once. +func (v *View) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + return v.ReadToSafememWriter(&safemem.BlockSeqWriter{dsts}, dsts.NumBytes()) } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 3f433d666..fee174375 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -312,7 +312,7 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off f := fd.inode().impl.(*regularFile) if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EFBIG + return 0, syserror.EINVAL } var err error diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index f29dc0472..7bfa9075a 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -8,6 +8,7 @@ go_library( "device.go", "node.go", "pipe.go", + "pipe_unsafe.go", "pipe_util.go", "reader.go", "reader_writer.go", @@ -20,6 +21,7 @@ go_library( "//pkg/amutex", "//pkg/buffer", "//pkg/context", + "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/device", "//pkg/sentry/fs", diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 62c8691f1..79645d7d2 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -207,7 +207,10 @@ func (p *Pipe) read(ctx context.Context, ops readOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.readLocked(ctx, ops) +} +func (p *Pipe) readLocked(ctx context.Context, ops readOps) (int64, error) { // Is the pipe empty? if p.view.Size() == 0 { if !p.HasWriters() { @@ -246,7 +249,10 @@ type writeOps struct { func (p *Pipe) write(ctx context.Context, ops writeOps) (int64, error) { p.mu.Lock() defer p.mu.Unlock() + return p.writeLocked(ctx, ops) +} +func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) { // Can't write to a pipe with no readers. if !p.HasReaders() { return 0, syscall.EPIPE diff --git a/pkg/sentry/kernel/pipe/pipe_unsafe.go b/pkg/sentry/kernel/pipe/pipe_unsafe.go new file mode 100644 index 000000000..dd60cba24 --- /dev/null +++ b/pkg/sentry/kernel/pipe/pipe_unsafe.go @@ -0,0 +1,35 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pipe + +import ( + "unsafe" +) + +// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be +// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that +// concurrent calls cannot deadlock. +// +// Preconditions: x != y. +func lockTwoPipes(x, y *Pipe) { + // Lock the two pipes in order of increasing address. + if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) { + x.mu.Lock() + y.mu.Lock() + } else { + y.mu.Lock() + x.mu.Lock() + } +} diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index b54f08a30..2602bed72 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -16,7 +16,9 @@ package pipe import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/buffer" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +152,9 @@ func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) * return &fd.vfsfd } -// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. +// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements +// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to +// other FileDescriptions for splice(2) and tee(2). type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl @@ -229,3 +233,216 @@ func (fd *VFSPipeFD) PipeSize() int64 { func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) { return fd.pipe.SetFifoSize(size) } + +// IOSequence returns a useremm.IOSequence that reads up to count bytes from, +// or writes up to count bytes to, fd. +func (fd *VFSPipeFD) IOSequence(count int64) usermem.IOSequence { + return usermem.IOSequence{ + IO: fd, + Addrs: usermem.AddrRangeSeqOf(usermem.AddrRange{0, usermem.Addr(count)}), + } +} + +// CopyIn implements usermem.IO.CopyIn. +func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr usermem.Addr, dst []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(dst)) + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return int64(len(dst)) + }, + limit: func(l int64) { + dst = dst[:l] + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadAt(dst, 0) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// CopyOut implements usermem.IO.CopyOut. +func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr usermem.Addr, src []byte, opts usermem.IOOpts) (int, error) { + origCount := int64(len(src)) + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return int64(len(src)) + }, + limit: func(l int64) { + src = src[:l] + }, + write: func(view *buffer.View) (int64, error) { + view.Append(src) + return int64(len(src)), nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return int(n), syserror.ErrWouldBlock + } + return int(n), err +} + +// ZeroOut implements usermem.IO.ZeroOut. +func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr usermem.Addr, toZero int64, opts usermem.IOOpts) (int64, error) { + origCount := toZero + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return toZero + }, + limit: func(l int64) { + toZero = l + }, + write: func(view *buffer.View) (int64, error) { + view.Grow(view.Size()+toZero, true /* zero */) + return toZero, nil + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyInTo implements usermem.IO.CopyInTo. +func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars usermem.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.read(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(view *buffer.View) (int64, error) { + n, err := view.ReadToSafememWriter(dst, uint64(count)) + view.TrimFront(int64(n)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventOut) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// CopyOutFrom implements usermem.IO.CopyOutFrom. +func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars usermem.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) { + count := ars.NumBytes() + if count == 0 { + return 0, nil + } + origCount := count + n, err := fd.pipe.write(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(view *buffer.View) (int64, error) { + n, err := view.WriteFromSafememReader(src, uint64(count)) + return int64(n), err + }, + }) + if n > 0 { + fd.pipe.Notify(waiter.EventIn) + } + if err == nil && n != origCount { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// SwapUint32 implements usermem.IO.SwapUint32. +func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr usermem.Addr, new uint32, opts usermem.IOOpts) (uint32, error) { + // How did a pipe get passed as the virtual address space to futex(2)? + panic("VFSPipeFD.SwapUint32 called unexpectedly") +} + +// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32. +func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr usermem.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly") +} + +// LoadUint32 implements usermem.IO.LoadUint32. +func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr usermem.Addr, opts usermem.IOOpts) (uint32, error) { + panic("VFSPipeFD.LoadUint32 called unexpectedly") +} + +// Splice reads up to count bytes from src and writes them to dst. It returns +// the number of bytes moved. +// +// Preconditions: count > 0. +func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */) +} + +// Tee reads up to count bytes from src and writes them to dst, without +// removing the read bytes from src. It returns the number of bytes copied. +// +// Preconditions: count > 0. +func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) { + return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */) +} + +// Preconditions: count > 0. +func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) { + if dst.pipe == src.pipe { + return 0, syserror.EINVAL + } + + lockTwoPipes(dst.pipe, src.pipe) + defer dst.pipe.mu.Unlock() + defer src.pipe.mu.Unlock() + + n, err := dst.pipe.writeLocked(ctx, writeOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + write: func(dstView *buffer.View) (int64, error) { + return src.pipe.readLocked(ctx, readOps{ + left: func() int64 { + return count + }, + limit: func(l int64) { + count = l + }, + read: func(srcView *buffer.View) (int64, error) { + n, err := srcView.ReadToSafememWriter(dstView, uint64(count)) + if n > 0 && removeFromSrc { + srcView.TrimFront(int64(n)) + } + return int64(n), err + }, + }) + }, + }) + if n > 0 { + dst.pipe.Notify(waiter.EventIn) + src.pipe.Notify(waiter.EventOut) + } + return n, err +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index f882ef840..d56927ff5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -22,6 +22,7 @@ go_library( "setstat.go", "signal.go", "socket.go", + "splice.go", "stat.go", "stat_amd64.go", "stat_arm64.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go new file mode 100644 index 000000000..8f3c22a02 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -0,0 +1,286 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// Splice implements Linux syscall splice(2). +func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + inOffsetPtr := args[1].Pointer() + outFD := args[2].Int() + outOffsetPtr := args[3].Pointer() + count := int64(args[4].SizeT()) + flags := args[5].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // At least one file description must represent a pipe. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe && !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy in offsets. + inOffset := int64(-1) + if inOffsetPtr != 0 { + if inIsPipe { + return 0, nil, syserror.ESPIPE + } + if inFile.Options().DenyPRead { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + if inOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + outOffset := int64(-1) + if outOffsetPtr != 0 { + if outIsPipe { + return 0, nil, syserror.ESPIPE + } + if outFile.Options().DenyPWrite { + return 0, nil, syserror.EINVAL + } + if _, err := t.CopyIn(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + if outOffset < 0 { + return 0, nil, syserror.EINVAL + } + } + + // Move data. + var ( + n int64 + err error + inCh chan struct{} + outCh chan struct{} + ) + for { + // If both input and output are pipes, delegate to the pipe + // implementation. Otherwise, exactly one end is a pipe, which we + // ensure is consistently ordered after the non-pipe FD's locks by + // passing the pipe FD as usermem.IO to the non-pipe end. + switch { + case inIsPipe && outIsPipe: + n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) + case inIsPipe: + if outOffset != -1 { + n, err = outFile.PWrite(t, inPipeFD.IOSequence(count), outOffset, vfs.WriteOptions{}) + outOffset += n + } else { + n, err = outFile.Write(t, inPipeFD.IOSequence(count), vfs.WriteOptions{}) + } + case outIsPipe: + if inOffset != -1 { + n, err = inFile.PRead(t, outPipeFD.IOSequence(count), inOffset, vfs.ReadOptions{}) + inOffset += n + } else { + n, err = inFile.Read(t, outPipeFD.IOSequence(count), vfs.ReadOptions{}) + } + } + if n != 0 || err != syserror.ErrWouldBlock || nonBlock { + break + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the splice operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err = t.Block(inCh); err != nil { + break + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err = t.Block(outCh); err != nil { + break + } + } + } + + // Copy updated offsets out. + if inOffsetPtr != 0 { + if _, err := t.CopyOut(inOffsetPtr, &inOffset); err != nil { + return 0, nil, err + } + } + if outOffsetPtr != 0 { + if _, err := t.CopyOut(outOffsetPtr, &outOffset); err != nil { + return 0, nil, err + } + } + + if n == 0 { + return 0, nil, err + } + return uintptr(n), nil, nil +} + +// Tee implements Linux syscall tee(2). +func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + inFD := args[0].Int() + outFD := args[1].Int() + count := int64(args[2].SizeT()) + flags := args[3].Int() + + if count == 0 { + return 0, nil, nil + } + if count > int64(kernel.MAX_RW_COUNT) { + count = int64(kernel.MAX_RW_COUNT) + } + + // Check for invalid flags. + if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { + return 0, nil, syserror.EINVAL + } + + // Get file descriptions. + inFile := t.GetFileVFS2(inFD) + if inFile == nil { + return 0, nil, syserror.EBADF + } + defer inFile.DecRef() + outFile := t.GetFileVFS2(outFD) + if outFile == nil { + return 0, nil, syserror.EBADF + } + defer outFile.DecRef() + + // Check that both files support the required directionality. + if !inFile.IsReadable() || !outFile.IsWritable() { + return 0, nil, syserror.EBADF + } + + // The operation is non-blocking if anything is non-blocking. + // + // N.B. This is a rather simplistic heuristic that avoids some + // poor edge case behavior since the exact semantics here are + // underspecified and vary between versions of Linux itself. + nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) + + // Both file descriptions must represent pipes. + inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) + outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) + if !inIsPipe || !outIsPipe { + return 0, nil, syserror.EINVAL + } + + // Copy data. + var ( + inCh chan struct{} + outCh chan struct{} + ) + for { + n, err := pipe.Tee(t, outPipeFD, inPipeFD, count) + if n != 0 { + return uintptr(n), nil, nil + } + if err != syserror.ErrWouldBlock || nonBlock { + return 0, nil, err + } + + // Note that the blocking behavior here is a bit different than the + // normal pattern. Because we need to have both data to read and data + // to write simultaneously, we actually explicitly block on both of + // these cases in turn before returning to the tee operation. + if inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { + if inCh == nil { + inCh = make(chan struct{}, 1) + inW, _ := waiter.NewChannelEntry(inCh) + inFile.EventRegister(&inW, eventMaskRead) + defer inFile.EventUnregister(&inW) + continue // Need to refresh readiness. + } + if err := t.Block(inCh); err != nil { + return 0, nil, err + } + } + if outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 { + if outCh == nil { + outCh = make(chan struct{}, 1) + outW, _ := waiter.NewChannelEntry(outCh) + outFile.EventRegister(&outW, eventMaskWrite) + defer outFile.EventUnregister(&outW) + continue // Need to refresh readiness. + } + if err := t.Block(outCh); err != nil { + return 0, nil, err + } + } + } +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index a332d01bd..083fdcf82 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -134,8 +134,8 @@ func Override() { s.Table[269] = syscalls.Supported("faccessat", Faccessat) s.Table[270] = syscalls.Supported("pselect", Pselect) s.Table[271] = syscalls.Supported("ppoll", Ppoll) - delete(s.Table, 275) // splice - delete(s.Table, 276) // tee + s.Table[275] = syscalls.Supported("splice", Splice) + s.Table[276] = syscalls.Supported("tee", Tee) s.Table[277] = syscalls.Supported("sync_file_range", SyncFileRange) s.Table[280] = syscalls.Supported("utimensat", Utimensat) s.Table[281] = syscalls.Supported("epoll_pwait", EpollPwait) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cfabd936c..bb294563d 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -210,6 +210,11 @@ func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } +// Options returns the options passed to fd.Init(). +func (fd *FileDescription) Options() FileDescriptionOptions { + return fd.opts +} + // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return atomic.LoadUint32(&fd.statusFlags) diff --git a/test/syscalls/linux/splice.cc b/test/syscalls/linux/splice.cc index f103e2e56..08fc4b1b7 100644 --- a/test/syscalls/linux/splice.cc +++ b/test/syscalls/linux/splice.cc @@ -430,6 +430,55 @@ TEST(SpliceTest, TwoPipes) { EXPECT_EQ(memcmp(rbuf.data(), buf.data(), kPageSize), 0); } +TEST(SpliceTest, TwoPipesCircular) { + // This test deadlocks the sentry on VFS1 because VFS1 splice ordering is + // based on fs.File.UniqueID, which does not prevent circular ordering between + // e.g. inode-level locks taken by fs.FileOperations. + SKIP_IF(IsRunningWithVFS1()); + + // Create two pipes. + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor first_rfd(fds[0]); + const FileDescriptor first_wfd(fds[1]); + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + const FileDescriptor second_rfd(fds[0]); + const FileDescriptor second_wfd(fds[1]); + + // On Linux, each pipe is normally limited to + // include/linux/pipe_fs_i.h:PIPE_DEF_BUFFERS buffers worth of data. + constexpr size_t PIPE_DEF_BUFFERS = 16; + + // Write some data to each pipe. Below we splice 1 byte at a time between + // pipes, which very quickly causes each byte to be stored in a separate + // buffer, so we must ensure that the total amount of data in the system is <= + // PIPE_DEF_BUFFERS bytes. + std::vector buf(PIPE_DEF_BUFFERS / 2); + RandomizeBuffer(buf.data(), buf.size()); + ASSERT_THAT(write(first_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + ASSERT_THAT(write(second_wfd.get(), buf.data(), buf.size()), + SyscallSucceedsWithValue(buf.size())); + + // Have another thread splice from the second pipe to the first, while we + // splice from the first to the second. The test passes if this does not + // deadlock. + const int kIterations = 1000; + DisableSave ds; + ScopedThread t([&]() { + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(second_rfd.get(), nullptr, first_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } + }); + for (int i = 0; i < kIterations; i++) { + ASSERT_THAT( + splice(first_rfd.get(), nullptr, second_wfd.get(), nullptr, 1, 0), + SyscallSucceedsWithValue(1)); + } +} + TEST(SpliceTest, Blocking) { // Create two new pipes. int first[2], second[2]; -- cgit v1.2.3 From e028714a0dd390b2321c4beeac62c5b2904cd917 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 26 May 2020 22:42:54 -0700 Subject: Support dfltuid and dfltgid mount options in the VFS2 gofer client. PiperOrigin-RevId: 313332542 --- pkg/sentry/fsimpl/gofer/gofer.go | 48 +++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 6295f6b54..131da332f 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -84,12 +84,6 @@ type filesystem struct { // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 - // uid and gid are the effective KUID and KGID of the filesystem's creator, - // and are used as the owner and group for files that don't specify one. - // uid and gid are immutable. - uid auth.KUID - gid auth.KGID - // renameMu serves two purposes: // // - It synchronizes path resolution with renaming initiated by this @@ -122,6 +116,8 @@ type filesystemOptions struct { fd int aname string interop InteropMode // derived from the "cache" mount option + dfltuid auth.KUID + dfltgid auth.KGID msize uint32 version string @@ -230,6 +226,15 @@ type InternalFilesystemOptions struct { OpenSocketsByConnecting bool } +// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default +// UIDs and GIDs used for files that do not provide a specific owner or group +// respectively. +const ( + // uint32(-2) doesn't work in Go. + _V9FS_DEFUID = auth.KUID(4294967294) + _V9FS_DEFGID = auth.KGID(4294967294) +) + // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name @@ -315,6 +320,31 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } + // Parse the default UID and GID. + fsopts.dfltuid = _V9FS_DEFUID + if dfltuidstr, ok := mopts["dfltuid"]; ok { + delete(mopts, "dfltuid") + dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr) + return nil, nil, syserror.EINVAL + } + // In Linux, dfltuid is interpreted as a UID and is converted to a KUID + // in the caller's user namespace, but goferfs isn't + // application-mountable. + fsopts.dfltuid = auth.KUID(dfltuid) + } + fsopts.dfltgid = _V9FS_DEFGID + if dfltgidstr, ok := mopts["dfltgid"]; ok { + delete(mopts, "dfltgid") + dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32) + if err != nil { + ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr) + return nil, nil, syserror.EINVAL + } + fsopts.dfltgid = auth.KGID(dfltgid) + } + // Parse the 9P message size. fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M if msizestr, ok := mopts["msize"]; ok { @@ -422,8 +452,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt client: client, clock: ktime.RealtimeClockFromContext(ctx), devMinor: devMinor, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), } @@ -672,8 +700,8 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma file: file, ino: qid.Path, mode: uint32(attr.Mode), - uid: uint32(fs.uid), - gid: uint32(fs.gid), + uid: uint32(fs.opts.dfltuid), + gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, handle: handle{ fd: -1, -- cgit v1.2.3 From 32021bce9607e09f3d21d3d28047d49340da231e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 27 May 2020 18:18:17 -0700 Subject: Correctly update link and ref counts in rmdir. Inotify sends events when a watch target is reaches a link count of 0 (see include/linux/fsnotify.h:fsnotify_inoderemove). Currently, we do not account for both dir/ and dir/.. in unlink, causing syscalls/linux/inotify.cc:WatchTargetDeletionGeneratesEvent to fail because the expected inotify events are not generated. Furthermore, we should DecRef() once the inode reaches zero links; otherwise, we will leak a reference. PiperOrigin-RevId: 313502091 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 4 +++- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 15 ++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 80fa7b29d..7c04570f1 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -603,8 +603,10 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) - parentDir.inode.decLinksLocked() // from child's ".." + // Remove links for child, child/., and child/.. child.inode.decLinksLocked() + child.inode.decLinksLocked() + parentDir.inode.decLinksLocked() vfsObj.CommitDeleteDentry(&child.vfsd) parentDir.inode.touchCMtime() return nil diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 1e781aecd..b739095b7 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -209,11 +209,9 @@ type inode struct { // refs is a reference count. refs is accessed using atomic memory // operations. // - // A reference is held on all inodes that are reachable in the filesystem - // tree. For non-directories (which may have multiple hard links), this - // means that a reference is dropped when nlink reaches 0. For directories, - // nlink never reaches 0 due to the "." entry; instead, - // filesystem.RmdirAt() drops the reference. + // A reference is held on all inodes as long as they are reachable in the + // filesystem tree, i.e. nlink is nonzero. This reference is dropped when + // nlink reaches 0. refs int64 // xattrs implements extended attributes. @@ -276,14 +274,17 @@ func (i *inode) incLinksLocked() { atomic.AddUint32(&i.nlink, 1) } -// decLinksLocked decrements i's link count. +// decLinksLocked decrements i's link count. If the link count reaches 0, we +// remove a reference on i as well. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } - atomic.AddUint32(&i.nlink, ^uint32(0)) + if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { + i.decRef() + } } func (i *inode) incRef() { -- cgit v1.2.3 From fe464f44b7d3696bafd9a2faf3750e1dc4d56d80 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 29 May 2020 08:07:44 -0700 Subject: Port inotify to vfs2, with support in tmpfs. Support in other filesystem impls is still needed. Unlike in Linux and vfs1, we need to plumb inotify down to each filesystem implementation in order to keep track of links/inode structures properly. IN_EXCL_UNLINK still needs to be implemented, as well as a few inotify hooks that are not present in either vfs1 or vfs2. Those will be addressed in subsequent changes. Updates #1479. PiperOrigin-RevId: 313781995 --- pkg/sentry/fsimpl/ext/dentry.go | 12 + pkg/sentry/fsimpl/gofer/gofer.go | 12 + pkg/sentry/fsimpl/kernfs/kernfs.go | 12 + pkg/sentry/fsimpl/tmpfs/BUILD | 1 + pkg/sentry/fsimpl/tmpfs/directory.go | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 40 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 60 ++- pkg/sentry/kernel/fd_table.go | 8 +- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/inotify.go | 134 ++++++ pkg/sentry/syscalls/linux/vfs2/read_write.go | 36 ++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 8 +- pkg/sentry/vfs/BUILD | 15 + pkg/sentry/vfs/anonfs.go | 12 + pkg/sentry/vfs/dentry.go | 27 ++ pkg/sentry/vfs/inotify.go | 675 +++++++++++++++++++++++++++ pkg/sentry/vfs/vfs.go | 1 + test/syscalls/linux/BUILD | 5 +- test/syscalls/linux/inotify.cc | 176 ++++++- 19 files changed, 1211 insertions(+), 25 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/inotify.go create mode 100644 pkg/sentry/vfs/inotify.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index bfbd7c3d4..4d0deaf03 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -60,3 +60,15 @@ func (d *dentry) DecRef() { // inode.decRef(). d.inode.decRef() } + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 131da332f..850482a19 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1039,6 +1039,18 @@ func (d *dentry) decRefLocked() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) Watches() *vfs.Watches { + return nil +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index a83151ad3..682545994 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -225,6 +225,18 @@ func (d *Dentry) destroy() { } } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements vfs.DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) Watches() *vfs.Watches { + return nil +} + // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on // it's own isn't sufficient to insert a child into a directory. InsertChild diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 007be1572..062321cbc 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -59,6 +59,7 @@ go_library( "//pkg/sentry/pgalloc", "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", "//pkg/sentry/vfs/lock", diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index f2399981b..8bc475f88 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -112,6 +112,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba dir.iterMu.Lock() defer dir.iterMu.Unlock() + fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 7c04570f1..b4159f904 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -177,6 +177,12 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if err := create(parentDir, name); err != nil { return err } + + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parentDir.inode.watches.Notify(name, uint32(ev), 0) parentDir.inode.touchCMtime() return nil } @@ -241,6 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() + d.inode.watches.Notify("", linux.IN_ATTRIB, 0) parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) @@ -354,6 +361,7 @@ afterTrailingSymlink: if err != nil { return nil, err } + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0) parentDir.inode.touchCMtime() return fd, nil } @@ -559,6 +567,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParentDir.inode.touchCMtime() } renamed.inode.touchCtime() + + vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) return nil } @@ -603,6 +613,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0) // Remove links for child, child/., and child/.. child.inode.decLinksLocked() child.inode.decLinksLocked() @@ -620,7 +631,14 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts if err != nil { return err } - return d.inode.setStat(ctx, rp.Credentials(), &opts.Stat) + if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + return err + } + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0) + } + return nil } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -700,6 +718,12 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } + + // Generate inotify events. Note that this must take place before the link + // count of the child is decremented, or else the watches may be dropped + // before these events are added. + vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + parentDir.removeChildLocked(child) child.inode.decLinksLocked() vfsObj.CommitDeleteDentry(&child.vfsd) @@ -756,7 +780,12 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt if err != nil { return err } - return d.inode.setxattr(rp.Credentials(), &opts) + if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. @@ -767,7 +796,12 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, if err != nil { return err } - return d.inode.removexattr(rp.Credentials(), name) + if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + return err + } + + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index b739095b7..1d83b6840 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -201,6 +201,26 @@ func (d *dentry) DecRef() { d.inode.decRef() } +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { + if d.inode.isDir() { + events |= linux.IN_ISDIR + } + + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + // Note that d.parent or d.name may be stale if there is a concurrent + // rename operation. Inotify does not provide consistency guarantees. + d.parent.inode.watches.Notify(d.name, events, cookie) + } + d.inode.watches.Notify("", events, cookie) +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + return &d.inode.watches +} + // inode represents a filesystem object. type inode struct { // fs is the owning filesystem. fs is immutable. @@ -236,6 +256,9 @@ type inode struct { // Advisory file locks, which lock at the inode level. locks lock.FileLocks + // Inotify watches for this inode. + watches vfs.Watches + impl interface{} // immutable } @@ -257,6 +280,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, i.ctime = now i.mtime = now // i.nlink initialized by caller + i.watches = vfs.Watches{} i.impl = impl } @@ -307,6 +331,7 @@ func (i *inode) tryIncRef() bool { func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.watches.HandleDeletion() if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -628,8 +653,12 @@ func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} + func (fd *fileDescription) inode() *inode { - return fd.vfsfd.Dentry().Impl().(*dentry).inode + return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. @@ -642,7 +671,16 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) - return fd.inode().setStat(ctx, creds, &opts.Stat) + d := fd.dentry() + if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + return err + } + + // Generate inotify events. + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0) + } + return nil } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. @@ -657,12 +695,26 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.inode().setxattr(auth.CredentialsFromContext(ctx), &opts) + d := fd.dentry() + if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name) + d := fd.dentry() + if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + + // Generate inotify events. + d.InotifyWithParent(linux.IN_ATTRIB, 0) + return nil } // NewMemfd creates a new tmpfs regular file and file description that can back diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ed40b5303..ef73e1169 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -152,7 +152,13 @@ func (f *FDTable) drop(file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // TODO(gvisor.dev/issue/1480): Release locks. - // TODO(gvisor.dev/issue/1479): Send inotify events. + + // Generate inotify events. + ev := uint32(linux.IN_CLOSE_NOWRITE) + if file.IsWritable() { + ev = linux.IN_CLOSE_WRITE + } + file.Dentry().InotifyWithParent(ev, 0) // Drop the table reference. file.DecRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index d56927ff5..9c8b44f64 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -12,6 +12,7 @@ go_library( "filesystem.go", "fscontext.go", "getdents.go", + "inotify.go", "ioctl.go", "memfd.go", "mmap.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go new file mode 100644 index 000000000..7d50b6a16 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -0,0 +1,134 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC + +// InotifyInit1 implements the inotify_init1() syscalls. +func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + flags := args[0].Int() + if flags&^allFlags != 0 { + return 0, nil, syserror.EINVAL + } + + ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags)) + if err != nil { + return 0, nil, err + } + defer ino.DecRef() + + fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ + CloseOnExec: flags&linux.IN_CLOEXEC != 0, + }) + + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// InotifyInit implements the inotify_init() syscalls. +func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[0].Value = 0 + return InotifyInit1(t, args) +} + +// fdToInotify resolves an fd to an inotify object. If successful, the file will +// have an extra ref and the caller is responsible for releasing the ref. +func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) { + f := t.GetFileVFS2(fd) + if f == nil { + // Invalid fd. + return nil, nil, syserror.EBADF + } + + ino, ok := f.Impl().(*vfs.Inotify) + if !ok { + // Not an inotify fd. + f.DecRef() + return nil, nil, syserror.EINVAL + } + + return ino, f, nil +} + +// InotifyAddWatch implements the inotify_add_watch() syscall. +func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + addr := args[1].Pointer() + mask := args[2].Uint() + + // "EINVAL: The given event mask contains no valid events." + // -- inotify_add_watch(2) + if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + return 0, nil, syserror.EINVAL + } + + // "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link." + // -- inotify(7) + follow := followFinalSymlink + if mask&linux.IN_DONT_FOLLOW == 0 { + follow = nofollowFinalSymlink + } + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + if mask&linux.IN_ONLYDIR != 0 { + path.Dir = true + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) + if err != nil { + return 0, nil, err + } + defer d.DecRef() + + fd = ino.AddWatch(d.Dentry(), mask) + return uintptr(fd), nil, err +} + +// InotifyRmWatch implements the inotify_rm_watch() syscall. +func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + wd := args[1].Int() + + ino, f, err := fdToInotify(t, fd) + if err != nil { + return 0, nil, err + } + defer f.DecRef() + return 0, nil, ino.RmWatch(wd) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 3a7ef24f5..92b5631a3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -93,11 +93,17 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } @@ -128,6 +134,9 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return total, err } @@ -248,11 +257,17 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } @@ -283,6 +298,9 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return total, err } @@ -345,11 +363,17 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } @@ -380,6 +404,9 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return total, err } @@ -500,11 +527,17 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + } return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { + if n > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return n, err } @@ -535,6 +568,9 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o } file.EventUnregister(&w) + if total > 0 { + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + } return total, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 083fdcf82..ef8358b8a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -116,9 +116,9 @@ func Override() { s.Table[232] = syscalls.Supported("epoll_wait", EpollWait) s.Table[233] = syscalls.Supported("epoll_ctl", EpollCtl) s.Table[235] = syscalls.Supported("utimes", Utimes) - delete(s.Table, 253) // inotify_init - delete(s.Table, 254) // inotify_add_watch - delete(s.Table, 255) // inotify_rm_watch + s.Table[253] = syscalls.PartiallySupported("inotify_init", InotifyInit, "inotify events are only available inside the sandbox.", nil) + s.Table[254] = syscalls.PartiallySupported("inotify_add_watch", InotifyAddWatch, "inotify events are only available inside the sandbox.", nil) + s.Table[255] = syscalls.PartiallySupported("inotify_rm_watch", InotifyRmWatch, "inotify events are only available inside the sandbox.", nil) s.Table[257] = syscalls.Supported("openat", Openat) s.Table[258] = syscalls.Supported("mkdirat", Mkdirat) s.Table[259] = syscalls.Supported("mknodat", Mknodat) @@ -151,7 +151,7 @@ func Override() { s.Table[291] = syscalls.Supported("epoll_create1", EpollCreate1) s.Table[292] = syscalls.Supported("dup3", Dup3) s.Table[293] = syscalls.Supported("pipe2", Pipe2) - delete(s.Table, 294) // inotify_init1 + s.Table[294] = syscalls.PartiallySupported("inotify_init1", InotifyInit1, "inotify events are only available inside the sandbox.", nil) s.Table[295] = syscalls.Supported("preadv", Preadv) s.Table[296] = syscalls.Supported("pwritev", Pwritev) s.Table[299] = syscalls.Supported("recvmmsg", RecvMMsg) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 94d69c1cc..774cc66cc 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -15,6 +15,18 @@ go_template_instance( }, ) +go_template_instance( + name = "event_list", + out = "event_list.go", + package = "vfs", + prefix = "event", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Event", + "Linker": "*Event", + }, +) + go_library( name = "vfs", srcs = [ @@ -25,11 +37,13 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "event_list.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", "filesystem_impl_util.go", "filesystem_type.go", + "inotify.go", "mount.go", "mount_unsafe.go", "options.go", @@ -57,6 +71,7 @@ go_library( "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/uniqueid", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index caf770fd5..55a3d54cc 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -297,3 +297,15 @@ func (d *anonDentry) TryIncRef() bool { func (d *anonDentry) DecRef() { // no-op } + +// InotifyWithParent implements DentryImpl.InotifyWithParent. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32) {} + +// Watches implements DentryImpl.Watches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) Watches() *Watches { + return nil +} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 8624dbd5d..d61b9e09b 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -103,6 +103,22 @@ type DentryImpl interface { // DecRef decrements the Dentry's reference count. DecRef() + + // InotifyWithParent notifies all watches on the targets represented by this + // dentry and its parent. The parent's watches are notified first, followed + // by this dentry's. + // + // InotifyWithParent automatically adds the IN_ISDIR flag for dentries + // representing directories. + // + // Note that the events may not actually propagate up to the user, depending + // on the event masks. + InotifyWithParent(events uint32, cookie uint32) + + // Watches returns the set of inotify watches for the file corresponding to + // the Dentry. Dentries that are hard links to the same underlying file + // share the same watches. + Watches() *Watches } // IncRef increments d's reference count. @@ -133,6 +149,17 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } +// InotifyWithParent notifies all watches on the inodes for this dentry and +// its parent of events. +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) { + d.impl.InotifyWithParent(events, cookie) +} + +// Watches returns the set of inotify watches associated with d. +func (d *Dentry) Watches() *Watches { + return d.impl.Watches() +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go new file mode 100644 index 000000000..1d28ccb46 --- /dev/null +++ b/pkg/sentry/vfs/inotify.go @@ -0,0 +1,675 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "bytes" + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/uniqueid" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements FileDescriptionImpl. +// +// Lock ordering: +// Inotify.mu -> Watches.mu -> Inotify.evMu +// +// +stateify savable +type Inotify struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // queue is used to notify interested parties when the inotify instance + // becomes readable or writable. + queue waiter.Queue `state:"nosave"` + + // evMu *only* protects the events list. We need a separate lock while + // queuing events: using mu may violate lock ordering, since at that point + // the calling goroutine may already hold Watches.mu. + evMu sync.Mutex `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events eventList + + // A scratch buffer, used to serialize inotify events. Allocate this + // ahead of time for the sake of performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // nextWatchMinusOne is used to allocate watch descriptors on this Inotify + // instance. Note that Linux starts numbering watch descriptors from 1. + nextWatchMinusOne int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +var _ FileDescriptionImpl = (*Inotify)(nil) + +// NewInotifyFD constructs a new Inotify instance. +func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) { + // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs. + flags &^= linux.O_CLOEXEC + if flags&^linux.O_NONBLOCK != 0 { + return nil, syserror.EINVAL + } + + id := uniqueid.GlobalFromContext(ctx) + vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) + defer vd.DecRef() + fd := &Inotify{ + id: id, + scratch: make([]byte, inotifyEventBaseSize), + watches: make(map[int32]*Watch), + } + if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// Release implements FileDescriptionImpl.Release. Release removes all +// watches and frees all resources for an inotify instance. +func (i *Inotify) Release() { + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.handleDeletion from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + defer i.mu.Unlock() + for _, w := range i.watches { + // Remove references to the watch from the watches set on the target. We + // don't need to worry about the references from i.watches, since this + // file description is about to be destroyed. + w.set.Remove(i.id) + } +} + +// EventRegister implements waiter.Waitable. +func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + i.queue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable. +func (i *Inotify) EventUnregister(e *waiter.Entry) { + i.queue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// PRead implements FileDescriptionImpl. +func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// PWrite implements FileDescriptionImpl. +func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.ESPIPE +} + +// Write implements FileDescriptionImpl.Write. +func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileDescriptionImpl.Read. +func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for it := i.events.Front(); it != nil; { + // Advance `it` before the element is removed from the list, or else + // it.Next() will always be nil. + event := it + it = it.Next() + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(event) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + n += uint32(e.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + i.evMu.Unlock() + return + } + } + + i.events.PushBack(ev) + + // Release mutex before notifying waiters because we don't control what they + // can do. + i.evMu.Unlock() + + i.queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +// +// Precondition: i.mu must be locked. +func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch { + targetWatches := target.Watches() + w := &Watch{ + owner: i, + wd: i.nextWatchIDLocked(), + set: targetWatches, + mask: mask, + } + + // Hold the watch in this inotify instance as well as the watch set on the + // target. + i.watches[w.wd] = w + targetWatches.Add(w) + return w +} + +// newWatchIDLocked allocates and returns a new watch descriptor. +// +// Precondition: i.mu must be locked. +func (i *Inotify) nextWatchIDLocked() int32 { + i.nextWatchMinusOne++ + return i.nextWatchMinusOne +} + +// handleDeletion handles the deletion of the target of watch w. It removes w +// from i.watches and a watch removal event is generated. +func (i *Inotify) handleDeletion(w *Watch) { + i.mu.Lock() + _, found := i.watches[w.wd] + delete(i.watches, w.wd) + i.mu.Unlock() + + if found { + i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) + } +} + +// AddWatch constructs a new inotify watch and adds it to the target. It +// returns the watch descriptor returned by inotify_add_watch(2). +func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + // Does the target already have a watch from this inotify instance? + if existing := target.Watches().Lookup(i.id); existing != nil { + newmask := mask + if mask&linux.IN_MASK_ADD != 0 { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + newmask |= atomic.LoadUint32(&existing.mask) + } + atomic.StoreUint32(&existing.mask, newmask) + return existing.wd + } + + // No existing watch, create a new watch. + w := i.newWatchLocked(target, mask) + return w.wd +} + +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to removed. + w, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + w.set.Remove(w.OwnerID()) + i.mu.Unlock() + + // Generate the event for the removal. + i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) + + return nil +} + +// Watches is the collection of all inotify watches on a single file. +// +// +stateify savable +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch +} + +// Lookup returns the watch owned by an inotify instance with the given id. +// Returns nil if no such watch exists. +// +// Precondition: the inotify instance with the given id must be locked to +// prevent the returned watch from being concurrently modified or replaced in +// Inotify.watches. +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + owner := watch.OwnerID() + // Sanity check, we should never have two watches for one owner on the + // same target. + if _, exists := w.ws[owner]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", owner)) + } + if w.ws == nil { + w.ws = make(map[uint64]*Watch) + } + w.ws[owner] = watch +} + +// Remove removes a watch with the given id from this set of watches and +// releases it. The caller is responsible for generating any watch removal +// event, as appropriate. The provided id must match an existing watch in this +// collection. +// +// Precondition: the inotify instance with the given id must be locked. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no references on the target because we raced with the + // destructor notifying all the watch owners of destruction. See the + // comment in Watches.HandleDeletion for why this race exists. + return + } + + if _, ok := w.ws[id]; !ok { + // While there's technically no problem with silently ignoring a missing + // watch, this is almost certainly a bug. + panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + } + delete(w.ws, id) +} + +// Notify queues a new event with all watches in this set. +func (w *Watches) Notify(name string, events, cookie uint32) { + // N.B. We don't defer the unlocks because Notify is in the hot path of + // all IO operations, and the defer costs too much for small IO + // operations. + w.mu.RLock() + for _, watch := range w.ws { + // TODO(gvisor.dev/issue/1479): Skip for IN_EXCL_UNLINK cases. + watch.Notify(name, events, cookie) + } + w.mu.RUnlock() +} + +// HandleDeletion is called when the watch target is destroyed to emit +// the appropriate events. +func (w *Watches) HandleDeletion() { + w.Notify("", linux.IN_DELETE_SELF, 0) + + // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied + // by value. Ideally, we wouldn't have this circular locking so we can just + // notify of IN_DELETE_SELF in the same loop below. + // + // We can't hold w.mu while calling watch.handleDeletion to preserve lock + // ordering w.r.t to the owner inotify instances. Instead, atomically move + // the watches map into a local variable so we can iterate over it safely. + // + // Because of this however, it is possible for the watches' owners to reach + // this inode while the inode has no refs. This is still safe because the + // owners can only reach the inode until this function finishes calling + // watch.handleDeletion below and the inode is guaranteed to exist in the + // meantime. But we still have to be very careful not to rely on inode state + // that may have been already destroyed. + var ws map[uint64]*Watch + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + for _, watch := range ws { + // TODO(gvisor.dev/issue/1479): consider refactoring this. + watch.handleDeletion() + } +} + +// Watch represent a particular inotify watch created by inotify_add_watch. +// +// +stateify savable +type Watch struct { + // Inotify instance which owns this watch. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + wd int32 + + // set is the watch set containing this watch. It belongs to the target file + // of this watch. + set *Watches + + // Events being monitored via this watch. Must be accessed with atomic + // memory operations. + mask uint32 +} + +// OwnerID returns the id of the inotify instance that owns this watch. +func (w *Watch) OwnerID() uint64 { + return w.owner.id +} + +// ExcludeUnlinkedChildren indicates whether the watched object should continue +// to be notified of events of its children after they have been unlinked, e.g. +// for an open file descriptor. +// +// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK. +// We can do this by keeping track of the set of unlinked children in Watches +// to skip notification. +func (w *Watch) ExcludeUnlinkedChildren() bool { + return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 +} + +// Notify queues a new event on this watch. +func (w *Watch) Notify(name string, events uint32, cookie uint32) { + mask := atomic.LoadUint32(&w.mask) + if mask&events == 0 { + // We weren't watching for this event. + return + } + + // Event mask should include bits matched from the watch plus all control + // event bits. + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | mask + matchedEvents := effectiveMask & events + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) +} + +// handleDeletion handles the deletion of w's target. +func (w *Watch) handleDeletion() { + w.owner.handleDeletion(w) +} + +// Event represents a struct inotify_event from linux. +// +// +stateify savable +type Event struct { + eventEntry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. + name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in the 's' plus at least one null byte. +func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("overflow") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Santiy check. + if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} + +// InotifyEventFromStatMask generates the appropriate events for an operation +// that set the stats specified in mask. +func InotifyEventFromStatMask(mask uint32) uint32 { + var ev uint32 + if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 { + ev |= linux.IN_ATTRIB + } + if mask&linux.STATX_SIZE != 0 { + ev |= linux.IN_MODIFY + } + + if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) { + // Both times indicates a utime(s) call. + ev |= linux.IN_ATTRIB + } else if mask&linux.STATX_ATIME != 0 { + ev |= linux.IN_ACCESS + } else if mask&linux.STATX_MTIME != 0 { + mask |= linux.IN_MODIFY + } + return ev +} + +// InotifyRemoveChild sends the appriopriate notifications to the watch sets of +// the child being removed and its parent. +func InotifyRemoveChild(self, parent *Watches, name string) { + self.Notify("", linux.IN_ATTRIB, 0) + parent.Notify(name, linux.IN_DELETE, 0) + // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. +} + +// InotifyRename sends the appriopriate notifications to the watch sets of the +// file being renamed and its old/new parents. +func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) { + var dirEv uint32 + if isDir { + dirEv = linux.IN_ISDIR + } + cookie := uniqueid.InotifyCookie(ctx) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie) + // Somewhat surprisingly, self move events do not have a cookie. + renamed.Notify("", linux.IN_MOVE_SELF, 0) +} diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 6d2ba53ea..be6f21dba 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -422,6 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0) return fd, nil } if !rp.handleError(err) { diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 5acdb8438..f4b5de18d 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -951,6 +951,7 @@ cc_binary( "//test/util:epoll_util", "//test/util:file_descriptor", "//test/util:fs_util", + "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", @@ -1382,7 +1383,7 @@ cc_binary( srcs = ["partial_bad_buffer.cc"], linkstatic = 1, deps = [ - "//test/syscalls/linux:socket_test_util", + ":socket_test_util", "//test/util:file_descriptor", "//test/util:fs_util", "@com_google_absl//absl/time", @@ -3461,7 +3462,7 @@ cc_binary( deps = [ ":socket_test_util", gtest, - "//test/syscalls/linux:socket_netlink_route_util", + ":socket_netlink_route_util", "//test/util:capability_util", "//test/util:file_descriptor", "//test/util:fs_util", diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 0e13ad190..e4565467b 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -33,6 +33,7 @@ #include "test/util/epoll_util.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -335,6 +336,11 @@ TEST(Inotify, InotifyFdNotWritable) { EXPECT_THAT(write(fd.get(), "x", 1), SyscallFailsWithErrno(EBADF)); } +TEST(Inotify, InitFlags) { + EXPECT_THAT(inotify_init1(IN_NONBLOCK | IN_CLOEXEC), SyscallSucceeds()); + EXPECT_THAT(inotify_init1(12345), SyscallFailsWithErrno(EINVAL)); +} + TEST(Inotify, NonBlockingReadReturnsEagain) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); @@ -395,7 +401,7 @@ TEST(Inotify, CanDeleteFileAfterRemovingWatch) { file1.reset(); } -TEST(Inotify, CanRemoveWatchAfterDeletingFile) { +TEST(Inotify, RemoveWatchAfterDeletingFileFails) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); @@ -491,17 +497,23 @@ TEST(Inotify, DeletingChildGeneratesEvents) { Event(IN_DELETE, root_wd, Basename(file1_path))})); } +// Creating a file in "parent/child" should generate events for child, but not +// parent. TEST(Inotify, CreatingFileGeneratesEvents) { - const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath child = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS)); const int wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS)); // Create a new file in the directory. const TempPath file1 = - ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(child.path())); const std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); @@ -554,6 +566,47 @@ TEST(Inotify, WritingFileGeneratesModifyEvent) { ASSERT_THAT(events, Are({Event(IN_MODIFY, wd, Basename(file1.path()))})); } +TEST(Inotify, SizeZeroReadWriteGeneratesNothing) { + const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const TempPath file1 = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); + + const FileDescriptor file1_fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_RDWR)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + + // Read from the empty file. + int val; + ASSERT_THAT(read(file1_fd.get(), &val, sizeof(val)), + SyscallSucceedsWithValue(0)); + + // Write zero bytes. + ASSERT_THAT(write(file1_fd.get(), "", 0), SyscallSucceedsWithValue(0)); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); +} + +TEST(Inotify, FailedFileCreationGeneratesNoEvents) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), dir.path(), IN_ALL_EVENTS)); + + const char* p = dir.path().c_str(); + ASSERT_THAT(mkdir(p, 0777), SyscallFails()); + ASSERT_THAT(mknod(p, S_IFIFO, 0777), SyscallFails()); + ASSERT_THAT(symlink(p, p), SyscallFails()); + ASSERT_THAT(link(p, p), SyscallFails()); + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({})); +} + TEST(Inotify, WatchSetAfterOpenReportsCloseFdEvent) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = @@ -602,7 +655,7 @@ TEST(Inotify, ChildrenDeletionInWatchedDirGeneratesEvent) { Event(IN_DELETE | IN_ISDIR, wd, Basename(dir1_path))})); } -TEST(Inotify, WatchTargetDeletionGeneratesEvent) { +TEST(Inotify, RmdirOnWatchedTargetGeneratesEvent) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); @@ -1228,7 +1281,7 @@ TEST(Inotify, LinkGeneratesAttribAndCreateEvents) { InotifyAddWatch(fd.get(), file1.path(), IN_ALL_EVENTS)); const int rc = link(file1.path().c_str(), link1.path().c_str()); - // link(2) is only supported on tmpfs in the sandbox. + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. SKIP_IF(IsRunningOnGvisor() && rc != 0 && (errno == EPERM || errno == ENOENT)); ASSERT_THAT(rc, SyscallSucceeds()); @@ -1322,21 +1375,27 @@ TEST(Inotify, HardlinksReuseSameWatch) { Event(IN_DELETE, root_wd, Basename(file1_path))})); } +// Calling mkdir within "parent/child" should generate an event for child, but +// not parent. TEST(Inotify, MkdirGeneratesCreateEventWithDirFlag) { - const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath child = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const int root_wd = ASSERT_NO_ERRNO_AND_VALUE( - InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), parent.path(), IN_ALL_EVENTS)); + const int child_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), child.path(), IN_ALL_EVENTS)); - const TempPath dir1(NewTempAbsPathInDir(root.path())); + const TempPath dir1(NewTempAbsPathInDir(child.path())); ASSERT_THAT(mkdir(dir1.path().c_str(), 0777), SyscallSucceeds()); const std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); ASSERT_THAT( events, - Are({Event(IN_CREATE | IN_ISDIR, root_wd, Basename(dir1.path()))})); + Are({Event(IN_CREATE | IN_ISDIR, child_wd, Basename(dir1.path()))})); } TEST(Inotify, MultipleInotifyInstancesAndWatchesAllGetEvents) { @@ -1597,6 +1656,8 @@ TEST(Inotify, EpollNoDeadlock) { } TEST(Inotify, SpliceEvent) { + // TODO(gvisor.dev/issue/138): Implement splice in VFS2. + SKIP_IF(IsRunningOnGvisor() && !IsRunningWithVFS1()); int pipes[2]; ASSERT_THAT(pipe2(pipes, O_NONBLOCK), SyscallSucceeds()); @@ -1624,6 +1685,99 @@ TEST(Inotify, SpliceEvent) { ASSERT_THAT(events, Are({Event(IN_ACCESS, watcher)})); } +// Watches on a parent should not be triggered by actions on a hard link to one +// of its children that has a different parent. +TEST(Inotify, LinkOnOtherParent) { + const TempPath dir1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath dir2 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir1.path())); + std::string link_path = NewTempAbsPathInDir(dir2.path()); + + const int rc = link(file.path().c_str(), link_path.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && + (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), dir1.path(), IN_ALL_EVENTS)); + + // Perform various actions on the link outside of dir1, which should trigger + // no inotify events. + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(link_path.c_str(), O_RDWR)); + int val = 0; + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds()); + ASSERT_THAT(unlink(link_path.c_str()), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({})); +} + +TEST(Inotify, Exec) { + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath bin = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(dir.path(), "/bin/true")); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), bin.path(), IN_ALL_EVENTS)); + + // Perform exec. + ScopedThread t([&bin]() { + ASSERT_THAT(execl(bin.path().c_str(), bin.path().c_str(), (char*)nullptr), + SyscallSucceeds()); + }); + t.Join(); + + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + EXPECT_THAT(events, Are({Event(IN_OPEN, wd), Event(IN_ACCESS, wd)})); +} + +// Watches without IN_EXCL_UNLINK, should continue to emit events for file +// descriptors after their corresponding files have been unlinked. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { + const DisableSave ds; + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), dir.path(), IN_ALL_EVENTS)); + const int file_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), file.path(), IN_ALL_EVENTS)); + + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + int val = 0; + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_ATTRIB, file_wd), + Event(IN_DELETE, dir_wd, Basename(file.path())), + Event(IN_ACCESS, dir_wd, Basename(file.path())), + Event(IN_ACCESS, file_wd), + Event(IN_MODIFY, dir_wd, Basename(file.path())), + Event(IN_MODIFY, file_wd), + })); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 9ada8c972e43e0c3e144b432fe57d95f823a1847 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 29 May 2020 11:27:00 -0700 Subject: Fix the smallest of typos. PiperOrigin-RevId: 313817646 --- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 682545994..3f1220791 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -239,7 +239,7 @@ func (d *Dentry) Watches() *vfs.Watches { // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on -// it's own isn't sufficient to insert a child into a directory. InsertChild +// its own isn't sufficient to insert a child into a directory. InsertChild // updates the link count on d if required. // // Precondition: d must represent a directory inode. -- cgit v1.2.3 From ccf69bdd7e05a4e5f404fbef89a7f49f218645e2 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 29 May 2020 12:27:15 -0700 Subject: Implement IN_EXCL_UNLINK inotify option in vfs2. Limited to tmpfs. Inotify support in other filesystem implementations to follow. Updates #1479 PiperOrigin-RevId: 313828648 --- pkg/sentry/fsimpl/ext/dentry.go | 2 +- pkg/sentry/fsimpl/gofer/gofer.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/tmpfs/directory.go | 3 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 14 +-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 18 +-- pkg/sentry/kernel/fd_table.go | 2 +- pkg/sentry/syscalls/linux/vfs2/read_write.go | 24 ++-- pkg/sentry/vfs/anonfs.go | 2 +- pkg/sentry/vfs/dentry.go | 6 +- pkg/sentry/vfs/inotify.go | 38 +++++-- pkg/sentry/vfs/vfs.go | 2 +- test/syscalls/linux/inotify.cc | 164 +++++++++++++++++++++++++++ 13 files changed, 235 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 4d0deaf03..6bd1a9fc6 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -64,7 +64,7 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 850482a19..3f3bd56f0 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1042,7 +1042,7 @@ func (d *dentry) decRefLocked() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 3f1220791..bbee8ccda 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -228,7 +228,7 @@ func (d *Dentry) destroy() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 8bc475f88..70387cb9c 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -79,6 +79,7 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() + child.unlinked = true } type directoryFD struct { @@ -112,7 +113,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba dir.iterMu.Lock() defer dir.iterMu.Unlock() - fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0) + fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index b4159f904..183eb975c 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0) + parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) parentDir.inode.touchCMtime() return nil } @@ -247,7 +247,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } d.inode.incLinksLocked() - d.inode.watches.Notify("", linux.IN_ATTRIB, 0) + d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) parentDir.insertChildLocked(fs.newDentry(d.inode), name) return nil }) @@ -361,7 +361,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0) + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) parentDir.inode.touchCMtime() return fd, nil } @@ -613,7 +613,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) // Remove links for child, child/., and child/.. child.inode.decLinksLocked() child.inode.decLinksLocked() @@ -636,7 +636,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0) + d.InotifyWithParent(ev, 0, vfs.InodeEvent) } return nil } @@ -784,7 +784,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -800,7 +800,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 1d83b6840..f0e098702 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -163,6 +163,11 @@ type dentry struct { // filesystem.mu. name string + // unlinked indicates whether this dentry has been unlinked from its parent. + // It is only set to true on an unlink operation, and never set from true to + // false. unlinked is protected by filesystem.mu. + unlinked bool + // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry @@ -202,7 +207,7 @@ func (d *dentry) DecRef() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } @@ -211,9 +216,9 @@ func (d *dentry) InotifyWithParent(events uint32, cookie uint32) { if d.parent != nil { // Note that d.parent or d.name may be stale if there is a concurrent // rename operation. Inotify does not provide consistency guarantees. - d.parent.inode.watches.Notify(d.name, events, cookie) + d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked) } - d.inode.watches.Notify("", events, cookie) + d.inode.watches.Notify("", events, cookie, et) } // Watches implements vfs.DentryImpl.Watches. @@ -676,9 +681,8 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } - // Generate inotify events. if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0) + d.InotifyWithParent(ev, 0, vfs.InodeEvent) } return nil } @@ -701,7 +705,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -713,7 +717,7 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0) + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index ef73e1169..dbfcef0fa 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -158,7 +158,7 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0) + file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) // Drop the table reference. file.DecRef() diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index 92b5631a3..7f9debd4a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -94,7 +94,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -258,7 +258,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -364,7 +364,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -528,7 +528,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0) + file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0) + file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 55a3d54cc..b7c6b60b8 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -301,7 +301,7 @@ func (d *anonDentry) DecRef() { // InotifyWithParent implements DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32) {} +func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. // diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index d61b9e09b..24af13eb1 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -113,7 +113,7 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events uint32, cookie uint32) + InotifyWithParent(events uint32, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file @@ -151,8 +151,8 @@ func (d *Dentry) isMounted() bool { // InotifyWithParent notifies all watches on the inodes for this dentry and // its parent of events. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32) { - d.impl.InotifyWithParent(events, cookie) +func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) { + d.impl.InotifyWithParent(events, cookie, et) } // Watches returns the set of inotify watches associated with d. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 1d28ccb46..05a3051a4 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -33,6 +33,19 @@ import ( // must be a power 2 for rounding below. const inotifyEventBaseSize = 16 +// EventType defines different kinds of inotfiy events. +// +// The way events are labelled appears somewhat arbitrary, but they must match +// Linux so that IN_EXCL_UNLINK behaves as it does in Linux. +type EventType uint8 + +// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and +// FSNOTIFY_EVENT_INODE in Linux. +const ( + PathEvent EventType = iota + InodeEvent EventType = iota +) + // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements FileDescriptionImpl. // @@ -419,13 +432,22 @@ func (w *Watches) Remove(id uint64) { } // Notify queues a new event with all watches in this set. -func (w *Watches) Notify(name string, events, cookie uint32) { +func (w *Watches) Notify(name string, events, cookie uint32, et EventType) { + w.NotifyWithExclusions(name, events, cookie, et, false) +} + +// NotifyWithExclusions queues a new event with watches in this set. Watches +// with IN_EXCL_UNLINK are skipped if the event is coming from a child that +// has been unlinked. +func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) { // N.B. We don't defer the unlocks because Notify is in the hot path of // all IO operations, and the defer costs too much for small IO // operations. w.mu.RLock() for _, watch := range w.ws { - // TODO(gvisor.dev/issue/1479): Skip for IN_EXCL_UNLINK cases. + if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent { + continue + } watch.Notify(name, events, cookie) } w.mu.RUnlock() @@ -434,7 +456,7 @@ func (w *Watches) Notify(name string, events, cookie uint32) { // HandleDeletion is called when the watch target is destroyed to emit // the appropriate events. func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0) + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied // by value. Ideally, we wouldn't have this circular locking so we can just @@ -655,8 +677,8 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // InotifyRemoveChild sends the appriopriate notifications to the watch sets of // the child being removed and its parent. func InotifyRemoveChild(self, parent *Watches, name string) { - self.Notify("", linux.IN_ATTRIB, 0) - parent.Notify(name, linux.IN_DELETE, 0) + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. } @@ -668,8 +690,8 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, dirEv = linux.IN_ISDIR } cookie := uniqueid.InotifyCookie(ctx) - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie) - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) // Somewhat surprisingly, self move events do not have a cookie. - renamed.Notify("", linux.IN_MOVE_SELF, 0) + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index be6f21dba..52643a7c5 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -422,7 +422,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0) + fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(err) { diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index e4565467b..2306d9cab 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1778,6 +1778,170 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { })); } +// Watches created with IN_EXCL_UNLINK will stop emitting events on fds for +// children that have already been unlinked. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlink_NoRandomSave) { + const DisableSave ds; + // TODO(gvisor.dev/issue/1624): This test fails on VFS1. + SKIP_IF(IsRunningWithVFS1()); + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // Unlink the child, which should cause further operations on the open file + // descriptor to be ignored. + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + int val = 0; + ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))})); +} + +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) { + const DisableSave ds; + + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + TempPath dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirIn(parent.path())); + std::string dirPath = dir.path(); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // Unlink the dir, and then close the open fd. + ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds()); + dir.reset(); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + // No close event should appear. + ASSERT_THAT(events, + Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))})); +} + +// If "dir/child" and "dir/child2" are links to the same file, and "dir/child" +// is unlinked, a watch on "dir" with IN_EXCL_UNLINK will exclude future events +// for fds on "dir/child" but not "dir/child2". +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlinkMultipleChildren_NoRandomSave) { + const DisableSave ds; + // TODO(gvisor.dev/issue/1624): This test fails on VFS1. + SKIP_IF(IsRunningWithVFS1()); + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + std::string path1 = file.path(); + std::string path2 = NewTempAbsPathInDir(dir.path()); + + const int rc = link(path1.c_str(), path2.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(IsRunningOnGvisor() && rc != 0 && + (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + const FileDescriptor fd1 = + ASSERT_NO_ERRNO_AND_VALUE(Open(path1.c_str(), O_RDWR)); + const FileDescriptor fd2 = + ASSERT_NO_ERRNO_AND_VALUE(Open(path2.c_str(), O_RDWR)); + + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // After unlinking path1, only events on the fd for path2 should be generated. + ASSERT_THAT(unlink(path1.c_str()), SyscallSucceeds()); + ASSERT_THAT(write(fd1.get(), "x", 1), SyscallSucceeds()); + ASSERT_THAT(write(fd2.get(), "x", 1), SyscallSucceeds()); + + const std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_DELETE, wd, Basename(path1)), + Event(IN_MODIFY, wd, Basename(path2)), + })); +} + +// On native Linux, actions of data type FSNOTIFY_EVENT_INODE are not affected +// by IN_EXCL_UNLINK (see +// fs/notify/inotify/inotify_fsnotify.c:inotify_handle_event). Inode-level +// events include changes to metadata and extended attributes. +// +// We need to disable S/R because there are filesystems where we cannot re-open +// fds to an unlinked file across S/R, e.g. gofer-backed filesytems. +TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { + const DisableSave ds; + + const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const TempPath file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + + // NOTE(b/157163751): Create another link before unlinking. This is needed for + // the gofer filesystem in gVisor, where open fds will not work once the link + // count hits zero. In VFS2, we end up skipping the gofer test anyway, because + // hard links are not supported for gofer fs. + if (IsRunningOnGvisor()) { + std::string link_path = NewTempAbsPath(); + const int rc = link(file.path().c_str(), link_path.c_str()); + // NOTE(b/34861058): link(2) is only supported on tmpfs in the sandbox. + SKIP_IF(rc != 0 && (errno == EPERM || errno == ENOENT)); + ASSERT_THAT(rc, SyscallSucceeds()); + } + + // Even after unlinking, inode-level operations will trigger events regardless + // of IN_EXCL_UNLINK. + ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); + + // Perform various actions on fd. + ASSERT_THAT(ftruncate(fd.get(), 12345), SyscallSucceeds()); + std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_DELETE, wd, Basename(file.path())), + Event(IN_MODIFY, wd, Basename(file.path())), + })); + + struct timeval times[2] = {{1, 0}, {2, 0}}; + ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds()); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); + + // S/R is disabled on this entire test due to behavior with unlink; it must + // also be disabled after this point because of fchmod. + ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds()); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 3a987160aa09f814a8459ed3f6192ce741b701a3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 1 Jun 2020 15:31:59 -0700 Subject: Handle gofer blocking opens of host named pipes in VFS2. Using tee instead of read to detect when a O_RDONLY|O_NONBLOCK pipe FD has a writer circumvents the problem of what to do with the byte read from the pipe, avoiding much of the complexity of the fdpipe package. PiperOrigin-RevId: 314216146 --- pkg/sentry/fsimpl/gofer/BUILD | 3 + pkg/sentry/fsimpl/gofer/filesystem.go | 56 ++++++++++------- pkg/sentry/fsimpl/gofer/host_named_pipe.go | 97 ++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/special_file.go | 77 ++++++++++++++++++++++-- runsc/boot/filter/config.go | 8 +++ 5 files changed, 214 insertions(+), 27 deletions(-) create mode 100644 pkg/sentry/fsimpl/gofer/host_named_pipe.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 67e916525..f5f35a3bc 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -35,6 +35,7 @@ go_library( "fstree.go", "gofer.go", "handle.go", + "host_named_pipe.go", "p9file.go", "regular_file.go", "socket.go", @@ -47,6 +48,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fd", + "//pkg/fdnotifier", "//pkg/fspath", "//pkg/log", "//pkg/p9", @@ -71,6 +73,7 @@ go_library( "//pkg/unet", "//pkg/usermem", "//pkg/waiter", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 7f2181216..0000ca6ba 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -873,19 +873,37 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - h, err := openHandle(ctx, d.file, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0) + // We assume that the server silently inserts O_NONBLOCK in the open flags + // for all named pipes (because all existing gofers do this). + // + // NOTE(b/133875563): This makes named pipe opens racy, because the + // mechanisms for translating nonblocking to blocking opens can only detect + // the instantaneous presence of a peer holding the other end of the pipe + // open, not whether the pipe was *previously* opened by a peer that has + // since closed its end. + isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 +retry: + h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { + if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO { + // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails + // with ENXIO if opening the same named pipe with O_WRONLY would + // block because there are no readers of the pipe. + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return nil, err + } + goto retry + } return nil, err } - seekable := d.fileType() == linux.S_IFREG - fd := &specialFileFD{ - handle: h, - seekable: seekable, + if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { + if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil { + h.close(ctx) + return nil, err + } } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ - DenyPRead: !seekable, - DenyPWrite: !seekable, - }); err != nil { + fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + if err != nil { h.close(ctx) return nil, err } @@ -981,22 +999,16 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } else { - seekable := child.fileType() == linux.S_IFREG - fd := &specialFileFD{ - handle: handle{ - file: openFile, - fd: -1, - }, - seekable: seekable, + h := handle{ + file: openFile, + fd: -1, } if fdobj != nil { - fd.handle.fd = int32(fdobj.Release()) + h.fd = int32(fdobj.Release()) } - if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ - DenyPRead: !seekable, - DenyPWrite: !seekable, - }); err != nil { - fd.handle.close(ctx) + fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + if err != nil { + h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go new file mode 100644 index 000000000..7294de7d6 --- /dev/null +++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go @@ -0,0 +1,97 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "sync" + "time" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create +// pipes after sentry initialization due to syscall filters. +var ( + tempPipeMu sync.Mutex + tempPipeReadFD int + tempPipeWriteFD int + tempPipeBuf [1]byte +) + +func init() { + var pipeFDs [2]int + if err := unix.Pipe(pipeFDs[:]); err != nil { + panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err)) + } + tempPipeReadFD = pipeFDs[0] + tempPipeWriteFD = pipeFDs[1] +} + +func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error { + for { + ok, err := nonblockingPipeHasWriter(fd) + if err != nil { + return err + } + if ok { + return nil + } + if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { + return err + } + } +} + +func nonblockingPipeHasWriter(fd int32) (bool, error) { + tempPipeMu.Lock() + defer tempPipeMu.Unlock() + // Copy 1 byte from fd into the temporary pipe. + n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK) + if err == syserror.EAGAIN { + // The pipe represented by fd is empty, but has a writer. + return true, nil + } + if err != nil { + return false, err + } + if n == 0 { + // The pipe represented by fd is empty and has no writer. + return false, nil + } + // The pipe represented by fd is non-empty, so it either has, or has + // previously had, a writer. Remove the byte copied to the temporary pipe + // before returning. + if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 { + panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err)) + } + return true, nil +} + +func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error { + t := time.NewTimer(100 * time.Millisecond) + defer t.Stop() + cancel := ctx.SleepStart() + select { + case <-t.C: + ctx.SleepFinish(true) + return nil + case <-cancel: + ctx.SleepFinish(false) + return syserror.ErrInterrupted + } +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a464e6a94..ff6126b87 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -19,17 +19,18 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) -// specialFileFD implements vfs.FileDescriptionImpl for files other than -// regular files, directories, and symlinks: pipes, sockets, etc. It is also -// used for regular files when filesystemOptions.specialRegularFiles is in -// effect. specialFileFD differs from regularFileFD by using per-FD handles -// instead of shared per-dentry handles, and never buffering I/O. +// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device +// special files, and (when filesystemOptions.specialRegularFiles is in effect) +// regular files. specialFileFD differs from regularFileFD by using per-FD +// handles instead of shared per-dentry handles, and never buffering I/O. type specialFileFD struct { fileDescription @@ -40,13 +41,47 @@ type specialFileFD struct { // file offset is significant, i.e. a regular file. seekable is immutable. seekable bool + // mayBlock is true if this file description represents a file for which + // queue may send I/O readiness events. mayBlock is immutable. + mayBlock bool + queue waiter.Queue + // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex off int64 } +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { + ftype := d.fileType() + seekable := ftype == linux.S_IFREG + mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK + fd := &specialFileFD{ + handle: h, + seekable: seekable, + mayBlock: mayBlock, + } + if mayBlock && h.fd >= 0 { + if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { + return nil, err + } + } + if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + DenyPRead: !seekable, + DenyPWrite: !seekable, + }); err != nil { + if mayBlock && h.fd >= 0 { + fdnotifier.RemoveFD(h.fd) + } + return nil, err + } + return fd, nil +} + // Release implements vfs.FileDescriptionImpl.Release. func (fd *specialFileFD) Release() { + if fd.mayBlock && fd.handle.fd >= 0 { + fdnotifier.RemoveFD(fd.handle.fd) + } fd.handle.close(context.Background()) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() @@ -62,6 +97,32 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { return fd.handle.file.flush(ctx) } +// Readiness implements waiter.Waitable.Readiness. +func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + if fd.mayBlock { + return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) + } + return fd.fileDescription.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + if fd.mayBlock { + fd.queue.EventRegister(e, mask) + return + } + fd.fileDescription.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { + if fd.mayBlock { + fd.queue.EventUnregister(e) + return + } + fd.fileDescription.EventUnregister(e) +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if fd.seekable && offset < 0 { @@ -81,6 +142,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs } buf := make([]byte, dst.NumBytes()) n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } if n == 0 { return 0, err } @@ -130,6 +194,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, err } n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) + if err == syserror.EAGAIN { + err = syserror.ErrWouldBlock + } return int64(n), err } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 98cdd90dd..60e33425f 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -288,6 +288,14 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_SIGALTSTACK: {}, unix.SYS_STATX: {}, syscall.SYS_SYNC_FILE_RANGE: {}, + syscall.SYS_TEE: []seccomp.Rule{ + { + seccomp.AllowAny{}, + seccomp.AllowAny{}, + seccomp.AllowValue(1), /* len */ + seccomp.AllowValue(unix.SPLICE_F_NONBLOCK), /* flags */ + }, + }, syscall.SYS_TGKILL: []seccomp.Rule{ { seccomp.AllowValue(uint64(os.Getpid())), -- cgit v1.2.3 From 49a9b78f74fca28cc9312dfb29ccbe70e3b5fcc3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 1 Jun 2020 18:11:35 -0700 Subject: Fix VFS2 gofer open(O_CREAT) reference leak. gofer.filesystem.createAndOpenChildLocked() doesn't need to take a reference on the new dentry since vfs.FileDescription.Init() will do so. PiperOrigin-RevId: 314242127 --- pkg/sentry/fsimpl/gofer/filesystem.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 0000ca6ba..36e0e1856 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -760,7 +760,7 @@ afterTrailingSymlink: parent.dirMu.Unlock() return nil, syserror.EPERM } - fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) + fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() return fd, err } @@ -912,7 +912,7 @@ retry: // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. // !d.isSynthetic(). -func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { +func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } @@ -965,6 +965,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } return nil, err } + *ds = appendDentry(*ds, child) // Incorporate the fid that was opened by lcreate. useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { @@ -977,10 +978,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } - // Take a reference on the new dentry to be held by the new file - // description. (This reference also means that the new dentry is not - // eligible for caching yet, so we don't need to append to a dentry slice.) - child.refs = 1 // Insert the dentry into the tree. d.cacheNewChildLocked(child, name) if d.cachedMetadataAuthoritative() { -- cgit v1.2.3 From 21b6bc7280f68f43360a008ffd02a4f461ec9fc8 Mon Sep 17 00:00:00 2001 From: Rahat Mahmood Date: Fri, 5 Jun 2020 19:10:28 -0700 Subject: Implement mount(2) and umount2(2) for VFS2. This is mostly syscall plumbing, VFS2 already implements the internals of mounts. In addition to the syscall defintions, the following mount-related mechanisms are updated: - Implement MS_NOATIME for VFS2, but only for tmpfs and goferfs. The other VFS2 filesystems don't implement node-level timestamps yet. - Implement the 'mode', 'uid' and 'gid' mount options for VFS2's tmpfs. - Plumb mount namespace ownership, which is necessary for checking appropriate capabilities during mount(2). Updates #1035 PiperOrigin-RevId: 315035352 --- pkg/sentry/fsimpl/gofer/time.go | 3 + pkg/sentry/fsimpl/tmpfs/tmpfs.go | 41 +++++++- pkg/sentry/kernel/auth/credentials.go | 28 +++++ pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + pkg/sentry/syscalls/linux/vfs2/mount.go | 145 ++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 4 +- pkg/sentry/vfs/genericfstree/genericfstree.go | 3 +- pkg/sentry/vfs/mount.go | 34 +++--- pkg/sentry/vfs/options.go | 4 + pkg/sentry/vfs/vfs.go | 2 +- runsc/boot/vfs.go | 2 +- 11 files changed, 247 insertions(+), 20 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/mount.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 2608e7e1d..1d5aa82dc 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -38,6 +38,9 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { // Preconditions: fs.interop != InteropModeShared. func (d *dentry) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index f0e098702..3777ebdf2 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -30,6 +30,7 @@ package tmpfs import ( "fmt" "math" + "strconv" "strings" "sync/atomic" @@ -124,14 +125,45 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fs.vfsfs.Init(vfsObj, newFSType, &fs) + mopts := vfs.GenericParseMountOptions(opts.Data) + + defaultMode := linux.FileMode(0777) + if modeStr, ok := mopts["mode"]; ok { + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"mode='%v'\" not parsable: %v", modeStr, err) + } + defaultMode = linux.FileMode(mode) + } + + defaultOwnerCreds := creds.Fork() + if uidStr, ok := mopts["uid"]; ok { + uid, err := strconv.ParseInt(uidStr, 10, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"uid='%v'\" not parsable: %v", uidStr, err) + } + if err := defaultOwnerCreds.SetUID(auth.UID(uid)); err != nil { + return nil, nil, fmt.Errorf("Error using mount option \"uid='%v'\": %v", uidStr, err) + } + } + if gidStr, ok := mopts["gid"]; ok { + gid, err := strconv.ParseInt(gidStr, 10, 32) + if err != nil { + return nil, nil, fmt.Errorf("Mount option \"gid='%v'\" not parsable: %v", gidStr, err) + } + if err := defaultOwnerCreds.SetGID(auth.GID(gid)); err != nil { + return nil, nil, fmt.Errorf("Error using mount option \"gid='%v'\": %v", gidStr, err) + } + } + var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newDentry(fs.newRegularFile(creds, 0777)) + root = fs.newDentry(fs.newRegularFile(defaultOwnerCreds, defaultMode)) case linux.S_IFLNK: - root = fs.newDentry(fs.newSymlink(creds, tmpfsOpts.RootSymlinkTarget)) + root = fs.newDentry(fs.newSymlink(defaultOwnerCreds, tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = &fs.newDirectory(creds, 01777).dentry + root = &fs.newDirectory(defaultOwnerCreds, defaultMode).dentry default: fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) @@ -562,6 +594,9 @@ func (i *inode) isDir() bool { } func (i *inode) touchAtime(mnt *vfs.Mount) { + if mnt.Flags.NoATime { + return + } if err := mnt.CheckBeginWrite(); err != nil { return } diff --git a/pkg/sentry/kernel/auth/credentials.go b/pkg/sentry/kernel/auth/credentials.go index e057d2c6d..6862f2ef5 100644 --- a/pkg/sentry/kernel/auth/credentials.go +++ b/pkg/sentry/kernel/auth/credentials.go @@ -232,3 +232,31 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) { } return NoID, syserror.EPERM } + +// SetUID translates the provided uid to the root user namespace and updates c's +// uids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetUID(uid UID) error { + kuid := c.UserNamespace.MapToKUID(uid) + if !kuid.Ok() { + return syserror.EINVAL + } + c.RealKUID = kuid + c.EffectiveKUID = kuid + c.SavedKUID = kuid + return nil +} + +// SetGID translates the provided gid to the root user namespace and updates c's +// gids to it. This performs no permissions or capabilities checks, the caller +// is responsible for ensuring the calling context is permitted to modify c. +func (c *Credentials) SetGID(gid GID) error { + kgid := c.UserNamespace.MapToKGID(gid) + if !kgid.Ok() { + return syserror.EINVAL + } + c.RealKGID = kgid + c.EffectiveKGID = kgid + c.SavedKGID = kgid + return nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 9c8b44f64..c0d005247 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -16,6 +16,7 @@ go_library( "ioctl.go", "memfd.go", "mmap.go", + "mount.go", "path.go", "pipe.go", "poll.go", diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go new file mode 100644 index 000000000..adeaa39cc --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -0,0 +1,145 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +// Mount implements Linux syscall mount(2). +func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + sourceAddr := args[0].Pointer() + targetAddr := args[1].Pointer() + typeAddr := args[2].Pointer() + flags := args[3].Uint64() + dataAddr := args[4].Pointer() + + // For null-terminated strings related to mount(2), Linux copies in at most + // a page worth of data. See fs/namespace.c:copy_mount_string(). + fsType, err := t.CopyInString(typeAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + source, err := t.CopyInString(sourceAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + + targetPath, err := copyInPath(t, targetAddr) + if err != nil { + return 0, nil, err + } + + data := "" + if dataAddr != 0 { + // In Linux, a full page is always copied in regardless of null + // character placement, and the address is passed to each file system. + // Most file systems always treat this data as a string, though, and so + // do all of the ones we implement. + data, err = t.CopyInString(dataAddr, usermem.PageSize) + if err != nil { + return 0, nil, err + } + } + + // Ignore magic value that was required before Linux 2.4. + if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL { + flags = flags &^ linux.MS_MGC_MSK + } + + // Must have CAP_SYS_ADMIN in the current mount namespace's associated user + // namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND | + linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE | + linux.MS_UNBINDABLE | linux.MS_MOVE + + // Silently allow MS_NOSUID, since we don't implement set-id bits + // anyway. + const unsupportedFlags = linux.MS_NODEV | + linux.MS_NODIRATIME | linux.MS_STRICTATIME + + // Linux just allows passing any flags to mount(2) - it won't fail when + // unknown or unsupported flags are passed. Since we don't implement + // everything, we fail explicitly on flags that are unimplemented. + if flags&(unsupportedOps|unsupportedFlags) != 0 { + return 0, nil, syserror.EINVAL + } + + var opts vfs.MountOptions + if flags&linux.MS_NOATIME == linux.MS_NOATIME { + opts.Flags.NoATime = true + } + if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { + opts.Flags.NoExec = true + } + if flags&linux.MS_RDONLY == linux.MS_RDONLY { + opts.ReadOnly = true + } + opts.GetFilesystemOptions.Data = data + + target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer target.Release() + + return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) +} + +// Umount2 implements Linux syscall umount2(2). +func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + addr := args[0].Pointer() + flags := args[1].Int() + + // Must have CAP_SYS_ADMIN in the mount namespace's associated user + // namespace. + // + // Currently, this is always the init task's user namespace. + creds := t.Credentials() + if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { + return 0, nil, syserror.EPERM + } + + const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE + if flags&unsupported != 0 { + return 0, nil, syserror.EINVAL + } + + path, err := copyInPath(t, addr) + if err != nil { + return 0, nil, err + } + tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink) + if err != nil { + return 0, nil, err + } + defer tpop.Release() + + opts := vfs.UmountOptions{ + Flags: uint32(flags), + } + + return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index ef8358b8a..7b6e7571a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -90,8 +90,8 @@ func Override() { s.Table[138] = syscalls.Supported("fstatfs", Fstatfs) s.Table[161] = syscalls.Supported("chroot", Chroot) s.Table[162] = syscalls.Supported("sync", Sync) - delete(s.Table, 165) // mount - delete(s.Table, 166) // umount2 + s.Table[165] = syscalls.Supported("mount", Mount) + s.Table[166] = syscalls.Supported("umount2", Umount2) delete(s.Table, 187) // readahead s.Table[188] = syscalls.Supported("setxattr", Setxattr) s.Table[189] = syscalls.Supported("lsetxattr", Lsetxattr) diff --git a/pkg/sentry/vfs/genericfstree/genericfstree.go b/pkg/sentry/vfs/genericfstree/genericfstree.go index 286510195..8882fa84a 100644 --- a/pkg/sentry/vfs/genericfstree/genericfstree.go +++ b/pkg/sentry/vfs/genericfstree/genericfstree.go @@ -43,7 +43,7 @@ type Dentry struct { // IsAncestorDentry returns true if d is an ancestor of d2; that is, d is // either d2's parent or an ancestor of d2's parent. func IsAncestorDentry(d, d2 *Dentry) bool { - for { + for d2 != nil { // Stop at root, where d2.parent == nil. if d2.parent == d { return true } @@ -52,6 +52,7 @@ func IsAncestorDentry(d, d2 *Dentry) bool { } d2 = d2.parent } + return false } // ParentOrSelf returns d.parent. If d.parent is nil, ParentOrSelf returns d. diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 3adb7c97d..32f901bd8 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -55,6 +55,10 @@ type Mount struct { // ID is the immutable mount ID. ID uint64 + // Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except + // for MS_RDONLY which is tracked in "writers". Immutable. + Flags MountFlags + // key is protected by VirtualFilesystem.mountMu and // VirtualFilesystem.mounts.seq, and may be nil. References are held on // key.parent and key.point if they are not nil. @@ -81,10 +85,6 @@ type Mount struct { // umounted is true. umounted is protected by VirtualFilesystem.mountMu. umounted bool - // flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except - // for MS_RDONLY which is tracked in "writers". - flags MountFlags - // The lower 63 bits of writers is the number of calls to // Mount.CheckBeginWrite() that have not yet been paired with a call to // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. @@ -95,10 +95,10 @@ type Mount struct { func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount { mnt := &Mount{ ID: atomic.AddUint64(&vfs.lastMountID, 1), + Flags: opts.Flags, vfs: vfs, fs: fs, root: root, - flags: opts.Flags, ns: mntns, refs: 1, } @@ -113,13 +113,12 @@ func (mnt *Mount) Options() MountOptions { mnt.vfs.mountMu.Lock() defer mnt.vfs.mountMu.Unlock() return MountOptions{ - Flags: mnt.flags, + Flags: mnt.Flags, ReadOnly: mnt.readOnly(), } } -// A MountNamespace is a collection of Mounts. -// +// A MountNamespace is a collection of Mounts.// // MountNamespaces are reference-counted. Unless otherwise specified, all // MountNamespace methods require that a reference is held. // @@ -127,6 +126,9 @@ func (mnt *Mount) Options() MountOptions { // // +stateify savable type MountNamespace struct { + // Owner is the usernamespace that owns this mount namespace. + Owner *auth.UserNamespace + // root is the MountNamespace's root mount. root is immutable. root *Mount @@ -163,6 +165,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth return nil, err } mntns := &MountNamespace{ + Owner: creds.UserNamespace, refs: 1, mountpoints: make(map[*Dentry]uint32), } @@ -279,6 +282,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti } // MNT_FORCE is currently unimplemented except for the permission check. + // Force unmounting specifically requires CAP_SYS_ADMIN in the root user + // namespace, and not in the owner user namespace for the target mount. See + // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { return syserror.EPERM } @@ -753,7 +759,10 @@ func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDi if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } @@ -838,11 +847,12 @@ func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRoo if mnt.readOnly() { opts = "ro" } - if mnt.flags.NoExec { + if mnt.Flags.NoATime { + opts = ",noatime" + } + if mnt.Flags.NoExec { opts += ",noexec" } - // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is - // set. fmt.Fprintf(buf, "%s ", opts) // (7) Optional fields: zero or more fields of the form "tag[:value]". diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 53d364c5c..f223aeda8 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -75,6 +75,10 @@ type MknodOptions struct { type MountFlags struct { // NoExec is equivalent to MS_NOEXEC. NoExec bool + + // NoATime is equivalent to MS_NOATIME and indicates that the + // filesystem should not update access time in-place. + NoATime bool } // MountOptions contains options to VirtualFilesystem.MountAt(). diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 52643a7c5..9acca8bc7 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -405,7 +405,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential vfs.putResolvingPath(rp) if opts.FileExec { - if fd.Mount().flags.NoExec { + if fd.Mount().Flags.NoExec { fd.DecRef() return nil, syserror.EACCES } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 6c84f0794..7ed6801b4 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -272,7 +272,7 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case "ro": opts.ReadOnly = true case "noatime": - // TODO(gvisor.dev/issue/1193): Implement MS_NOATIME. + opts.Flags.NoATime = true case "noexec": opts.Flags.NoExec = true default: -- cgit v1.2.3 From dc029b4b96e92719b2850e9d5556f68837737997 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 8 Jun 2020 13:27:06 -0700 Subject: Implement VFS2 tmpfs mount options. As in VFS1, the mode, uid, and gid options are supported. Updates #1197 PiperOrigin-RevId: 315340510 --- pkg/sentry/fsimpl/tmpfs/device_file.go | 4 +- pkg/sentry/fsimpl/tmpfs/directory.go | 4 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 20 ++++--- pkg/sentry/fsimpl/tmpfs/named_pipe.go | 4 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 4 +- pkg/sentry/fsimpl/tmpfs/socket_file.go | 4 +- pkg/sentry/fsimpl/tmpfs/symlink.go | 4 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 94 ++++++++++++++++++++------------- 8 files changed, 81 insertions(+), 57 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go index 83bf885ee..ac54d420d 100644 --- a/pkg/sentry/fsimpl/tmpfs/device_file.go +++ b/pkg/sentry/fsimpl/tmpfs/device_file.go @@ -29,7 +29,7 @@ type deviceFile struct { minor uint32 } -func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { +func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode { file := &deviceFile{ kind: kind, major: major, @@ -43,7 +43,7 @@ func (fs *filesystem) newDeviceFile(creds *auth.Credentials, mode linux.FileMode default: panic(fmt.Sprintf("invalid DeviceKind: %v", kind)) } - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, kuid, kgid, mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 70387cb9c..913b8a6c5 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -48,9 +48,9 @@ type directory struct { childList dentryList } -func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *directory { +func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory { dir := &directory{} - dir.inode.init(dir, fs, creds, linux.S_IFDIR|mode) + dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode) dir.inode.nlink = 2 // from "." and parent directory or ".." for root dir.dentry.inode = &dir.inode dir.dentry.vfsd.Init(&dir.dentry) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 183eb975c..e801680e8 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -256,11 +256,12 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { return syserror.EMLINK } parentDir.inode.incLinksLocked() // from child's ".." - childDir := fs.newDirectory(rp.Credentials(), opts.Mode) + childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) parentDir.insertChildLocked(&childDir.dentry, name) return nil }) @@ -269,18 +270,19 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { case 0, linux.S_IFREG: - childInode = fs.newRegularFile(rp.Credentials(), opts.Mode) + childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) case linux.S_IFIFO: - childInode = fs.newNamedPipe(rp.Credentials(), opts.Mode) + childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) case linux.S_IFBLK: - childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFCHR: - childInode = fs.newDeviceFile(rp.Credentials(), opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) + childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor) case linux.S_IFSOCK: - childInode = fs.newSocketFile(rp.Credentials(), opts.Mode, opts.Endpoint) + childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint) default: return syserror.EINVAL } @@ -355,7 +357,8 @@ afterTrailingSymlink: } defer rp.Mount().EndWrite() // Create and open the child. - child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode)) + creds := rp.Credentials() + child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)) parentDir.insertChildLocked(child, name) fd, err := child.open(ctx, rp, &opts, true) if err != nil { @@ -676,7 +679,8 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { - child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + creds := rp.Credentials() + child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) parentDir.insertChildLocked(child, name) return nil }) diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go index 8d77b3fa8..739350cf0 100644 --- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go +++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go @@ -30,9 +30,9 @@ type namedPipe struct { // Preconditions: // * fs.mu must be locked. // * rp.Mount().CheckBeginWrite() has been called successfully. -func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)} - file.inode.init(file, fs, creds, linux.S_IFIFO|mode) + file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode) file.inode.nlink = 1 // Only the parent has a link. return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index fee174375..4f2ae04d2 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -85,12 +85,12 @@ type regularFile struct { size uint64 } -func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode { +func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode { file := ®ularFile{ memFile: fs.memFile, seals: linux.F_SEAL_SEAL, } - file.inode.init(file, fs, creds, linux.S_IFREG|mode) + file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go index 25c2321af..3ed650474 100644 --- a/pkg/sentry/fsimpl/tmpfs/socket_file.go +++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go @@ -26,9 +26,9 @@ type socketFile struct { ep transport.BoundEndpoint } -func (fs *filesystem) newSocketFile(creds *auth.Credentials, mode linux.FileMode, ep transport.BoundEndpoint) *inode { +func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode { file := &socketFile{ep: ep} - file.inode.init(file, fs, creds, mode) + file.inode.init(file, fs, kuid, kgid, mode) file.inode.nlink = 1 // from parent directory return &file.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go index 47e075ed4..b0de5fabe 100644 --- a/pkg/sentry/fsimpl/tmpfs/symlink.go +++ b/pkg/sentry/fsimpl/tmpfs/symlink.go @@ -24,11 +24,11 @@ type symlink struct { target string // immutable } -func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode { +func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode { link := &symlink{ target: target, } - link.inode.init(link, fs, creds, linux.S_IFLNK|0777) + link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode) link.inode.nlink = 1 // from parent directory return &link.inode } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 3777ebdf2..7ce1b86c7 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -113,57 +113,78 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } } - devMinor, err := vfsObj.GetAnonBlockDevMinor() - if err != nil { - return nil, nil, err - } - clock := time.RealtimeClockFromContext(ctx) - fs := filesystem{ - memFile: memFileProvider.MemoryFile(), - clock: clock, - devMinor: devMinor, - } - fs.vfsfs.Init(vfsObj, newFSType, &fs) - mopts := vfs.GenericParseMountOptions(opts.Data) - - defaultMode := linux.FileMode(0777) - if modeStr, ok := mopts["mode"]; ok { + rootMode := linux.FileMode(0777) + if rootFileType == linux.S_IFDIR { + rootMode = 01777 + } + modeStr, ok := mopts["mode"] + if ok { + delete(mopts, "mode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { - return nil, nil, fmt.Errorf("Mount option \"mode='%v'\" not parsable: %v", modeStr, err) + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) + return nil, nil, syserror.EINVAL } - defaultMode = linux.FileMode(mode) + rootMode = linux.FileMode(mode & 07777) } - - defaultOwnerCreds := creds.Fork() - if uidStr, ok := mopts["uid"]; ok { - uid, err := strconv.ParseInt(uidStr, 10, 32) + rootKUID := creds.EffectiveKUID + uidStr, ok := mopts["uid"] + if ok { + delete(mopts, "uid") + uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { - return nil, nil, fmt.Errorf("Mount option \"uid='%v'\" not parsable: %v", uidStr, err) + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) + return nil, nil, syserror.EINVAL } - if err := defaultOwnerCreds.SetUID(auth.UID(uid)); err != nil { - return nil, nil, fmt.Errorf("Error using mount option \"uid='%v'\": %v", uidStr, err) + kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) + if !kuid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) + return nil, nil, syserror.EINVAL } + rootKUID = kuid } - if gidStr, ok := mopts["gid"]; ok { - gid, err := strconv.ParseInt(gidStr, 10, 32) + rootKGID := creds.EffectiveKGID + gidStr, ok := mopts["gid"] + if ok { + delete(mopts, "gid") + gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { - return nil, nil, fmt.Errorf("Mount option \"gid='%v'\" not parsable: %v", gidStr, err) + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) + return nil, nil, syserror.EINVAL } - if err := defaultOwnerCreds.SetGID(auth.GID(gid)); err != nil { - return nil, nil, fmt.Errorf("Error using mount option \"gid='%v'\": %v", gidStr, err) + kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) + if !kgid.Ok() { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) + return nil, nil, syserror.EINVAL } + rootKGID = kgid + } + if len(mopts) != 0 { + ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err } + clock := time.RealtimeClockFromContext(ctx) + fs := filesystem{ + memFile: memFileProvider.MemoryFile(), + clock: clock, + devMinor: devMinor, + } + fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *dentry switch rootFileType { case linux.S_IFREG: - root = fs.newDentry(fs.newRegularFile(defaultOwnerCreds, defaultMode)) + root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode)) case linux.S_IFLNK: - root = fs.newDentry(fs.newSymlink(defaultOwnerCreds, tmpfsOpts.RootSymlinkTarget)) + root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget)) case linux.S_IFDIR: - root = &fs.newDirectory(defaultOwnerCreds, defaultMode).dentry + root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry default: fs.vfsfs.DecRef() return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) @@ -301,15 +322,15 @@ type inode struct { const maxLinks = math.MaxUint32 -func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { +func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) { if mode.FileType() == 0 { panic("file type is required in FileMode") } i.fs = fs i.refs = 1 i.mode = uint32(mode) - i.uid = uint32(creds.EffectiveKUID) - i.gid = uint32(creds.EffectiveKGID) + i.uid = uint32(kuid) + i.gid = uint32(kgid) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. now := fs.clock.Now().Nanoseconds() @@ -766,8 +787,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with // S_IRWXUGO. - mode := linux.FileMode(0777) - inode := fs.newRegularFile(creds, mode) + inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777) rf := inode.impl.(*regularFile) if allowSeals { rf.seals = 0 -- cgit v1.2.3 From 67565078bbcdd8f797206d996605df8f6658d00a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 9 Jun 2020 18:44:57 -0700 Subject: Implement flock(2) in VFS2 LockFD is the generic implementation that can be embedded in FileDescriptionImpl implementations. Unique lock ID is maintained in vfs.FileDescription and is created on demand. Updates #1480 PiperOrigin-RevId: 315604825 --- pkg/sentry/devices/memdev/full.go | 1 + pkg/sentry/devices/memdev/null.go | 1 + pkg/sentry/devices/memdev/random.go | 1 + pkg/sentry/devices/memdev/zero.go | 1 + pkg/sentry/fs/file.go | 2 +- pkg/sentry/fs/lock/lock.go | 41 +++----- pkg/sentry/fs/lock/lock_set_functions.go | 8 +- pkg/sentry/fs/lock/lock_test.go | 111 +++++++++++----------- pkg/sentry/fsimpl/devpts/BUILD | 1 + pkg/sentry/fsimpl/devpts/devpts.go | 5 +- pkg/sentry/fsimpl/devpts/master.go | 5 + pkg/sentry/fsimpl/devpts/slave.go | 5 + pkg/sentry/fsimpl/eventfd/eventfd.go | 1 + pkg/sentry/fsimpl/ext/BUILD | 1 + pkg/sentry/fsimpl/ext/file_description.go | 1 + pkg/sentry/fsimpl/ext/inode.go | 6 ++ pkg/sentry/fsimpl/ext/regular_file.go | 1 + pkg/sentry/fsimpl/gofer/BUILD | 2 + pkg/sentry/fsimpl/gofer/filesystem.go | 9 +- pkg/sentry/fsimpl/gofer/gofer.go | 23 +++++ pkg/sentry/fsimpl/gofer/special_file.go | 4 +- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/host.go | 8 +- pkg/sentry/fsimpl/kernfs/BUILD | 2 + pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 10 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 9 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 5 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 13 ++- pkg/sentry/fsimpl/pipefs/BUILD | 1 + pkg/sentry/fsimpl/pipefs/pipefs.go | 6 +- pkg/sentry/fsimpl/proc/BUILD | 1 + pkg/sentry/fsimpl/proc/subtasks.go | 5 +- pkg/sentry/fsimpl/proc/task.go | 5 +- pkg/sentry/fsimpl/proc/task_fds.go | 7 +- pkg/sentry/fsimpl/proc/task_files.go | 10 +- pkg/sentry/fsimpl/proc/tasks.go | 5 +- pkg/sentry/fsimpl/signalfd/signalfd.go | 1 + pkg/sentry/fsimpl/sys/BUILD | 1 + pkg/sentry/fsimpl/sys/sys.go | 7 +- pkg/sentry/fsimpl/timerfd/timerfd.go | 1 + pkg/sentry/fsimpl/tmpfs/filesystem.go | 6 +- pkg/sentry/fsimpl/tmpfs/regular_file.go | 23 ----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 41 +------- pkg/sentry/kernel/fd_table.go | 15 +-- pkg/sentry/kernel/kernel.go | 5 - pkg/sentry/kernel/pipe/BUILD | 1 + pkg/sentry/kernel/pipe/vfs.go | 13 ++- pkg/sentry/socket/hostinet/BUILD | 1 + pkg/sentry/socket/hostinet/socket_vfs2.go | 3 + pkg/sentry/socket/netlink/BUILD | 1 + pkg/sentry/socket/netlink/socket_vfs2.go | 8 +- pkg/sentry/socket/netstack/BUILD | 1 + pkg/sentry/socket/netstack/netstack_vfs2.go | 3 + pkg/sentry/socket/unix/BUILD | 1 + pkg/sentry/socket/unix/unix_vfs2.go | 7 +- pkg/sentry/syscalls/linux/sys_file.go | 39 ++------ pkg/sentry/syscalls/linux/vfs2/BUILD | 2 + pkg/sentry/syscalls/linux/vfs2/lock.go | 64 +++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- pkg/sentry/vfs/BUILD | 1 + pkg/sentry/vfs/epoll.go | 1 + pkg/sentry/vfs/file_description.go | 25 ++++- pkg/sentry/vfs/file_description_impl_util.go | 80 ++++++++++++---- pkg/sentry/vfs/file_description_impl_util_test.go | 1 + pkg/sentry/vfs/inotify.go | 1 + test/syscalls/linux/BUILD | 3 + test/syscalls/linux/flock.cc | 75 ++++++++++++--- 67 files changed, 470 insertions(+), 281 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/lock.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index c7e197691..af66fe4dc 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -42,6 +42,7 @@ type fullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 33d060d02..92d3d71be 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -43,6 +43,7 @@ type nullFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index acfa23149..6b81da5ef 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -48,6 +48,7 @@ type randomFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // off is the "file offset". off is accessed using atomic memory // operations. diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index 3b1372b9e..c6f15054d 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -44,6 +44,7 @@ type zeroFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD } // Release implements vfs.FileDescriptionImpl.Release. diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index 2a278fbe3..ca41520b4 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -146,7 +146,7 @@ func (f *File) DecRef() { f.DecRefWithDestructor(func() { // Drop BSD style locks. lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} - f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng) + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. f.FileOperations.Release() diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go index 926538d90..8a5d9c7eb 100644 --- a/pkg/sentry/fs/lock/lock.go +++ b/pkg/sentry/fs/lock/lock.go @@ -62,7 +62,7 @@ import ( type LockType int // UniqueID is a unique identifier of the holder of a regional file lock. -type UniqueID uint64 +type UniqueID interface{} const ( // ReadLock describes a POSIX regional file lock to be taken @@ -98,12 +98,7 @@ type Lock struct { // If len(Readers) > 0 then HasWriter must be false. Readers map[UniqueID]bool - // HasWriter indicates that this is a write lock held by a single - // UniqueID. - HasWriter bool - - // Writer is only valid if HasWriter is true. It identifies a - // single write lock holder. + // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID } @@ -186,7 +181,6 @@ func makeLock(uid UniqueID, t LockType) Lock { case ReadLock: value.Readers[uid] = true case WriteLock: - value.HasWriter = true value.Writer = uid default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) @@ -196,10 +190,7 @@ func makeLock(uid UniqueID, t LockType) Lock { // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { - if l.HasWriter && l.Writer == uid { - return true - } - return l.Readers[uid] + return l.Writer == uid || l.Readers[uid] } // lock sets uid as a holder of a typed lock on Lock. @@ -214,20 +205,20 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // We cannot downgrade a write lock to a read lock unless the // uid is the same. - if l.HasWriter { + if l.Writer != nil { if l.Writer != uid { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. l.Readers = make(map[UniqueID]bool) // Ensure that there is no longer a writer. - l.HasWriter = false + l.Writer = nil } l.Readers[uid] = true return case WriteLock: // If we are already the writer, then this is a no-op. - if l.HasWriter && l.Writer == uid { + if l.Writer == uid { return } // We can only upgrade a read lock to a write lock if there @@ -243,7 +234,6 @@ func (l *Lock) lock(uid UniqueID, t LockType) { } // Ensure that there is only a writer. l.Readers = make(map[UniqueID]bool) - l.HasWriter = true l.Writer = uid default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) @@ -277,9 +267,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { switch t { case ReadLock: return l.lockable(r, func(value Lock) bool { - // If there is no writer, there's no problem adding - // another reader. - if !value.HasWriter { + // If there is no writer, there's no problem adding another reader. + if value.Writer == nil { return true } // If there is a writer, then it must be the same uid @@ -289,10 +278,9 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { case WriteLock: return l.lockable(r, func(value Lock) bool { // If there are only readers. - if !value.HasWriter { - // Then this uid can only take a write lock if - // this is a private upgrade, meaning that the - // only reader is uid. + if value.Writer == nil { + // Then this uid can only take a write lock if this is a private + // upgrade, meaning that the only reader is uid. return len(value.Readers) == 1 && value.Readers[uid] } // If the uid is already a writer on this region, then @@ -304,7 +292,8 @@ func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { } } -// lock returns true if uid took a lock of type t on the entire range of LockRange. +// lock returns true if uid took a lock of type t on the entire range of +// LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { @@ -339,7 +328,7 @@ func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { seg, _ = l.SplitUnchecked(seg, r.End) } - // Set the lock on the segment. This is guaranteed to + // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() value.lock(uid, t) @@ -386,7 +375,7 @@ func (l *LockSet) unlock(uid UniqueID, r LockRange) { value := seg.Value() var remove bool - if value.HasWriter && value.Writer == uid { + if value.Writer == uid { // If we are unlocking a writer, then since there can // only ever be one writer and no readers, then this // lock should always be removed from the set. diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go index 8a3ace0c1..50a16e662 100644 --- a/pkg/sentry/fs/lock/lock_set_functions.go +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -44,14 +44,9 @@ func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) return Lock{}, false } } - if val1.HasWriter != val2.HasWriter { + if val1.Writer != val2.Writer { return Lock{}, false } - if val1.HasWriter { - if val1.Writer != val2.Writer { - return Lock{}, false - } - } return val1, true } @@ -62,7 +57,6 @@ func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) for k, v := range val.Readers { val0.Readers[k] = v } - val0.HasWriter = val.HasWriter val0.Writer = val.Writer return val, val0 diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go index ba002aeb7..fad90984b 100644 --- a/pkg/sentry/fs/lock/lock_test.go +++ b/pkg/sentry/fs/lock/lock_test.go @@ -42,9 +42,6 @@ func equals(e0, e1 []entry) bool { if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) { return false } - if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter { - return false - } if e0[i].Lock.Writer != e1[i].Lock.Writer { return false } @@ -105,7 +102,7 @@ func TestCanLock(t *testing.T) { LockRange: LockRange{2048, 3072}, }, { - Lock: Lock{HasWriter: true, Writer: 1}, + Lock: Lock{Writer: 1}, LockRange: LockRange{3072, 4096}, }, }) @@ -241,7 +238,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -254,7 +251,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -273,7 +270,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 4096}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -301,7 +298,7 @@ func TestSetLock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, { @@ -318,7 +315,7 @@ func TestSetLock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -550,7 +547,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -594,7 +591,7 @@ func TestSetLock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 3072}, }, { @@ -633,7 +630,7 @@ func TestSetLock(t *testing.T) { // 0 1024 2048 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -663,11 +660,11 @@ func TestSetLock(t *testing.T) { // 0 1024 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, LockEOF}, }, }, @@ -675,28 +672,30 @@ func TestSetLock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - success := l.lock(test.uid, test.lockType, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } + r := LockRange{Start: test.start, End: test.end} + success := l.lock(test.uid, test.lockType, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } - if success != test.success { - t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success) - continue - } + if success != test.success { + t.Errorf("setlock(%v, %+v, %d, %d) got success %v, want %v", test.before, r, test.uid, test.lockType, success, test.success) + return + } - if success { - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + if success { + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } } - } + }) } } @@ -782,7 +781,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -824,7 +823,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -837,7 +836,7 @@ func TestUnlock(t *testing.T) { // 0 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{4096, LockEOF}, }, }, @@ -876,7 +875,7 @@ func TestUnlock(t *testing.T) { // 0 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, LockEOF}, }, }, @@ -889,7 +888,7 @@ func TestUnlock(t *testing.T) { // 0 4096 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 4096}, }, }, @@ -906,7 +905,7 @@ func TestUnlock(t *testing.T) { LockRange: LockRange{0, 1024}, }, { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{1024, 4096}, }, { @@ -974,7 +973,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -991,7 +990,7 @@ func TestUnlock(t *testing.T) { // 0 8 4096 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 8}, }, { @@ -1008,7 +1007,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 max uint64 before: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1025,7 +1024,7 @@ func TestUnlock(t *testing.T) { // 0 1024 4096 8192 max uint64 after: []entry{ { - Lock: Lock{HasWriter: true, Writer: 0}, + Lock: Lock{Writer: 0}, LockRange: LockRange{0, 1024}, }, { @@ -1041,19 +1040,21 @@ func TestUnlock(t *testing.T) { } for _, test := range tests { - l := fill(test.before) + t.Run(test.name, func(t *testing.T) { + l := fill(test.before) - r := LockRange{Start: test.start, End: test.end} - l.unlock(test.uid, r) - var got []entry - for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - got = append(got, entry{ - Lock: seg.Value(), - LockRange: seg.Range(), - }) - } - if !equals(got, test.after) { - t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) - } + r := LockRange{Start: test.start, End: test.end} + l.unlock(test.uid, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + if !equals(got, test.after) { + t.Errorf("got set %+v, want %+v", got, test.after) + } + }) } } diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index 585764223..cf440dce8 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -23,6 +23,7 @@ go_library( "//pkg/sentry/kernel/auth", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index c03c65445..9b0e0cca2 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -28,6 +28,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -116,6 +117,8 @@ type rootInode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren + locks lock.FileLocks + // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -183,7 +186,7 @@ func (i *rootInode) masterClose(t *Terminal) { // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 7a7ce5d81..1d22adbe3 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -34,6 +35,8 @@ type masterInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -55,6 +58,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf inode: mi, t: t, } + fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { mi.DecRef() return nil, err @@ -85,6 +89,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD inode *masterInode t *Terminal diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 526cd406c..7fe475080 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -33,6 +34,8 @@ type slaveInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -51,6 +54,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs fd := &slaveFileDescription{ inode: si, } + fd.LockFD.Init(&si.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { si.DecRef() return nil, err @@ -91,6 +95,7 @@ func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds type slaveFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD inode *slaveInode } diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index c573d7935..d12d78b84 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -37,6 +37,7 @@ type EventFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // queue is used to notify interested parties when the event object // becomes readable or writable. diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index ff861d0fe..973fa0def 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -60,6 +60,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go index 92f7da40d..90b086468 100644 --- a/pkg/sentry/fsimpl/ext/file_description.go +++ b/pkg/sentry/fsimpl/ext/file_description.go @@ -26,6 +26,7 @@ import ( type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 485f86f4b..e4b434b13 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -54,6 +55,8 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode + locks lock.FileLocks + // This is immutable. The first field of the implementations must have inode // as the first field to ensure temporality. impl interface{} @@ -157,6 +160,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt switch in.impl.(type) { case *regularFile: var fd regularFileFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -168,6 +172,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.EISDIR } var fd directoryFD + fd.LockFD.Init(&in.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -178,6 +183,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOpt return nil, syserror.ELOOP } var fd symlinkFD + fd.LockFD.Init(&in.locks) fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}) return &fd.vfsfd, nil default: diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 30135ddb0..f7015c44f 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -77,6 +77,7 @@ func (in *inode) isRegular() bool { // vfs.FileDescriptionImpl. type regularFileFD struct { fileDescription + vfs.LockFD // off is the file offset. off is accessed using atomic memory operations. off int64 diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index f5f35a3bc..5cdeeaeb5 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -54,6 +54,7 @@ go_library( "//pkg/p9", "//pkg/safemem", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/host", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", @@ -68,6 +69,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 36e0e1856..40933b74b 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -801,6 +801,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return nil, err } fd := ®ularFileFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { @@ -826,6 +827,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } } fd := &directoryFD{} + fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -842,7 +844,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } case linux.S_IFIFO: if d.isSynthetic() { - return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags) + return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) } } return d.openSpecialFileLocked(ctx, mnt, opts) @@ -902,7 +904,7 @@ retry: return nil, err } } - fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err @@ -989,6 +991,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving var childVFSFD *vfs.FileDescription if useRegularFileFD { fd := ®ularFileFD{} + fd.LockFD.Init(&child.locks) if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { @@ -1003,7 +1006,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving if fdobj != nil { h.fd = int32(fdobj.Release()) } - fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) + fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags) if err != nil { h.close(ctx) return nil, err diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 3f3bd56f0..0d88a328e 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -45,6 +45,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" @@ -52,6 +53,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" @@ -662,6 +664,8 @@ type dentry struct { // If this dentry represents a synthetic named pipe, pipe is the pipe // endpoint bound to this file. pipe *pipe.VFSPipe + + locks lock.FileLocks } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -1366,6 +1370,9 @@ func (d *dentry) decLinks() { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD + + lockLogging sync.Once } func (fd *fileDescription) filesystem() *filesystem { @@ -1416,3 +1423,19 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) } + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("File lock using gofer file handled internally.") + }) + return fd.LockFD.LockBSD(ctx, uid, t, block) +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + fd.lockLogging.Do(func() { + log.Infof("Range lock using gofer file handled internally.") + }) + return fd.LockFD.LockPOSIX(ctx, uid, t, rng, block) +} diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index ff6126b87..289efdd25 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -51,7 +52,7 @@ type specialFileFD struct { off int64 } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *lock.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK @@ -60,6 +61,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, flags uint32) (*speci seekable: seekable, mayBlock: mayBlock, } + fd.LockFD.Init(locks) if mayBlock && h.fd >= 0 { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index ca0fe6d2b..54f16ad63 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -39,6 +39,7 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 18b127521..5ec5100b8 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -34,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/memmap" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -182,6 +183,8 @@ type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink + locks lock.FileLocks + // When the reference count reaches zero, the host fd is closed. refs.AtomicRefCount @@ -468,7 +471,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return nil, err } // Currently, we only allow Unix sockets to be imported. - return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d) + return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) } // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that @@ -478,6 +481,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fileDescription: fileDescription{inode: i}, termios: linux.DefaultSlaveTermios, } + fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err @@ -486,6 +490,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u } fd := &fileDescription{inode: i} + fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err @@ -497,6 +502,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but // cached to reduce indirections and casting. fileDescription does not hold diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index ef34cb28a..0299dbde9 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -49,6 +49,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -67,6 +68,7 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 1568a9d49..6418de0a3 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -38,7 +39,8 @@ type DynamicBytesFile struct { InodeNotDirectory InodeNotSymlink - data vfs.DynamicBytesSource + locks lock.FileLocks + data vfs.DynamicBytesSource } var _ Inode = (*DynamicBytesFile)(nil) @@ -55,7 +57,7 @@ func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint // Open implements Inode.Open. func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} - if err := fd.Init(rp.Mount(), vfsd, f.data, opts.Flags); err != nil { + if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil @@ -77,13 +79,15 @@ func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credent type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl + vfs.LockFD vfsfd vfs.FileDescription inode Inode } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *lock.FileLocks, flags uint32) error { + fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 8284e76a7..33a5968ca 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -42,6 +43,7 @@ import ( type GenericDirectoryFD struct { vfs.FileDescriptionDefaultImpl vfs.DirectoryFileDescriptionDefaultImpl + vfs.LockFD vfsfd vfs.FileDescription children *OrderedChildren @@ -55,9 +57,9 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} - if err := fd.Init(children, opts); err != nil { + if err := fd.Init(children, locks, opts); err != nil { return nil, err } if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { @@ -69,11 +71,12 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR } + fd.LockFD.Init(locks) fd.children = children return nil } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 982daa2e6..0e4927215 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -555,6 +556,8 @@ type StaticDirectory struct { InodeAttrs InodeNoDynamicLookup OrderedChildren + + locks lock.FileLocks } var _ Inode = (*StaticDirectory)(nil) @@ -584,7 +587,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3 // Open implements kernfs.Inode. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &opts) + fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 412cf6ac9..6749facf7 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -100,8 +101,10 @@ type readonlyDir struct { kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup kernfs.InodeDirectoryNoNewChildren - kernfs.OrderedChildren + + locks lock.FileLocks + dentry kernfs.Dentry } @@ -117,7 +120,7 @@ func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMod } func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } @@ -128,10 +131,12 @@ type dir struct { attrs kernfs.InodeNotSymlink kernfs.InodeNoDynamicLookup + kernfs.OrderedChildren + + locks lock.FileLocks fs *filesystem dentry kernfs.Dentry - kernfs.OrderedChildren } func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry { @@ -147,7 +152,7 @@ func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, conte } func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index 5950a2d59..c618dbe6c 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index cab771211..e4dabaa33 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -81,7 +82,8 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeNoopRefCount - pipe *pipe.VFSPipe + locks lock.FileLocks + pipe *pipe.VFSPipe ino uint64 uid auth.KUID @@ -147,7 +149,7 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth. // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags) + return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks) } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 17c1342b5..351ba4ee9 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -35,6 +35,7 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 36a911db4..e2cdb7ee9 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -37,6 +38,8 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + locks lock.FileLocks + fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace @@ -153,7 +156,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro // Open implements kernfs.Inode. func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} - if err := fd.Init(&i.OrderedChildren, &opts); err != nil { + if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil { return nil, err } if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 482055db1..44078a765 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -38,6 +39,8 @@ type taskInode struct { kernfs.InodeAttrs kernfs.OrderedChildren + locks lock.FileLocks + task *kernel.Task } @@ -103,7 +106,7 @@ func (i *taskInode) Valid(ctx context.Context) bool { // Open implements kernfs.Inode. func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 44ccc9e4a..ef6c1d04f 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -53,6 +54,8 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { + locks lock.FileLocks + fs *filesystem task *kernel.Task @@ -143,7 +146,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro // Open implements kernfs.Inode. func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } @@ -270,7 +273,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, // Open implements kernfs.Inode. func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 2f297e48a..e5eaa91cd 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -775,6 +776,8 @@ type namespaceInode struct { kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink + + locks lock.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) @@ -791,6 +794,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32 func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &namespaceFD{inode: i} i.IncRef() + fd.LockFD.Init(&i.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -801,6 +805,7 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd * // /proc/[pid]/ns/*. type namespaceFD struct { vfs.FileDescriptionDefaultImpl + vfs.LockFD vfsfd vfs.FileDescription inode *namespaceInode @@ -825,8 +830,3 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release() { fd.inode.DecRef() } - -// OnClose implements FileDescriptionImpl. -func (*namespaceFD) OnClose(context.Context) error { - return nil -} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index b51d43954..58c8b9d05 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -43,6 +44,8 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid + locks lock.FileLocks + fs *filesystem pidns *kernel.PIDNamespace @@ -197,7 +200,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback // Open implements kernfs.Inode. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index d29ef3f83..242ba9b5d 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -31,6 +31,7 @@ type SignalFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD // target is the original signal target task. // diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index a741e2bb6..237f17def 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -15,6 +15,7 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 0af373604..b84463d3a 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -98,8 +99,10 @@ type dir struct { kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren - kernfs.OrderedChildren + + locks lock.FileLocks + dentry kernfs.Dentry } @@ -121,7 +124,7 @@ func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.Set // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &opts) + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts) if err != nil { return nil, err } diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 60c92d626..2dc90d484 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -32,6 +32,7 @@ type TimerFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD events waiter.Queue timer *ktime.Timer diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index e801680e8..72399b321 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -399,6 +399,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open switch impl := d.inode.impl.(type) { case *regularFile: var fd regularFileFD + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } @@ -414,15 +415,16 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open return nil, syserror.EISDIR } var fd directoryFD + fd.LockFD.Init(&d.inode.locks) if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil case *symlink: - // Can't open symlinks without O_PATH (which is unimplemented). + // TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH. return nil, syserror.ELOOP case *namedPipe: - return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags) + return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) case *deviceFile: return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) case *socketFile: diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 4f2ae04d2..77447b32c 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -366,28 +365,6 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { return nil } -// LockBSD implements vfs.FileDescriptionImpl.LockBSD. -func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return fd.inode().lockBSD(uid, t, block) -} - -// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. -func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - fd.inode().unlockBSD(uid) - return nil -} - -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return fd.inode().lockPOSIX(uid, t, rng, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - fd.inode().unlockPOSIX(uid, rng) - return nil -} - // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 7ce1b86c7..71a7522af 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,7 +36,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -311,7 +310,6 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - // Advisory file locks, which lock at the inode level. locks lock.FileLocks // Inotify watches for this inode. @@ -539,44 +537,6 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return nil } -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockBSD(uid, t, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) unlockBSD(uid fslock.UniqueID) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockBSD(uid) - return nil - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - switch i.impl.(type) { - case *regularFile: - return i.locks.LockPOSIX(uid, t, rng, block) - } - return syserror.EBADF -} - -// TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. -func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { - switch i.impl.(type) { - case *regularFile: - i.locks.UnlockPOSIX(uid, rng) - return nil - } - return syserror.EBADF -} - // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block @@ -708,6 +668,7 @@ func (i *inode) userXattrSupported() bool { type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index dbfcef0fa..b35afafe3 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -80,9 +80,6 @@ type FDTable struct { refs.AtomicRefCount k *Kernel - // uid is a unique identifier. - uid uint64 - // mu protects below. mu sync.Mutex `state:"nosave"` @@ -130,7 +127,7 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { // drop drops the table reference. func (f *FDTable) drop(file *fs.File) { // Release locks. - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF}) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF}) // Send inotify events. d := file.Dirent @@ -164,17 +161,9 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { file.DecRef() } -// ID returns a unique identifier for this FDTable. -func (f *FDTable) ID() uint64 { - return f.uid -} - // NewFDTable allocates a new FDTable that may be used by tasks in k. func (k *Kernel) NewFDTable() *FDTable { - f := &FDTable{ - k: k, - uid: atomic.AddUint64(&k.fdMapUids, 1), - } + f := &FDTable{k: k} f.init() return f } diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 5efeb3767..bcbeb6a39 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -194,11 +194,6 @@ type Kernel struct { // cpuClockTickerSetting is protected by runningTasksMu. cpuClockTickerSetting ktime.Setting - // fdMapUids is an ever-increasing counter for generating FDTable uids. - // - // fdMapUids is mutable, and is accessed using atomic memory operations. - fdMapUids uint64 - // uniqueID is used to generate unique identifiers. // // uniqueID is mutable, and is accessed using atomic memory operations. diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 7bfa9075a..0db546b98 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 2602bed72..c0e9ee1f4 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -61,11 +62,13 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { // // Preconditions: statusFlags should not contain an open access mode. func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { - return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags) + // Connected pipes share the same locks. + locks := &lock.FileLocks{} + return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } // Open opens the pipe represented by vp. -func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, error) { +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -75,7 +78,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s return nil, syserror.EINVAL } - fd := vp.newFD(mnt, vfsd, statusFlags) + fd := vp.newFD(mnt, vfsd, statusFlags, locks) // Named pipes have special blocking semantics during open: // @@ -127,10 +130,11 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } + fd.LockFD.Init(locks) fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, @@ -159,6 +163,7 @@ type VFSPipeFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD pipe *Pipe } diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index e82d6cd1e..60c9896fc 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -34,6 +34,7 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 677743113..027add1fd 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -35,6 +36,7 @@ import ( type socketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl + vfs.LockFD // We store metadata for hostinet sockets internally. Technically, we should // access metadata (e.g. through stat, chmod) on the host for correctness, @@ -59,6 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in fd: fd, }, } + s.LockFD.Init(&lock.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 7212d8644..420e573c9 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index b854bf990..8bfee5193 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -40,6 +41,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -66,7 +68,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV return nil, err } - return &SocketVFS2{ + fd := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, @@ -75,7 +77,9 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV connection: connection, sendBufferSize: defaultSendBufferSize, }, - }, nil + } + fd.LockFD.Init(&lock.FileLocks{}) + return fd, nil } // Readiness implements waiter.Waitable.Readiness. diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 8f0f5466e..0f592ecc3 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -37,6 +37,7 @@ go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index fcd8013c0..1412a4810 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -38,6 +39,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -64,6 +66,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu protocol: protocol, }, } + s.LockFD.Init(&lock.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index de2cc4bdf..7d4cc80fe 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -29,6 +29,7 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 45e109361..8c32371a2 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -26,6 +26,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -39,6 +40,7 @@ type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl + vfs.LockFD socketOpsCommon } @@ -51,7 +53,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) - fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d) + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &lock.FileLocks{}) if err != nil { return nil, syserr.FromError(err) } @@ -60,7 +62,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. -func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry) (*vfs.FileDescription, error) { +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *lock.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { @@ -73,6 +75,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3 stype: stype, }, } + sock.LockFD.Init(locks) vfsfd := &sock.vfsfd if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 35a98212a..8347617bd 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -998,9 +998,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } - // The lock uid is that of the Task's FDTable. - lockUniqueID := lock.UniqueID(t.FDTable().ID()) - // These locks don't block; execute the non-blocking operation using the inode's lock // context directly. switch flock.Type { @@ -1010,12 +1007,12 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } @@ -1026,18 +1023,18 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if cmd == linux.F_SETLK { // Non-blocking lock, provide a nil lock.Blocker. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, nil) { return 0, nil, syserror.EAGAIN } } else { // Blocking lock, pass in the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.Posix.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.Posix.LockRegion(t.FDTable(), lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } return 0, nil, nil case linux.F_UNLCK: - file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.Posix.UnlockRegion(t.FDTable(), rng) return 0, nil, nil default: return 0, nil, syserror.EINVAL @@ -2157,22 +2154,6 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB - // flock(2): - // Locks created by flock() are associated with an open file table entry. This means that - // duplicate file descriptors (created by, for example, fork(2) or dup(2)) refer to the - // same lock, and this lock may be modified or released using any of these descriptors. Furthermore, - // the lock is released either by an explicit LOCK_UN operation on any of these duplicate - // descriptors, or when all such descriptors have been closed. - // - // If a process uses open(2) (or similar) to obtain more than one descriptor for the same file, - // these descriptors are treated independently by flock(). An attempt to lock the file using - // one of these file descriptors may be denied by a lock that the calling process has already placed via - // another descriptor. - // - // We use the File UniqueID as the lock UniqueID because it needs to reference the same lock across dup(2) - // and fork(2). - lockUniqueID := lock.UniqueID(file.UniqueID) - // A BSD style lock spans the entire file. rng := lock.LockRange{ Start: 0, @@ -2183,29 +2164,29 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall case linux.LOCK_EX: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.WriteLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.WriteLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_SH: if nonblocking { // Since we're nonblocking we pass a nil lock.Blocker implementation. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, nil) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, nil) { return 0, nil, syserror.EWOULDBLOCK } } else { // Because we're blocking we will pass the task to satisfy the lock.Blocker interface. - if !file.Dirent.Inode.LockCtx.BSD.LockRegion(lockUniqueID, lock.ReadLock, rng, t) { + if !file.Dirent.Inode.LockCtx.BSD.LockRegion(file, lock.ReadLock, rng, t) { return 0, nil, syserror.EINTR } } case linux.LOCK_UN: - file.Dirent.Inode.LockCtx.BSD.UnlockRegion(lockUniqueID, rng) + file.Dirent.Inode.LockCtx.BSD.UnlockRegion(file, rng) default: // flock(2): EINVAL operation is invalid. return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index c0d005247..9f93f4354 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -14,6 +14,7 @@ go_library( "getdents.go", "inotify.go", "ioctl.go", + "lock.go", "memfd.go", "mmap.go", "mount.go", @@ -42,6 +43,7 @@ go_library( "//pkg/fspath", "//pkg/gohacks", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/eventfd", "//pkg/sentry/fsimpl/pipefs", diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go new file mode 100644 index 000000000..bf19028c4 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -0,0 +1,64 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Flock implements linux syscall flock(2). +func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + operation := args[1].Int() + + file := t.GetFileVFS2(fd) + if file == nil { + // flock(2): EBADF fd is not an open file descriptor. + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + nonblocking := operation&linux.LOCK_NB != 0 + operation &^= linux.LOCK_NB + + var blocker lock.Blocker + if !nonblocking { + blocker = t + } + + switch operation { + case linux.LOCK_EX: + if err := file.LockBSD(t, lock.WriteLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_SH: + if err := file.LockBSD(t, lock.ReadLock, blocker); err != nil { + return 0, nil, err + } + case linux.LOCK_UN: + if err := file.UnlockBSD(t); err != nil { + return 0, nil, err + } + default: + // flock(2): EINVAL operation is invalid. + return 0, nil, syserror.EINVAL + } + + return 0, nil, nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 7b6e7571a..954c82f97 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -62,7 +62,7 @@ func Override() { s.Table[55] = syscalls.Supported("getsockopt", GetSockOpt) s.Table[59] = syscalls.Supported("execve", Execve) s.Table[72] = syscalls.Supported("fcntl", Fcntl) - delete(s.Table, 73) // flock + s.Table[73] = syscalls.Supported("fcntl", Flock) s.Table[74] = syscalls.Supported("fsync", Fsync) s.Table[75] = syscalls.Supported("fdatasync", Fdatasync) s.Table[76] = syscalls.Supported("truncate", Truncate) diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 774cc66cc..16d9f3a28 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -72,6 +72,7 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", + "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 8297f964b..599c3131c 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -31,6 +31,7 @@ type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // q holds waiters on this EpollInstance. q waiter.Queue diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index bb294563d..97b9b18d7 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -73,6 +73,8 @@ type FileDescription struct { // writable is analogous to Linux's FMODE_WRITE. writable bool + usedLockBSD uint32 + // impl is the FileDescriptionImpl associated with this Filesystem. impl is // immutable. This should be the last field in FileDescription. impl FileDescriptionImpl @@ -175,6 +177,12 @@ func (fd *FileDescription) DecRef() { } ep.interestMu.Unlock() } + + // If BSD locks were used, release any lock that it may have acquired. + if atomic.LoadUint32(&fd.usedLockBSD) != 0 { + fd.impl.UnlockBSD(context.Background(), fd) + } + // Release implementation resources. fd.impl.Release() if fd.writable { @@ -420,13 +428,9 @@ type FileDescriptionImpl interface { Removexattr(ctx context.Context, name string) error // LockBSD tries to acquire a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error - // LockBSD releases a BSD-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): BSD-style file locking + // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. @@ -736,3 +740,14 @@ func (fd *FileDescription) InodeID() uint64 { func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } + +// LockBSD tries to acquire a BSD-style advisory file lock. +func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error { + atomic.StoreUint32(&fd.usedLockBSD, 1) + return fd.impl.LockBSD(ctx, fd, lockType, blocker) +} + +// UnlockBSD releases a BSD-style advisory file lock. +func (fd *FileDescription) UnlockBSD(ctx context.Context) error { + return fd.impl.UnlockBSD(ctx, fd) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index f4c111926..af7213dfd 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -21,8 +21,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs/lock" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -153,26 +154,6 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) return syserror.ENOTSUP } -// LockBSD implements FileDescriptionImpl.LockBSD. -func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockBSD implements FileDescriptionImpl.UnlockBSD. -func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error { - return syserror.EBADF -} - -// LockPOSIX implements FileDescriptionImpl.LockPOSIX. -func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error { - return syserror.EBADF -} - -// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. -func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error { - return syserror.EBADF -} - // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. @@ -384,3 +365,60 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M fd.IncRef() return nil } + +// LockFD may be used by most implementations of FileDescriptionImpl.Lock* +// functions. Caller must call Init(). +type LockFD struct { + locks *lock.FileLocks +} + +// Init initializes fd with FileLocks to use. +func (fd *LockFD) Init(locks *lock.FileLocks) { + fd.locks = locks +} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return fd.locks.LockBSD(uid, t, block) +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + fd.locks.UnlockBSD(uid) + return nil +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + return fd.locks.LockPOSIX(uid, t, rng, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { + fd.locks.UnlockPOSIX(uid, rng) + return nil +} + +// NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface +// returning ENOLCK. +type NoLockFD struct{} + +// LockBSD implements vfs.FileDescriptionImpl.LockBSD. +func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD. +func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { + return syserror.ENOLCK +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { + return syserror.ENOLCK +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { + return syserror.ENOLCK +} diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3a75d4d62..5061f6ac9 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -33,6 +33,7 @@ import ( type fileDescription struct { vfsfd FileDescription FileDescriptionDefaultImpl + NoLockFD } // genCount contains the number of times its DynamicBytesSource.Generate() diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 05a3051a4..7fa7d2d0c 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -57,6 +57,7 @@ type Inotify struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl + NoLockFD // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. These should not be exposed to the diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index ae2aa44dc..4a1486e14 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -802,10 +802,13 @@ cc_binary( ], linkstatic = 1, deps = [ + ":socket_test_util", "//test/util:file_descriptor", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", gtest, + "//test/util:epoll_util", + "//test/util:eventfd_util", "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", diff --git a/test/syscalls/linux/flock.cc b/test/syscalls/linux/flock.cc index 3ecb8db8e..638a93979 100644 --- a/test/syscalls/linux/flock.cc +++ b/test/syscalls/linux/flock.cc @@ -21,6 +21,7 @@ #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/socket_test_util.h" #include "test/util/file_descriptor.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -34,11 +35,6 @@ namespace { class FlockTest : public FileTest {}; -TEST_F(FlockTest, BadFD) { - // EBADF: fd is not an open file descriptor. - ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF)); -} - TEST_F(FlockTest, InvalidOpCombinations) { // The operation cannot be both exclusive and shared. EXPECT_THAT(flock(test_file_fd_.get(), LOCK_EX | LOCK_SH | LOCK_NB), @@ -57,15 +53,6 @@ TEST_F(FlockTest, NoOperationSpecified) { SyscallFailsWithErrno(EINVAL)); } -TEST(FlockTestNoFixture, FlockSupportsPipes) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - - EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds()); - EXPECT_THAT(close(fds[0]), SyscallSucceeds()); - EXPECT_THAT(close(fds[1]), SyscallSucceeds()); -} - TEST_F(FlockTest, TestSimpleExLock) { // Test that we can obtain an exclusive lock (no other holders) // and that we can unlock it. @@ -583,6 +570,66 @@ TEST_F(FlockTest, BlockingLockFirstExclusiveSecondExclusive_NoRandomSave) { EXPECT_THAT(flock(test_file_fd_.get(), LOCK_UN), SyscallSucceeds()); } +TEST(FlockTestNoFixture, BadFD) { + // EBADF: fd is not an open file descriptor. + ASSERT_THAT(flock(-1, 0), SyscallFailsWithErrno(EBADF)); +} + +TEST(FlockTestNoFixture, FlockDir) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockSymlink) { + // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH + // is supported. + SKIP_IF(IsRunningOnGvisor()); + + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto symlink = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path())); + + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EBADF)); +} + +TEST(FlockTestNoFixture, FlockProc) { + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000)); + EXPECT_THAT(flock(fd.get(), LOCK_EX | LOCK_NB), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockPipe) { + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + EXPECT_THAT(flock(fds[0], LOCK_EX | LOCK_NB), SyscallSucceeds()); + // Check that the pipe was locked above. + EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallFailsWithErrno(EAGAIN)); + + EXPECT_THAT(flock(fds[0], LOCK_UN), SyscallSucceeds()); + EXPECT_THAT(flock(fds[1], LOCK_EX | LOCK_NB), SyscallSucceeds()); + + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST(FlockTestNoFixture, FlockSocket) { + int sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_THAT(sock, SyscallSucceeds()); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX)); + ASSERT_THAT( + bind(sock, reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + EXPECT_THAT(flock(sock, LOCK_EX | LOCK_NB), SyscallSucceeds()); + EXPECT_THAT(close(sock), SyscallSucceeds()); +} + } // namespace } // namespace testing -- cgit v1.2.3 From d58d57606a460d49b8870d2c48cb75f662f65fda Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 11 Jun 2020 14:55:04 -0700 Subject: Don't copy structs with sync.Mutex during initialization During inititalization inode struct was copied around, but it isn't great pratice to copy it around since it contains ref count and sync.Mutex. Updates #1480 PiperOrigin-RevId: 315983788 --- pkg/sentry/fsimpl/ext/block_map_file.go | 9 +++++---- pkg/sentry/fsimpl/ext/block_map_test.go | 29 +++++++++++++---------------- pkg/sentry/fsimpl/ext/directory.go | 11 +++++------ pkg/sentry/fsimpl/ext/extent_file.go | 5 +++-- pkg/sentry/fsimpl/ext/extent_test.go | 22 ++++++++++------------ pkg/sentry/fsimpl/ext/inode.go | 23 +++++++++++++++++++---- pkg/sentry/fsimpl/ext/regular_file.go | 17 ++++------------- pkg/sentry/fsimpl/ext/symlink.go | 13 ++++++------- 8 files changed, 65 insertions(+), 64 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go index a2d8c3ad6..8bb104ff0 100644 --- a/pkg/sentry/fsimpl/ext/block_map_file.go +++ b/pkg/sentry/fsimpl/ext/block_map_file.go @@ -58,15 +58,16 @@ var _ io.ReaderAt = (*blockMapFile)(nil) // newBlockMapFile is the blockMapFile constructor. It initializes the file to // physical blocks map with (at most) the first 12 (direct) blocks. -func newBlockMapFile(regFile regularFile) (*blockMapFile, error) { - file := &blockMapFile{regFile: regFile} +func newBlockMapFile(args inodeArgs) (*blockMapFile, error) { + file := &blockMapFile{} file.regFile.impl = file + file.regFile.inode.init(args, &file.regFile) for i := uint(0); i < 4; i++ { - file.coverage[i] = getCoverage(regFile.inode.blkSize, i) + file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i) } - blkMap := regFile.inode.diskInode.Data() + blkMap := file.regFile.inode.diskInode.Data() binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks) binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk) binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk) diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go index 181727ef7..6fa84e7aa 100644 --- a/pkg/sentry/fsimpl/ext/block_map_test.go +++ b/pkg/sentry/fsimpl/ext/block_map_test.go @@ -85,20 +85,6 @@ func (n *blkNumGen) next() uint32 { // the inode covers and that is written to disk. func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { mockDisk := make([]byte, mockBMDiskSize) - regFile := regularFile{ - inode: inode{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: getMockBMFileFize(), - }, - }, - blkSize: uint64(mockBMBlkSize), - }, - } - var fileData []byte blkNums := newBlkNumGen() var data []byte @@ -125,9 +111,20 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) { data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk) fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...) - copy(regFile.inode.diskInode.Data(), data) + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: getMockBMFileFize(), + }, + }, + blkSize: uint64(mockBMBlkSize), + } + copy(args.diskInode.Data(), data) - mockFile, err := newBlockMapFile(regFile) + mockFile, err := newBlockMapFile(args) if err != nil { t.Fatalf("newBlockMapFile failed: %v", err) } diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 12b875c8f..43be6928a 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -54,16 +54,15 @@ type directory struct { } // newDirectory is the directory constructor. -func newDirectory(inode inode, newDirent bool) (*directory, error) { +func newDirectory(args inodeArgs, newDirent bool) (*directory, error) { file := &directory{ - inode: inode, childCache: make(map[string]*dentry), childMap: make(map[string]*dirent), } - file.inode.impl = file + file.inode.init(args, file) // Initialize childList by reading dirents from the underlying file. - if inode.diskInode.Flags().Index { + if args.diskInode.Flags().Index { // TODO(b/134676337): Support hash tree directories. Currently only the '.' // and '..' entries are read in. @@ -74,7 +73,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) { // The dirents are organized in a linear array in the file data. // Extract the file data and decode the dirents. - regFile, err := newRegularFile(inode) + regFile, err := newRegularFile(args) if err != nil { return nil, err } @@ -82,7 +81,7 @@ func newDirectory(inode inode, newDirent bool) (*directory, error) { // buf is used as scratch space for reading in dirents from disk and // unmarshalling them into dirent structs. buf := make([]byte, disklayout.DirentSize) - size := inode.diskInode.Size() + size := args.diskInode.Size() for off, inc := uint64(0), uint64(0); off < size; off += inc { toRead := size - off if toRead > disklayout.DirentSize { diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go index 11dcc0346..c36225a7c 100644 --- a/pkg/sentry/fsimpl/ext/extent_file.go +++ b/pkg/sentry/fsimpl/ext/extent_file.go @@ -38,9 +38,10 @@ var _ io.ReaderAt = (*extentFile)(nil) // newExtentFile is the extent file constructor. It reads the entire extent // tree into memory. // TODO(b/134676337): Build extent tree on demand to reduce memory usage. -func newExtentFile(regFile regularFile) (*extentFile, error) { - file := &extentFile{regFile: regFile} +func newExtentFile(args inodeArgs) (*extentFile, error) { + file := &extentFile{} file.regFile.impl = file + file.regFile.inode.init(args, &file.regFile) err := file.buildExtTree() if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go index a2382daa3..cd10d46ee 100644 --- a/pkg/sentry/fsimpl/ext/extent_test.go +++ b/pkg/sentry/fsimpl/ext/extent_test.go @@ -177,21 +177,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, [] t.Helper() mockDisk := make([]byte, mockExtentBlkSize*10) - mockExtentFile := &extentFile{ - regFile: regularFile{ - inode: inode{ - fs: &filesystem{ - dev: bytes.NewReader(mockDisk), - }, - diskInode: &disklayout.InodeNew{ - InodeOld: disklayout.InodeOld{ - SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), - }, - }, - blkSize: mockExtentBlkSize, + mockExtentFile := &extentFile{} + args := inodeArgs{ + fs: &filesystem{ + dev: bytes.NewReader(mockDisk), + }, + diskInode: &disklayout.InodeNew{ + InodeOld: disklayout.InodeOld{ + SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root), }, }, + blkSize: mockExtentBlkSize, } + mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile) fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize) diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index e4b434b13..5caaf14ed 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -118,7 +118,7 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } // Build the inode based on its type. - inode := inode{ + args := inodeArgs{ fs: fs, inodeNum: inodeNum, blkSize: blkSize, @@ -127,19 +127,19 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { switch diskInode.Mode().FileType() { case linux.ModeSymlink: - f, err := newSymlink(inode) + f, err := newSymlink(args) if err != nil { return nil, err } return &f.inode, nil case linux.ModeRegular: - f, err := newRegularFile(inode) + f, err := newRegularFile(args) if err != nil { return nil, err } return &f.inode, nil case linux.ModeDirectory: - f, err := newDirectory(inode, fs.sb.IncompatibleFeatures().DirentFileType) + f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType) if err != nil { return nil, err } @@ -150,6 +150,21 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) { } } +type inodeArgs struct { + fs *filesystem + inodeNum uint32 + blkSize uint64 + diskInode disklayout.Inode +} + +func (in *inode) init(args inodeArgs, impl interface{}) { + in.fs = args.fs + in.inodeNum = args.inodeNum + in.blkSize = args.blkSize + in.diskInode = args.diskInode + in.impl = impl +} + // open creates and returns a file description for the dentry passed in. func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index f7015c44f..152036b2e 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -43,28 +43,19 @@ type regularFile struct { // newRegularFile is the regularFile constructor. It figures out what kind of // file this is and initializes the fileReader. -func newRegularFile(inode inode) (*regularFile, error) { - regFile := regularFile{ - inode: inode, - } - - inodeFlags := inode.diskInode.Flags() - - if inodeFlags.Extents { - file, err := newExtentFile(regFile) +func newRegularFile(args inodeArgs) (*regularFile, error) { + if args.diskInode.Flags().Extents { + file, err := newExtentFile(args) if err != nil { return nil, err } - - file.regFile.inode.impl = &file.regFile return &file.regFile, nil } - file, err := newBlockMapFile(regFile) + file, err := newBlockMapFile(args) if err != nil { return nil, err } - file.regFile.inode.impl = &file.regFile return &file.regFile, nil } diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 1447a4dc1..acb28d85b 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -30,18 +30,17 @@ type symlink struct { // newSymlink is the symlink constructor. It reads out the symlink target from // the inode (however it might have been stored). -func newSymlink(inode inode) (*symlink, error) { - var file *symlink +func newSymlink(args inodeArgs) (*symlink, error) { var link []byte // If the symlink target is lesser than 60 bytes, its stores in inode.Data(). // Otherwise either extents or block maps will be used to store the link. - size := inode.diskInode.Size() + size := args.diskInode.Size() if size < 60 { - link = inode.diskInode.Data()[:size] + link = args.diskInode.Data()[:size] } else { // Create a regular file out of this inode and read out the target. - regFile, err := newRegularFile(inode) + regFile, err := newRegularFile(args) if err != nil { return nil, err } @@ -52,8 +51,8 @@ func newSymlink(inode inode) (*symlink, error) { } } - file = &symlink{inode: inode, target: string(link)} - file.inode.impl = file + file := &symlink{target: string(link)} + file.inode.init(args, file) return file, nil } -- cgit v1.2.3 From 77c206e3719962d27cc60781ca59712b040587ab Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 11 Jun 2020 18:33:35 -0700 Subject: Add //pkg/sentry/fsimpl/overlay. Major differences from existing overlay filesystems: - Linux allows lower layers in an overlay to require revalidation, but not the upper layer. VFS1 allows the upper layer in an overlay to require revalidation, but not the lower layer. VFS2 does not allow any layers to require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint exists, no uses of overlay in VFS1 are believed to require upper layer revalidation; in particular, the requirement that the upper layer support the creation of "trusted." extended attributes for whiteouts effectively required the upper filesystem to be tmpfs in most cases.) - Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations of the upper layer atomic using a working directory and features like RENAME_WHITEOUT. (This may change in the future, since not having a working directory makes error recovery for some operations, e.g. rmdir, particularly painful.) - Like Linux, but unlike VFS1, VFS2 represents whiteouts using character devices with rdev == 0; the equivalent of the whiteout attribute on directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent to the whiteout attribute on non-directories since non-directories are never merged with lower layers. - Device and inode numbers work as follows: - In Linux, modulo the xino feature and a special case for when all layers are the same filesystem: - Directories use the overlay filesystem's device number and an ephemeral inode number assigned by the overlay. - Non-directories that have been copied up use the device and inode number assigned by the upper filesystem. - Non-directories that have not been copied up use a per-(overlay, layer)-pair device number and the inode number assigned by the lower filesystem. - In VFS1, device and inode numbers always come from the lower layer unless "whited out"; this has the adverse effect of requiring interaction with the lower filesystem even for non-directory files that exist on the upper layer. - In VFS2, device and inode numbers are assigned as in Linux, except that xino and the samefs special case are not supported. - Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping coherence across copy-up. (This may have to change in the future, as users may be dependent on this property.) - Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials when interacting with the overlay's layers, rather than the caller's. - Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an overlay. - Like Linux, but unlike VFS1, VFS2's overlay filesystem is application-mountable. Updates #1199 PiperOrigin-RevId: 316019067 --- pkg/sentry/fsimpl/gofer/filesystem.go | 12 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 10 +- pkg/sentry/fsimpl/overlay/BUILD | 41 + pkg/sentry/fsimpl/overlay/copy_up.go | 262 ++++++ pkg/sentry/fsimpl/overlay/directory.go | 265 ++++++ pkg/sentry/fsimpl/overlay/filesystem.go | 1364 ++++++++++++++++++++++++++++ pkg/sentry/fsimpl/overlay/non_directory.go | 266 ++++++ pkg/sentry/fsimpl/overlay/overlay.go | 612 +++++++++++++ pkg/sentry/vfs/file_description.go | 17 +- pkg/syserror/syserror.go | 1 + runsc/boot/BUILD | 1 + runsc/boot/vfs.go | 5 + 12 files changed, 2843 insertions(+), 13 deletions(-) create mode 100644 pkg/sentry/fsimpl/overlay/BUILD create mode 100644 pkg/sentry/fsimpl/overlay/copy_up.go create mode 100644 pkg/sentry/fsimpl/overlay/directory.go create mode 100644 pkg/sentry/fsimpl/overlay/filesystem.go create mode 100644 pkg/sentry/fsimpl/overlay/non_directory.go create mode 100644 pkg/sentry/fsimpl/overlay/overlay.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 40933b74b..3c467e313 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -118,7 +118,7 @@ func putDentrySlice(ds *[]*dentry) { // must be up to date. // // Postconditions: The returned dentry's cached metadata is up to date. -func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { if !d.isDir() { return nil, syserror.ENOTDIR } @@ -168,7 +168,7 @@ afterSymlink: if err := rp.CheckMount(&child.vfsd); err != nil { return nil, err } - if child.isSymlink() && rp.ShouldFollowSymlink() { + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { return nil, err @@ -275,7 +275,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err @@ -301,7 +301,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } for !rp.Done() { d.dirMu.Lock() - next, err := fs.stepLocked(ctx, rp, d, ds) + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err @@ -754,7 +754,7 @@ afterTrailingSymlink: } // Determine whether or not we need to create a file. parent.dirMu.Lock() - child, err := fs.stepLocked(ctx, rp, parent, &ds) + child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) if err == syserror.ENOENT && mayCreate { if parent.isSynthetic() { parent.dirMu.Unlock() @@ -939,7 +939,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving // Filter file creation flags and O_LARGEFILE out; the create RPC already // has the semantics of O_CREAT|O_EXCL, while some servers will choke on // O_LARGEFILE. - createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE)) + createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE)) fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 4a12ae245..8939871c1 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -35,7 +35,7 @@ import ( // Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done(). // // Postcondition: Caller must call fs.processDeferredDecRefs*. -func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) { +func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) { d := vfsd.Impl().(*Dentry) if !d.isDir() { return nil, syserror.ENOTDIR @@ -81,7 +81,7 @@ afterSymlink: return nil, err } // Resolve any symlink at current path component. - if rp.ShouldFollowSymlink() && next.isSymlink() { + if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() { targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err @@ -152,7 +152,7 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP vfsd := rp.Start() for !rp.Done() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) if err != nil { return nil, nil, err } @@ -178,7 +178,7 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving vfsd := rp.Start() for !rp.Final() { var err error - vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd) + vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */) if err != nil { return nil, nil, err } @@ -449,7 +449,7 @@ afterTrailingSymlink: return nil, syserror.ENAMETOOLONG } // Determine whether or not we need to create a file. - childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD) + childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */) if err == syserror.ENOENT { // Already checked for searchability above; now check for writability. if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD new file mode 100644 index 000000000..f9413bbdd --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -0,0 +1,41 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +licenses(["notice"]) + +go_template_instance( + name = "fstree", + out = "fstree.go", + package = "overlay", + prefix = "generic", + template = "//pkg/sentry/vfs/genericfstree:generic_fstree", + types = { + "Dentry": "dentry", + }, +) + +go_library( + name = "overlay", + srcs = [ + "copy_up.go", + "directory.go", + "filesystem.go", + "fstree.go", + "non_directory.go", + "overlay.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sentry/vfs/lock", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go new file mode 100644 index 000000000..8f8dcfafe --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -0,0 +1,262 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "fmt" + "io" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isCopiedUp() bool { + return atomic.LoadUint32(&d.copiedUp) != 0 +} + +// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok(). +// +// Preconditions: filesystem.renameMu must be locked. +func (d *dentry) copyUpLocked(ctx context.Context) error { + // Fast path. + if d.isCopiedUp() { + return nil + } + + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + switch ftype { + case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR: + // Can be copied-up. + default: + // Can't be copied-up. + return syserror.EPERM + } + + // Ensure that our parent directory is copied-up. + if d.parent == nil { + // d is a filesystem root with no upper layer. + return syserror.EROFS + } + if err := d.parent.copyUpLocked(ctx); err != nil { + return err + } + + d.copyMu.Lock() + defer d.copyMu.Unlock() + if d.upperVD.Ok() { + // Raced with another call to d.copyUpLocked(). + return nil + } + if d.vfsd.IsDead() { + // Raced with deletion of d. + return syserror.ENOENT + } + + // Perform copy-up. + vfsObj := d.fs.vfsfs.VirtualFilesystem() + newpop := vfs.PathOperation{ + Root: d.parent.upperVD, + Start: d.parent.upperVD, + Path: fspath.Parse(d.name), + } + cleanupUndoCopyUp := func() { + var err error + if ftype == linux.S_IFDIR { + err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop) + } else { + err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop) + } + if err != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err) + } + } + switch ftype { + case linux.S_IFREG: + oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY, + }) + if err != nil { + return err + } + defer oldFD.DecRef() + newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ + Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }) + if err != nil { + return err + } + defer newFD.DecRef() + bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size + for { + readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) + if readErr != nil && readErr != io.EOF { + cleanupUndoCopyUp() + return readErr + } + total := int64(0) + for total < readN { + writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{}) + total += writeN + if writeErr != nil { + cleanupUndoCopyUp() + return writeErr + } + } + if readErr == io.EOF { + break + } + } + if err := newFD.SetStat(ctx, vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = newFD.VirtualDentry() + d.upperVD.IncRef() + + case linux.S_IFDIR: + if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{ + Mode: linux.FileMode(d.mode &^ linux.S_IFMT), + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFLNK: + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }) + if err != nil { + return err + } + if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, + Mode: uint16(d.mode), + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + case linux.S_IFBLK, linux.S_IFCHR: + lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + }, &vfs.StatOptions{}) + if err != nil { + return err + } + if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ + Mode: linux.FileMode(d.mode), + DevMajor: lowerStat.RdevMajor, + DevMinor: lowerStat.RdevMinor, + }); err != nil { + return err + } + if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: d.uid, + GID: d.gid, + }, + }); err != nil { + cleanupUndoCopyUp() + return err + } + upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{}) + if err != nil { + cleanupUndoCopyUp() + return err + } + d.upperVD = upperVD + + default: + // Should have rejected this at the beginning of this function? + panic(fmt.Sprintf("unexpected file type %o", ftype)) + } + + // TODO(gvisor.dev/issue/1199): copy up xattrs + + // Update the dentry's device and inode numbers (except for directories, + // for which these remain overlay-assigned). + if ftype != linux.S_IFDIR { + upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + }, &vfs.StatOptions{ + Mask: linux.STATX_INO, + }) + if err != nil { + d.upperVD.DecRef() + d.upperVD = vfs.VirtualDentry{} + cleanupUndoCopyUp() + return err + } + if upperStat.Mask&linux.STATX_INO == 0 { + d.upperVD.DecRef() + d.upperVD = vfs.VirtualDentry{} + cleanupUndoCopyUp() + return syserror.EREMOTE + } + atomic.StoreUint32(&d.devMajor, upperStat.DevMajor) + atomic.StoreUint32(&d.devMinor, upperStat.DevMinor) + atomic.StoreUint64(&d.ino, upperStat.Ino) + } + + atomic.StoreUint32(&d.copiedUp, 1) + return nil +} diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go new file mode 100644 index 000000000..6f47167d3 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -0,0 +1,265 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) { + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + whiteouts := make(map[string]bool) + var maybeWhiteouts []string + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef() + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := whiteouts[dirent.Name]; ok { + // This file has been whited-out in a previous layer. + return nil + } + if dirent.Type == linux.DT_CHR { + // We have to determine if this is a whiteout, which doesn't + // count against the directory's emptiness. However, we can't + // do so while holding locks held by layerFD.IterDirents(). + maybeWhiteouts = append(maybeWhiteouts, dirent.Name) + return nil + } + // Non-whiteout file in the directory prevents rmdir. + return syserror.ENOTEMPTY + })); err != nil { + readdirErr = err + return false + } + + for _, maybeWhiteoutName := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(maybeWhiteoutName), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor != 0 || stat.RdevMinor != 0 { + // This file is a real character device, not a whiteout. + readdirErr = syserror.ENOTEMPTY + return false + } + whiteouts[maybeWhiteoutName] = isUpper + } + // Continue iteration since we haven't found any non-whiteout files in + // this directory yet. + return true + }) + return whiteouts, readdirErr +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + + mu sync.Mutex + off int64 + dirents []vfs.Dirent +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fd.mu.Lock() + defer fd.mu.Unlock() + + d := fd.dentry() + if fd.dirents == nil { + ds, err := d.getDirents(ctx) + if err != nil { + return err + } + fd.dirents = ds + } + + for fd.off < int64(len(fd.dirents)) { + if err := cb.Handle(fd.dirents[fd.off]); err != nil { + return err + } + fd.off++ + } + return nil +} + +// Preconditions: d.isDir(). +func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { + d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() + d.dirMu.Lock() + defer d.dirMu.Unlock() + + if d.dirents != nil { + return d.dirents, nil + } + + parent := genericParentOrSelf(d) + dirents := []vfs.Dirent{ + { + Name: ".", + Type: linux.DT_DIR, + Ino: d.ino, + NextOff: 1, + }, + { + Name: "..", + Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), + Ino: parent.ino, + NextOff: 2, + }, + } + + // Merge dirents from all layers comprising this directory. + vfsObj := d.fs.vfsfs.VirtualFilesystem() + var readdirErr error + prevDirents := make(map[string]struct{}) + var maybeWhiteouts []vfs.Dirent + d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool { + layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.OpenOptions{ + Flags: linux.O_RDONLY | linux.O_DIRECTORY, + }) + if err != nil { + readdirErr = err + return false + } + defer layerFD.DecRef() + + // Reuse slice allocated for maybeWhiteouts from a previous layer to + // reduce allocations. + maybeWhiteouts = maybeWhiteouts[:0] + if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name == "." || dirent.Name == ".." { + return nil + } + if _, ok := prevDirents[dirent.Name]; ok { + // This file is hidden by, or merged with, another file with + // the same name in a previous layer. + return nil + } + prevDirents[dirent.Name] = struct{}{} + if dirent.Type == linux.DT_CHR { + // We can't determine if this file is a whiteout while holding + // locks held by layerFD.IterDirents(). + maybeWhiteouts = append(maybeWhiteouts, dirent) + return nil + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + return nil + })); err != nil { + readdirErr = err + return false + } + + for _, dirent := range maybeWhiteouts { + stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + Path: fspath.Parse(dirent.Name), + }, &vfs.StatOptions{}) + if err != nil { + readdirErr = err + return false + } + if stat.RdevMajor == 0 && stat.RdevMinor == 0 { + // This file is a whiteout; don't emit a dirent for it. + continue + } + dirent.NextOff = int64(len(dirents) + 1) + dirents = append(dirents, dirent) + } + return true + }) + if readdirErr != nil { + return nil, readdirErr + } + + // Cache dirents for future directoryFDs. + d.dirents = dirents + return dirents, nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + + switch whence { + case linux.SEEK_SET: + if offset < 0 { + return 0, syserror.EINVAL + } + if offset == 0 { + // Ensure that the next call to fd.IterDirents() calls + // fd.dentry().getDirents(). + fd.dirents = nil + } + fd.off = offset + return fd.off, nil + case linux.SEEK_CUR: + offset += fd.off + if offset < 0 { + return 0, syserror.EINVAL + } + // Don't clear fd.dirents in this case, even if offset == 0. + fd.off = offset + return fd.off, nil + default: + return 0, syserror.EINVAL + } +} diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go new file mode 100644 index 000000000..ff82e1f20 --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -0,0 +1,1364 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for +// opaque directories. +// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE +const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque" + +func isWhiteout(stat *linux.Statx) bool { + return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0 +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + if fs.opts.UpperRoot.Ok() { + return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx) + } + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked() + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked() + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// stepLocked resolves rp.Component() to an existing file, starting from the +// given directory. +// +// Dentries which may have a reference count of zero, and which therefore +// should be dropped once traversal is complete, are appended to ds. +// +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +// !rp.Done(). +func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } +afterSymlink: + name := rp.Component() + if name == "." { + rp.Advance() + return d, nil + } + if name == ".." { + if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + return nil, err + } else if isRoot || d.parent == nil { + rp.Advance() + return d, nil + } + if err := rp.CheckMount(&d.parent.vfsd); err != nil { + return nil, err + } + rp.Advance() + return d.parent, nil + } + child, err := fs.getChildLocked(ctx, d, name, ds) + if err != nil { + return nil, err + } + if err := rp.CheckMount(&child.vfsd); err != nil { + return nil, err + } + if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return child, nil +} + +// Preconditions: fs.renameMu must be locked. d.dirMu must be locked. +func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { + if child, ok := parent.children[name]; ok { + return child, nil + } + child, err := fs.lookupLocked(ctx, parent, name) + if err != nil { + return nil, err + } + if parent.children == nil { + parent.children = make(map[string]*dentry) + } + parent.children[name] = child + // child's refcount is initially 0, so it may be dropped after traversal. + *ds = appendDentry(*ds, child) + return child, nil +} + +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) { + childPath := fspath.Parse(name) + child := fs.newDentry() + existsOnAnyLayer := false + var lookupErr error + + vfsObj := fs.vfsfs.VirtualFilesystem() + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.GetDentryOptions{}) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next one. + return true + } + if err != nil { + lookupErr = err + return false + } + + mask := uint32(linux.STATX_TYPE) + if !existsOnAnyLayer { + // Mode, UID, GID, and (for non-directories) inode number come from + // the topmost layer on which the file exists. + mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + } + stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.StatOptions{ + Mask: mask, + }) + if err != nil { + lookupErr = err + return false + } + if stat.Mask&mask != mask { + lookupErr = syserror.EREMOTE + return false + } + + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + return false + } + isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR + if existsOnAnyLayer && !isDir { + // Directories are not merged with non-directory files from lower + // layers; instead, layers including and below the first + // non-directory file are ignored. (This file must be a directory + // on previous layers, since lower layers aren't searched for + // non-directory files.) + return false + } + + // Update child to include this layer. + if isUpper { + child.upperVD = childVD + child.copiedUp = 1 + } else { + child.lowerVDs = append(child.lowerVDs, childVD) + } + if !existsOnAnyLayer { + existsOnAnyLayer = true + child.mode = uint32(stat.Mode) + child.uid = stat.UID + child.gid = stat.GID + child.devMajor = stat.DevMajor + child.devMinor = stat.DevMinor + child.ino = stat.Ino + } + + // For non-directory files, only the topmost layer that contains a file + // matters. + if !isDir { + return false + } + + // Directories are merged with directories from lower layers if they + // are not explicitly opaque. + opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{ + Root: childVD, + Start: childVD, + }, &vfs.GetxattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Size: 1, + }) + return !(err == nil && opaqueVal == "y") + }) + + if lookupErr != nil { + child.destroyLocked() + return nil, lookupErr + } + if !existsOnAnyLayer { + child.destroyLocked() + return nil, syserror.ENOENT + } + + // Device and inode numbers were copied from the topmost layer above; + // override them if necessary. + if child.isDir() { + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = fs.dirDevMinor + child.ino = fs.newDirIno() + } else if !child.upperVD.Ok() { + child.devMajor = linux.UNNAMED_MAJOR + child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()] + } + + parent.IncRef() + child.parent = parent + child.name = name + return child, nil +} + +// lookupLayerLocked is similar to lookupLocked, but only returns information +// about the file rather than a dentry. +// +// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked. +func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) { + childPath := fspath.Parse(name) + lookupLayer := lookupLayerNone + var lookupErr error + + parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool { + stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: parentVD, + Start: parentVD, + Path: childPath, + }, &vfs.StatOptions{ + Mask: linux.STATX_TYPE, + }) + if err == syserror.ENOENT || err == syserror.ENAMETOOLONG { + // The file doesn't exist on this layer. Proceed to the next + // one. + return true + } + if err != nil { + lookupErr = err + return false + } + if stat.Mask&linux.STATX_TYPE == 0 { + // Linux's overlayfs tends to return EREMOTE in cases where a file + // is unusable for reasons that are not better captured by another + // errno. + lookupErr = syserror.EREMOTE + return false + } + if isWhiteout(&stat) { + // This is a whiteout, so it "doesn't exist" on this layer, and + // layers below this one are ignored. + if isUpper { + lookupLayer = lookupLayerUpperWhiteout + } + return false + } + // The file exists; we can stop searching. + if isUpper { + lookupLayer = lookupLayerUpper + } else { + lookupLayer = lookupLayerLower + } + return false + }) + + return lookupLayer, lookupErr +} + +type lookupLayer int + +const ( + // lookupLayerNone indicates that no file exists at the given path on the + // upper layer, and is either whited out or does not exist on lower layers. + // Therefore, the file does not exist in the overlay filesystem, and file + // creation may proceed normally (if an upper layer exists). + lookupLayerNone lookupLayer = iota + + // lookupLayerLower indicates that no file exists at the given path on the + // upper layer, but exists on a lower layer. Therefore, the file exists in + // the overlay filesystem, but must be copied-up before mutation. + lookupLayerLower + + // lookupLayerUpper indicates that a non-whiteout file exists at the given + // path on the upper layer. Therefore, the file exists in the overlay + // filesystem, and is already copied-up. + lookupLayerUpper + + // lookupLayerUpperWhiteout indicates that a whiteout exists at the given + // path on the upper layer. Therefore, the file does not exist in the + // overlay filesystem, and file creation must remove the whiteout before + // proceeding. + lookupLayerUpperWhiteout +) + +func (ll lookupLayer) existsInOverlay() bool { + return ll == lookupLayerLower || ll == lookupLayerUpper +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + for !rp.Final() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// resolveLocked resolves rp to an existing file. +// +// Preconditions: fs.renameMu must be locked. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + d := rp.Start().Impl().(*dentry) + for !rp.Done() { + d.dirMu.Lock() + next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) + d.dirMu.Unlock() + if err != nil { + return nil, err + } + d = next + } + if rp.MustBeDir() && !d.isDir() { + return nil, syserror.ENOTDIR + } + return d, nil +} + +// doCreateAt checks that creating a file at rp is permitted, then invokes +// create to do so. +// +// Preconditions: !rp.Done(). For the final path component in rp, +// !rp.ShouldFollowSymlink(). +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + name := rp.Component() + if name == "." || name == ".." { + return syserror.EEXIST + } + if !dir && rp.MustBeDir() { + return syserror.ENOENT + } + if parent.vfsd.IsDead() { + return syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Determine if a file already exists at name. + if _, ok := parent.children[name]; ok { + return syserror.EEXIST + } + childLayer, err := fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if childLayer.existsInOverlay() { + return syserror.EEXIST + } + + // Ensure that the parent directory is copied-up so that we can create the + // new file in the upper layer. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + // Finally create the new file. + if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil { + return err + } + parent.dirents = nil + return nil +} + +// Preconditions: pop's parent directory has been copied up. +func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error { + return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{ + Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0 + // DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV + }) +} + +func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) { + if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err) + } +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + return nil, err + } + layerVD := d.topLayer() + return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &opts) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + old := vd.Dentry().Impl().(*dentry) + if old.isDir() { + return syserror.EPERM + } + if err := old.copyUpLocked(ctx); err != nil { + return err + } + vfsObj := fs.vfsfs.VirtualFilesystem() + newpop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil { + return err + } + } + if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: old.upperVD, + Start: old.upperVD, + }, &newpop); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop) + } + return err + } + return nil + }) +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + if haveUpperWhiteout { + // There may be directories on lower layers (previously hidden by + // the whiteout) that the new directory should not be merged with. + // Mark it opaque to prevent merging. + if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{ + Name: _OVL_XATTR_OPAQUE, + Value: "y", + }); err != nil { + if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr) + } else { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + } + return nil + }) +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + // Disallow attempts to create whiteouts. + if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 { + return syserror.EPERM + } + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + return nil + }) +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + mayCreate := opts.Flags&linux.O_CREAT != 0 + mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) + + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + + start := rp.Start().Impl().(*dentry) + if rp.Done() { + if mustCreate { + return nil, syserror.EEXIST + } + return start.openLocked(ctx, rp, &opts) + } + +afterTrailingSymlink: + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + // Check for search permission in the parent directory. + if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + // Determine whether or not we need to create a file. + parent.dirMu.Lock() + child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) + if err == syserror.ENOENT && mayCreate { + fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds) + parent.dirMu.Unlock() + return fd, err + } + if err != nil { + parent.dirMu.Unlock() + return nil, err + } + // Open existing child or follow symlink. + parent.dirMu.Unlock() + if mustCreate { + return nil, syserror.EEXIST + } + if child.isSymlink() && rp.ShouldFollowSymlink() { + target, err := child.readlink(ctx) + if err != nil { + return nil, err + } + if err := rp.HandleSymlink(target); err != nil { + return nil, err + } + start = parent + goto afterTrailingSymlink + } + return child.openLocked(ctx, rp, &opts) +} + +// Preconditions: fs.renameMu must be locked. +func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(opts) + if err := d.checkPermissions(rp.Credentials(), ats); err != nil { + return nil, err + } + if ats.MayWrite() { + if err := d.copyUpLocked(ctx); err != nil { + return nil, err + } + } + mnt := rp.Mount() + + // Directory FDs open FDs from each layer when directory entries are read, + // so they don't require opening an FD from d.topLayer() up front. + ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT + if ftype == linux.S_IFDIR { + // Can't open directories with O_CREAT. + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EISDIR + } + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + if opts.Flags&linux.O_DIRECT != 0 { + return nil, syserror.EINVAL + } + fd := &directoryFD{} + fd.LockFD.Init(&d.locks) + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil + } + + layerVD, isUpper := d.topLayerInfo() + layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, opts) + if err != nil { + return nil, err + } + layerFlags := layerFD.StatusFlags() + fd := &nonDirectoryFD{ + copiedUp: isUpper, + cachedFD: layerFD, + cachedFlags: layerFlags, + } + fd.LockFD.Init(&d.locks) + layerFDOpts := layerFD.Options() + if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { + layerFD.DecRef() + return nil, err + } + return &fd.vfsfd, nil +} + +// Preconditions: parent.dirMu must be locked. parent does not already contain +// a child named rp.Component(). +func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { + creds := rp.Credentials() + if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil { + return nil, err + } + if parent.vfsd.IsDead() { + return nil, syserror.ENOENT + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return nil, err + } + defer mnt.EndWrite() + + if err := parent.copyUpLocked(ctx); err != nil { + return nil, err + } + + vfsObj := fs.vfsfs.VirtualFilesystem() + childName := rp.Component() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + // We don't know if a whiteout exists on the upper layer; speculatively + // unlink it. + // + // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do + // know whether a whiteout exists. + var haveUpperWhiteout bool + switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err { + case nil: + haveUpperWhiteout = true + case syserror.ENOENT: + haveUpperWhiteout = false + default: + return nil, err + } + // Create the file on the upper layer, and get an FD representing it. + upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{ + Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL, + Mode: opts.Mode, + }) + if err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Change the file's owner to the caller. We can't use upperFD.SetStat() + // because it will pick up creds from ctx. + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Re-lookup to get a dentry representing the new file, which is needed for + // the returned FD. + child, err := fs.getChildLocked(ctx, parent, childName, ds) + if err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return nil, err + } + // Finally construct the overlay FD. + upperFlags := upperFD.StatusFlags() + fd := &nonDirectoryFD{ + copiedUp: true, + cachedFD: upperFD, + cachedFlags: upperFlags, + } + fd.LockFD.Init(&child.locks) + upperFDOpts := upperFD.Options() + if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { + upperFD.DecRef() + // Don't bother with cleanup; the file was created successfully, we + // just can't open it anymore for some reason. + return nil, err + } + return &fd.vfsfd, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + layerVD := d.topLayer() + return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + if opts.Flags != 0 { + return syserror.EINVAL + } + + var ds *[]*dentry + fs.renameMu.Lock() + defer fs.renameMuUnlockAndCheckDrop(&ds) + newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) + if err != nil { + return err + } + newName := rp.Component() + if newName == "." || newName == ".." { + return syserror.EBUSY + } + mnt := rp.Mount() + if mnt != oldParentVD.Mount() { + return syserror.EXDEV + } + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + + // FIXME(gvisor.dev/issue/1199): Actually implement rename. + _ = newParent + return syserror.EXDEV +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + name := rp.Component() + if name == "." { + return syserror.EINVAL + } + if name == ".." { + return syserror.ENOTEMPTY + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Ensure that parent is copied-up before potentially holding child.copyMu + // below. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + // Unlike UnlinkAt, we need a dentry representing the child directory being + // removed in order to verify that it's empty. + child, err := fs.getChildLocked(ctx, parent, name, &ds) + if err != nil { + return err + } + if !child.isDir() { + return syserror.ENOTDIR + } + child.dirMu.Lock() + defer child.dirMu.Unlock() + whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) + if err != nil { + return err + } + child.copyMu.RLock() + defer child.copyMu.RUnlock() + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(name), + } + if child.upperVD.Ok() { + cleanupRecreateWhiteouts := func() { + if !child.upperVD.Ok() { + return + } + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{ + Root: child.upperVD, + Start: child.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil && err != syserror.EEXIST { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err) + } + } + } + // Remove existing whiteouts on the upper layer. + for whiteoutName, whiteoutUpper := range whiteouts { + if !whiteoutUpper { + continue + } + if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{ + Root: child.upperVD, + Start: child.upperVD, + Path: fspath.Parse(whiteoutName), + }); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortDeleteDentry(&child.vfsd) + return err + } + } + // Remove the existing directory on the upper layer. + if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil { + cleanupRecreateWhiteouts() + vfsObj.AbortDeleteDentry(&child.vfsd) + return err + } + } + if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { + // Don't attempt to recover from this: the original directory is + // already gone, so any dentries representing it are invalid, and + // creating a new directory won't undo that. + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err) + vfsObj.AbortDeleteDentry(&child.vfsd) + return err + } + + vfsObj.CommitDeleteDentry(&child.vfsd) + delete(parent.children, name) + ds = appendDentry(ds, child) + parent.dirents = nil + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := rp.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + }, &opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + return nil +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + layerVD := d.topLayer() + stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }, &vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + if err != nil { + return linux.Statx{}, err + } + } + d.statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statfs{}, err + } + return fs.statFS(ctx) +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error { + vfsObj := fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(childName), + } + if haveUpperWhiteout { + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + return err + } + } + if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil { + if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + creds := rp.Credentials() + if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{ + Stat: linux.Statx{ + Mask: linux.STATX_UID | linux.STATX_GID, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + }, + }); err != nil { + if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr) + } else if haveUpperWhiteout { + fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop) + } + return err + } + return nil + }) +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + start := rp.Start().Impl().(*dentry) + parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return err + } + if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + name := rp.Component() + if name == "." || name == ".." { + return syserror.EISDIR + } + if rp.MustBeDir() { + return syserror.ENOTDIR + } + vfsObj := rp.VirtualFilesystem() + mntns := vfs.MountNamespaceFromContext(ctx) + defer mntns.DecRef() + parent.dirMu.Lock() + defer parent.dirMu.Unlock() + + // Ensure that parent is copied-up before potentially holding child.copyMu + // below. + if err := parent.copyUpLocked(ctx); err != nil { + return err + } + + child := parent.children[name] + var childLayer lookupLayer + if child != nil { + if child.isDir() { + return syserror.EISDIR + } + if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { + return err + } + // Hold child.copyMu to prevent it from being copied-up during + // deletion. + child.copyMu.RLock() + defer child.copyMu.RUnlock() + if child.upperVD.Ok() { + childLayer = lookupLayerUpper + } else { + childLayer = lookupLayerLower + } + } else { + // Determine if the file being unlinked actually exists. Holding + // parent.dirMu prevents a dentry from being instantiated for the file, + // which in turn prevents it from being copied-up, so this result is + // stable. + childLayer, err = fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if !childLayer.existsInOverlay() { + return syserror.ENOENT + } + } + + pop := vfs.PathOperation{ + Root: parent.upperVD, + Start: parent.upperVD, + Path: fspath.Parse(name), + } + if childLayer == lookupLayerUpper { + // Remove the existing file on the upper layer. + if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err + } + } + if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil { + ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err) + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err + } + + if child != nil { + vfsObj.CommitDeleteDentry(&child.vfsd) + delete(parent.children, name) + ds = appendDentry(ds, child) + } + parent.dirents = nil + return nil +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + // TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr, + // but not any other xattr syscalls. For now we just reject all of them. + return nil, syserror.ENOTSUP +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + return "", syserror.ENOTSUP +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(&ds) + _, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return syserror.ENOTSUP +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) +} diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go new file mode 100644 index 000000000..a3c1f7a8d --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -0,0 +1,266 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package overlay + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/usermem" +) + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + layerVD := d.topLayer() + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: layerVD, + Start: layerVD, + }) +} + +type nonDirectoryFD struct { + fileDescription + + // If copiedUp is false, cachedFD represents + // fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents + // fileDescription.dentry().upperVD. cachedFlags is the last known value of + // cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are + // protected by mu. + mu sync.Mutex + copiedUp bool + cachedFD *vfs.FileDescription + cachedFlags uint32 +} + +func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return nil, err + } + wrappedFD.IncRef() + return wrappedFD, nil +} + +func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { + d := fd.dentry() + statusFlags := fd.vfsfd.StatusFlags() + if !fd.copiedUp && d.isCopiedUp() { + // Switch to the copied-up file. + upperVD := d.topLayer() + upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: upperVD, + Start: upperVD, + }, &vfs.OpenOptions{ + Flags: statusFlags, + }) + if err != nil { + return nil, err + } + oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) + if oldOffErr == nil { + if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { + upperFD.DecRef() + return nil, err + } + } + fd.cachedFD.DecRef() + fd.copiedUp = true + fd.cachedFD = upperFD + fd.cachedFlags = statusFlags + } else if fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { + return nil, err + } + fd.cachedFlags = statusFlags + } + return fd.cachedFD, nil +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *nonDirectoryFD) Release() { + fd.cachedFD.DecRef() + fd.cachedFD = nil +} + +// OnClose implements vfs.FileDescriptionImpl.OnClose. +func (fd *nonDirectoryFD) OnClose(ctx context.Context) error { + // Linux doesn't define ovl_file_operations.flush at all (i.e. its + // equivalent to OnClose is a no-op). We pass through to + // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been + // copied-up, since OnClose is mostly used to define post-close writeback, + // and if fd.cachedFD hasn't been updated then it can't have been used to + // mutate fd.dentry() anyway. + fd.mu.Lock() + if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags { + if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil { + fd.mu.Unlock() + return err + } + fd.cachedFlags = statusFlags + } + wrappedFD := fd.cachedFD + defer wrappedFD.IncRef() + fd.mu.Unlock() + return wrappedFD.OnClose(ctx) +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return linux.Statx{}, err + } + stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{ + Mask: layerMask, + Sync: opts.Sync, + }) + wrappedFD.DecRef() + if err != nil { + return linux.Statx{}, err + } + } + fd.dentry().statInternalTo(ctx, &opts, &stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + d := fd.dentry() + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + return err + } + mnt := fd.vfsfd.Mount() + if err := mnt.CheckBeginWrite(); err != nil { + return err + } + defer mnt.EndWrite() + if err := d.copyUpLocked(ctx); err != nil { + return err + } + // Changes to d's attributes are serialized by d.copyMu. + d.copyMu.Lock() + defer d.copyMu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return err + } + if err := wrappedFD.SetStat(ctx, opts); err != nil { + return err + } + d.updateAfterSetStatLocked(&opts) + return nil +} + +// StatFS implements vfs.FileDesciptionImpl.StatFS. +func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { + return fd.filesystem().statFS(ctx) +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef() + return wrappedFD.PRead(ctx, dst, offset, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Hold fd.mu during the read to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Read(ctx, dst, opts) +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef() + return wrappedFD.PWrite(ctx, src, offset, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Hold fd.mu during the write to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Write(ctx, src, opts) +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Hold fd.mu during the seek to serialize the file offset. + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + return 0, err + } + return wrappedFD.Seek(ctx, offset, whence) +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *nonDirectoryFD) Sync(ctx context.Context) error { + fd.mu.Lock() + if !fd.dentry().isCopiedUp() { + fd.mu.Unlock() + return nil + } + wrappedFD, err := fd.currentFDLocked(ctx) + if err != nil { + fd.mu.Unlock() + return err + } + wrappedFD.IncRef() + defer wrappedFD.DecRef() + fd.mu.Unlock() + return wrappedFD.Sync(ctx) +} + +// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. +func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return err + } + defer wrappedFD.DecRef() + return wrappedFD.ConfigureMMap(ctx, opts) +} diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go new file mode 100644 index 000000000..e660d0e2c --- /dev/null +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -0,0 +1,612 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package overlay provides an overlay filesystem implementation, which +// synthesizes a filesystem by composing one or more immutable filesystems +// ("lower layers") with an optional mutable filesystem ("upper layer"). +// +// Lock order: +// +// directoryFD.mu / nonDirectoryFD.mu +// filesystem.renameMu +// dentry.dirMu +// dentry.copyMu +// +// Locking dentry.dirMu in multiple dentries requires that parent dentries are +// locked before child dentries, and that filesystem.renameMu is locked to +// stabilize this relationship. +package overlay + +import ( + "strings" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sentry/vfs/lock" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "overlay" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to +// FilesystemType.GetFilesystem. +type FilesystemOptions struct { + // Callers passing FilesystemOptions to + // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that + // the vfs.Mounts comprising the layers of the overlay filesystem do not + // contain submounts. + + // If UpperRoot.Ok(), it is the root of the writable upper layer of the + // overlay. + UpperRoot vfs.VirtualDentry + + // LowerRoots contains the roots of the immutable lower layers of the + // overlay. LowerRoots is immutable. + LowerRoots []vfs.VirtualDentry +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // Immutable options. + opts FilesystemOptions + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the filesystem's layers. creds is immutable. + creds *auth.Credentials + + // dirDevMinor is the device minor number used for directories. dirDevMinor + // is immutable. + dirDevMinor uint32 + + // lowerDevMinors maps lower layer filesystems to device minor numbers + // assigned to non-directory files originating from that filesystem. + // lowerDevMinors is immutable. + lowerDevMinors map[*vfs.Filesystem]uint32 + + // renameMu synchronizes renaming with non-renaming operations in order to + // ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex + + // lastDirIno is the last inode number assigned to a directory. lastDirIno + // is accessed using atomic memory operations. + lastDirIno uint64 +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + mopts := vfs.GenericParseMountOptions(opts.Data) + fsoptsRaw := opts.InternalData + fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions) + if fsoptsRaw != nil && !haveFSOpts { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) + return nil, nil, syserror.EINVAL + } + if haveFSOpts { + if len(fsopts.LowerRoots) == 0 { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty") + return nil, nil, syserror.EINVAL + } + if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified") + return nil, nil, syserror.EINVAL + } + // We don't enforce a maximum number of lower layers when not + // configured by applications; the sandbox owner can have an overlay + // filesystem with any number of lower layers. + } else { + vfsroot := vfs.RootFromContext(ctx) + defer vfsroot.DecRef() + upperPathname, ok := mopts["upperdir"] + if ok { + delete(mopts, "upperdir") + // Linux overlayfs also requires a workdir when upperdir is + // specified; we don't, so silently ignore this option. + delete(mopts, "workdir") + upperPath := fspath.Parse(upperPathname) + if !upperPath.Absolute { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) + return nil, nil, syserror.EINVAL + } + upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: upperPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + defer upperRoot.DecRef() + privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) + return nil, nil, err + } + defer privateUpperRoot.DecRef() + fsopts.UpperRoot = privateUpperRoot + } + lowerPathnamesStr, ok := mopts["lowerdir"] + if !ok { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir") + return nil, nil, syserror.EINVAL + } + delete(mopts, "lowerdir") + lowerPathnames := strings.Split(lowerPathnamesStr, ":") + const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK + if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified") + return nil, nil, syserror.EINVAL + } + if len(lowerPathnames) > maxLowerLayers { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers) + return nil, nil, syserror.EINVAL + } + for _, lowerPathname := range lowerPathnames { + lowerPath := fspath.Parse(lowerPathname) + if !lowerPath.Absolute { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) + return nil, nil, syserror.EINVAL + } + lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ + Root: vfsroot, + Start: vfsroot, + Path: lowerPath, + FollowFinalSymlink: true, + }, &vfs.GetDentryOptions{ + CheckSearchable: true, + }) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + defer lowerRoot.DecRef() + privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) + if err != nil { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) + return nil, nil, err + } + defer privateLowerRoot.DecRef() + fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) + } + } + if len(mopts) != 0 { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) + return nil, nil, syserror.EINVAL + } + + // Allocate device numbers. + dirDevMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + lowerDevMinors := make(map[*vfs.Filesystem]uint32) + for _, lowerRoot := range fsopts.LowerRoots { + lowerFS := lowerRoot.Mount().Filesystem() + if _, ok := lowerDevMinors[lowerFS]; !ok { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + vfsObj.PutAnonBlockDevMinor(dirDevMinor) + for _, lowerDevMinor := range lowerDevMinors { + vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + } + return nil, nil, err + } + lowerDevMinors[lowerFS] = devMinor + } + } + + // Take extra references held by the filesystem. + if fsopts.UpperRoot.Ok() { + fsopts.UpperRoot.IncRef() + } + for _, lowerRoot := range fsopts.LowerRoots { + lowerRoot.IncRef() + } + + fs := &filesystem{ + opts: fsopts, + creds: creds.Fork(), + dirDevMinor: dirDevMinor, + lowerDevMinors: lowerDevMinors, + } + fs.vfsfs.Init(vfsObj, &fstype, fs) + + // Construct the root dentry. + root := fs.newDentry() + root.refs = 1 + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.IncRef() + root.copiedUp = 1 + root.upperVD = fs.opts.UpperRoot + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.IncRef() + root.lowerVDs = append(root.lowerVDs, lowerRoot) + } + rootTopVD := root.topLayer() + // Get metadata from the topmost layer. See fs.lookupLocked(). + const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: rootTopVD, + Start: rootTopVD, + }, &vfs.StatOptions{ + Mask: rootStatMask, + }) + if err != nil { + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, err + } + if rootStat.Mask&rootStatMask != rootStatMask { + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, syserror.EREMOTE + } + if isWhiteout(&rootStat) { + ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") + root.destroyLocked() + fs.vfsfs.DecRef() + return nil, nil, syserror.EINVAL + } + root.mode = uint32(rootStat.Mode) + root.uid = rootStat.UID + root.gid = rootStat.GID + if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR { + root.devMajor = linux.UNNAMED_MAJOR + root.devMinor = fs.dirDevMinor + root.ino = fs.newDirIno() + } else if !root.upperVD.Ok() { + root.devMajor = linux.UNNAMED_MAJOR + root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()] + root.ino = rootStat.Ino + } else { + root.devMajor = rootStat.DevMajor + root.devMinor = rootStat.DevMinor + root.ino = rootStat.Ino + } + + return &fs.vfsfs, &root.vfsd, nil +} + +// clonePrivateMount creates a non-recursive bind mount rooted at vd, not +// associated with any MountNamespace, and returns the root of the new mount. +// (This is required to ensure that each layer of an overlay comprises only a +// single mount, and therefore can't cross into e.g. the overlay filesystem +// itself, risking lock recursion.) A reference is held on the returned +// VirtualDentry. +func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { + oldmnt := vd.Mount() + opts := oldmnt.Options() + if forceReadOnly { + opts.ReadOnly = true + } + newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) + if err != nil { + return vfs.VirtualDentry{}, err + } + return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + vfsObj := fs.vfsfs.VirtualFilesystem() + vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) + for _, lowerDevMinor := range fs.lowerDevMinors { + vfsObj.PutAnonBlockDevMinor(lowerDevMinor) + } + if fs.opts.UpperRoot.Ok() { + fs.opts.UpperRoot.DecRef() + } + for _, lowerRoot := range fs.opts.LowerRoots { + lowerRoot.DecRef() + } +} + +func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { + // Always statfs the root of the topmost layer. Compare Linux's + // fs/overlayfs/super.c:ovl_statfs(). + var rootVD vfs.VirtualDentry + if fs.opts.UpperRoot.Ok() { + rootVD = fs.opts.UpperRoot + } else { + rootVD = fs.opts.LowerRoots[0] + } + fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ + Root: rootVD, + Start: rootVD, + }) + if err != nil { + return linux.Statfs{}, err + } + fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC + return fsstat, nil +} + +func (fs *filesystem) newDirIno() uint64 { + return atomic.AddUint64(&fs.lastDirIno, 1) +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid, and gid are the file mode, owner, and group of the file in + // the topmost layer (and therefore the overlay file as well), and are used + // for permission checks on this dentry. These fields are protected by + // copyMu and accessed using atomic memory operations. + mode uint32 + uid uint32 + gid uint32 + + // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and + // 0 otherwise. copiedUp is accessed using atomic memory operations. + copiedUp uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex + children map[string]*dentry + dirents []vfs.Dirent + + // upperVD and lowerVDs are the files from the overlay filesystem's layers + // that comprise the file on the overlay filesystem. + // + // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. + // be copied up) with copyMu locked for writing; otherwise, it is + // immutable. lowerVDs is always immutable. + copyMu sync.RWMutex + upperVD vfs.VirtualDentry + lowerVDs []vfs.VirtualDentry + + // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= + // len(inlineLowerVDs). + inlineLowerVDs [1]vfs.VirtualDentry + + // devMajor, devMinor, and ino are the device major/minor and inode numbers + // used by this dentry. These fields are protected by copyMu and accessed + // using atomic memory operations. + devMajor uint32 + devMinor uint32 + ino uint64 + + locks lock.FileLocks +} + +// newDentry creates a new dentry. The dentry initially has no references; it +// is the caller's responsibility to set the dentry's reference count and/or +// call dentry.destroy() as appropriate. The dentry is initially invalid in +// that it contains no layers; the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.lowerVDs = d.inlineLowerVDs[:0] + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + // d.refs may be 0 if d.fs.renameMu is locked, which serializes against + // d.checkDropLocked(). + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef() { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked() + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +// +// Preconditions: d.fs.renameMu must be locked for writing. +func (d *dentry) checkDropLocked() { + // Dentries with a positive reference count must be retained. (The only way + // to obtain a reference on a dentry with zero references is via path + // resolution, which requires renameMu, so if d.refs is zero then it will + // remain zero while we hold renameMu for writing.) Dentries with a + // negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + // Refs is still zero; destroy it. + d.destroyLocked() + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +func (d *dentry) destroyLocked() { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("overlay.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("overlay.dentry.destroyLocked() called with references on the dentry") + } + + if d.upperVD.Ok() { + d.upperVD.DecRef() + } + for _, lowerVD := range d.lowerVDs { + lowerVD.DecRef() + } + + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + // Drop the reference held by d on its parent without recursively + // locking d.fs.renameMu. + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkDropLocked() + } else if refs < 0 { + panic("overlay.dentry.DecRef() called without holding a reference") + } + } +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { + // TODO(gvisor.dev/issue/1479): Implement inotify. +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + // TODO(gvisor.dev/issue/1479): Implement inotify. + return nil +} + +// iterLayers invokes yield on each layer comprising d, from top to bottom. If +// any call to yield returns false, iterLayer stops iteration. +func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { + if d.isCopiedUp() { + if !yield(d.upperVD, true) { + return + } + } + for _, lowerVD := range d.lowerVDs { + if !yield(lowerVD, false) { + return + } + } +} + +func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { + if d.isCopiedUp() { + return d.upperVD, true + } + return d.lowerVDs[0], false +} + +func (d *dentry) topLayer() vfs.VirtualDentry { + vd, _ := d.topLayerInfo() + return vd +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +// statInternalMask is the set of stat fields that is set by +// dentry.statInternalTo(). +const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + +// statInternalTo writes fields to stat that are stored in d, and therefore do +// not requiring invoking StatAt on the overlay's layers. +func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { + stat.Mask |= statInternalMask + if d.isDir() { + // Linux sets nlink to 1 for merged directories + // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is + // correct more often ("." and the directory's entry in its parent), + // and some of our tests expect this. + stat.Nlink = 2 + } + stat.UID = atomic.LoadUint32(&d.uid) + stat.GID = atomic.LoadUint32(&d.gid) + stat.Mode = uint16(atomic.LoadUint32(&d.mode)) + stat.Ino = atomic.LoadUint64(&d.ino) + stat.DevMajor = atomic.LoadUint32(&d.devMajor) + stat.DevMinor = atomic.LoadUint32(&d.devMinor) +} + +// Preconditions: d.copyMu must be locked for writing. +func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) { + if opts.Stat.Mask&linux.STATX_MODE != 0 { + atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT)) + } + if opts.Stat.Mask&linux.STATX_UID != 0 { + atomic.StoreUint32(&d.uid, opts.Stat.UID) + } + if opts.Stat.Mask&linux.STATX_GID != 0 { + atomic.StoreUint32(&d.gid, opts.Stat.GID) + } +} + +// fileDescription is embedded by overlay implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD +} + +func (fd *fileDescription) filesystem() *filesystem { + return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) +} + +func (fd *fileDescription) dentry() *dentry { + return fd.vfsfd.Dentry().Impl().(*dentry) +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 97b9b18d7..13c48824e 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -108,6 +108,10 @@ type FileDescriptionOptions struct { UseDentryMetadata bool } +// FileCreationFlags are the set of flags passed to FileDescription.Init() but +// omitted from FileDescription.StatusFlags(). +const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC + // Init must be called before first use of fd. If it succeeds, it takes // references on mnt and d. flags is the initial file description flags, which // is usually the full set of flags passed to open(2). @@ -122,8 +126,8 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou fd.refs = 1 // Remove "file creation flags" to mirror the behavior from file.f_flags in - // fs/open.c:do_dentry_open - fd.statusFlags = flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC) + // fs/open.c:do_dentry_open. + fd.statusFlags = flags &^ FileCreationFlags fd.vd = VirtualDentry{ mount: mnt, dentry: d, @@ -471,6 +475,15 @@ type IterDirentsCallback interface { Handle(dirent Dirent) error } +// IterDirentsCallbackFunc implements IterDirentsCallback for a function with +// the semantics of IterDirentsCallback.Handle. +type IterDirentsCallbackFunc func(dirent Dirent) error + +// Handle implements IterDirentsCallback.Handle. +func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error { + return f(dirent) +} + // OnClose is called when a file descriptor representing the FileDescription is // closed. Returning a non-nil error should not prevent the file descriptor // from being closed. diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index f86db0999..c73072c42 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -72,6 +72,7 @@ var ( EPERM = error(syscall.EPERM) EPIPE = error(syscall.EPIPE) ERANGE = error(syscall.ERANGE) + EREMOTE = error(syscall.EREMOTE) EROFS = error(syscall.EROFS) ESPIPE = error(syscall.ESPIPE) ESRCH = error(syscall.ESRCH) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index a907c103b..2b1e6b13e 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -55,6 +55,7 @@ go_library( "//pkg/sentry/fsimpl/devtmpfs", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", + "//pkg/sentry/fsimpl/overlay", "//pkg/sentry/fsimpl/proc", "//pkg/sentry/fsimpl/sys", "//pkg/sentry/fsimpl/tmpfs", diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 8eeb43e79..d1653b279 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -30,6 +30,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" @@ -53,6 +54,10 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, -- cgit v1.2.3 From 3b0b1f104d963a1d11973c444934e6744ab7e79b Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Tue, 16 Jun 2020 00:14:07 -0700 Subject: Miscellaneous VFS2 fixes. PiperOrigin-RevId: 316627764 --- pkg/sentry/fsimpl/gofer/gofer.go | 26 ++++++++++++++++---------- pkg/sentry/kernel/kernel.go | 2 +- pkg/sentry/syscalls/linux/vfs2/ioctl.go | 29 +++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 0d88a328e..ac051b3a7 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -835,6 +835,14 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) stat.Nlink = atomic.LoadUint32(&d.nlink) + if stat.Nlink == 0 { + // The remote filesystem doesn't support link count; just make + // something up. This is consistent with Linux, where + // fs/inode.c:inode_init_always() initializes link count to 1, and + // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if + // it's not provided by the remote filesystem. + stat.Nlink = 1 + } stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) @@ -1346,23 +1354,21 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool } // incLinks increments link count. -// -// Preconditions: d.nlink != 0 && d.nlink < math.MaxUint32. func (d *dentry) incLinks() { - v := atomic.AddUint32(&d.nlink, 1) - if v < 2 { - panic(fmt.Sprintf("dentry.nlink is invalid (was 0 or overflowed): %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, 1) } // decLinks decrements link count. -// -// Preconditions: d.nlink > 1. func (d *dentry) decLinks() { - v := atomic.AddUint32(&d.nlink, ^uint32(0)) - if v == 0 { - panic(fmt.Sprintf("dentry.nlink must be greater than 0: %d", v)) + if atomic.LoadUint32(&d.nlink) == 0 { + // The remote filesystem doesn't support link count. + return } + atomic.AddUint32(&d.nlink, ^uint32(0)) } // fileDescription is embedded by gofer implementations of diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index bcbeb6a39..52491da7a 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -892,7 +892,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if mntnsVFS2 == nil { // MountNamespaceVFS2 adds a reference to the namespace, which is // transferred to the new process. - mntnsVFS2 = k.GlobalInit().Leader().MountNamespaceVFS2() + mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() } // Get the root directory from the MountNamespace. root := args.MountNamespaceVFS2.Root() diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index 5a2418da9..0399c0db4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -15,6 +15,7 @@ package vfs2 import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" @@ -30,6 +31,34 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } defer file.DecRef() + // Handle ioctls that apply to all FDs. + switch args[1].Int() { + case linux.FIONCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: false, + }) + return 0, nil, nil + + case linux.FIOCLEX: + t.FDTable().SetFlagsVFS2(fd, kernel.FDFlags{ + CloseOnExec: true, + }) + return 0, nil, nil + + case linux.FIONBIO: + var set int32 + if _, err := t.CopyIn(args[2].Pointer(), &set); err != nil { + return 0, nil, err + } + flags := file.StatusFlags() + if set != 0 { + flags |= linux.O_NONBLOCK + } else { + flags &^= linux.O_NONBLOCK + } + return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) + } + ret, err := file.Ioctl(t, t.MemoryManager(), args) return ret, nil, err } -- cgit v1.2.3 From 96519e2c9d3fa1f15537c4dfc081a19d8d1ce1a2 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Jun 2020 10:02:41 -0700 Subject: Implement POSIX locks - Change FileDescriptionImpl Lock/UnlockPOSIX signature to take {start,length,whence}, so the correct offset can be calculated in the implementations. - Create PosixLocker interface to make it possible to share the same locking code from different implementations. Closes #1480 PiperOrigin-RevId: 316910286 --- pkg/sentry/fsimpl/devpts/BUILD | 2 +- pkg/sentry/fsimpl/devpts/devpts.go | 3 +- pkg/sentry/fsimpl/devpts/master.go | 14 +++- pkg/sentry/fsimpl/devpts/slave.go | 14 +++- pkg/sentry/fsimpl/ext/BUILD | 2 +- pkg/sentry/fsimpl/ext/directory.go | 11 +++ pkg/sentry/fsimpl/ext/inode.go | 3 +- pkg/sentry/fsimpl/ext/regular_file.go | 11 +++ pkg/sentry/fsimpl/ext/symlink.go | 1 + pkg/sentry/fsimpl/gofer/BUILD | 1 - pkg/sentry/fsimpl/gofer/gofer.go | 12 ++- pkg/sentry/fsimpl/gofer/special_file.go | 3 +- pkg/sentry/fsimpl/host/BUILD | 2 +- pkg/sentry/fsimpl/host/host.go | 14 +++- pkg/sentry/fsimpl/host/tty.go | 11 +++ pkg/sentry/fsimpl/kernfs/BUILD | 3 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 16 +++- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 16 +++- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 3 +- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 5 +- pkg/sentry/fsimpl/overlay/BUILD | 2 +- pkg/sentry/fsimpl/overlay/overlay.go | 14 +++- pkg/sentry/fsimpl/pipefs/BUILD | 1 - pkg/sentry/fsimpl/pipefs/pipefs.go | 3 +- pkg/sentry/fsimpl/proc/BUILD | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 3 +- pkg/sentry/fsimpl/proc/task.go | 3 +- pkg/sentry/fsimpl/proc/task_fds.go | 3 +- pkg/sentry/fsimpl/proc/task_files.go | 14 +++- pkg/sentry/fsimpl/proc/tasks.go | 3 +- pkg/sentry/fsimpl/sys/BUILD | 1 - pkg/sentry/fsimpl/sys/sys.go | 3 +- pkg/sentry/fsimpl/tmpfs/BUILD | 1 - pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 33 +++----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 15 +++- pkg/sentry/kernel/fd_table.go | 10 ++- pkg/sentry/kernel/pipe/BUILD | 2 +- pkg/sentry/kernel/pipe/vfs.go | 18 +++- pkg/sentry/socket/hostinet/BUILD | 2 +- pkg/sentry/socket/hostinet/socket_vfs2.go | 14 +++- pkg/sentry/socket/netlink/BUILD | 2 +- pkg/sentry/socket/netlink/socket_vfs2.go | 14 +++- pkg/sentry/socket/netstack/BUILD | 2 +- pkg/sentry/socket/netstack/netstack_vfs2.go | 14 +++- pkg/sentry/socket/unix/BUILD | 2 +- pkg/sentry/socket/unix/unix_vfs2.go | 16 +++- pkg/sentry/syscalls/linux/vfs2/fd.go | 38 +++++++++ pkg/sentry/vfs/BUILD | 2 +- pkg/sentry/vfs/file_description.go | 18 ++-- pkg/sentry/vfs/file_description_impl_util.go | 25 ++---- pkg/sentry/vfs/lock.go | 109 +++++++++++++++++++++++++ pkg/sentry/vfs/lock/BUILD | 13 --- pkg/sentry/vfs/lock/lock.go | 72 ---------------- test/syscalls/linux/BUILD | 1 + test/syscalls/linux/fcntl.cc | 96 +++++++++++++++------- 55 files changed, 484 insertions(+), 234 deletions(-) create mode 100644 pkg/sentry/vfs/lock.go delete mode 100644 pkg/sentry/vfs/lock/BUILD delete mode 100644 pkg/sentry/vfs/lock/lock.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD index cf440dce8..93512c9b6 100644 --- a/pkg/sentry/fsimpl/devpts/BUILD +++ b/pkg/sentry/fsimpl/devpts/BUILD @@ -18,12 +18,12 @@ go_library( "//pkg/context", "//pkg/safemem", "//pkg/sentry/arch", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index 9b0e0cca2..e6fda2b4f 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -117,7 +116,7 @@ type rootInode struct { kernfs.InodeNotSymlink kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. dentry kernfs.Dentry diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 1d22adbe3..69879498a 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -18,11 +18,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -35,7 +35,7 @@ type masterInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -189,6 +189,16 @@ func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions return mfd.inode.Stat(fs, opts) } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence) +} + // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { switch cmd { diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 7fe475080..cf1a0f0ac 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -18,10 +18,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -34,7 +34,7 @@ type slaveInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // Keep a reference to this inode's dentry. dentry kernfs.Dentry @@ -185,3 +185,13 @@ func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() return sfd.inode.Stat(fs, opts) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD index 973fa0def..ef24f8159 100644 --- a/pkg/sentry/fsimpl/ext/BUILD +++ b/pkg/sentry/fsimpl/ext/BUILD @@ -54,13 +54,13 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/ext/disklayout", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/syscalls/linux", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 43be6928a..357512c7e 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -305,3 +306,13 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in fd.off = offset return offset, nil } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go index 5caaf14ed..30636cf66 100644 --- a/pkg/sentry/fsimpl/ext/inode.go +++ b/pkg/sentry/fsimpl/ext/inode.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -55,7 +54,7 @@ type inode struct { // diskInode gives us access to the inode struct on disk. Immutable. diskInode disklayout.Inode - locks lock.FileLocks + locks vfs.FileLocks // This is immutable. The first field of the implementations must have inode // as the first field to ensure temporality. diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 152036b2e..66d14bb95 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -20,6 +20,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" @@ -149,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt // TODO(b/134676337): Implement mmap(2). return syserror.ENODEV } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index acb28d85b..62efd4095 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -66,6 +66,7 @@ func (in *inode) isSymlink() bool { // O_PATH. For this reason most of the functions return EBADF. type symlinkFD struct { fileDescription + vfs.NoLockFD } // Compiles only if symlinkFD implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 5cdeeaeb5..4a800dcf9 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -69,7 +69,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/unet", diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index ac051b3a7..d8ae475ed 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -53,7 +53,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/pkg/usermem" @@ -665,7 +664,7 @@ type dentry struct { // endpoint bound to this file. pipe *pipe.VFSPipe - locks lock.FileLocks + locks vfs.FileLocks } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -1439,9 +1438,14 @@ func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t f } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) - return fd.LockFD.LockPOSIX(ctx, uid, t, rng, block) + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 289efdd25..e6e29b329 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" @@ -52,7 +51,7 @@ type specialFileFD struct { off int64 } -func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *lock.FileLocks, flags uint32) (*specialFileFD, error) { +func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 54f16ad63..44a09d87a 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/hostfd", "//pkg/sentry/kernel", @@ -39,7 +40,6 @@ go_library( "//pkg/sentry/unimpl", "//pkg/sentry/uniqueid", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 5ec5100b8..7906242c9 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -28,13 +28,13 @@ import ( "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -183,7 +183,7 @@ type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks // When the reference count reaches zero, the host fd is closed. refs.AtomicRefCount @@ -718,3 +718,13 @@ func (f *fileDescription) EventUnregister(e *waiter.Entry) { func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 68af6e5af..0fbc543b1 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -377,3 +378,13 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) _ = pg.SendSignal(kernel.SignalInfoPriv(sig)) return kernel.ERESTARTSYS } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD index 0299dbde9..179df6c1e 100644 --- a/pkg/sentry/fsimpl/kernfs/BUILD +++ b/pkg/sentry/fsimpl/kernfs/BUILD @@ -45,11 +45,11 @@ go_library( "//pkg/fspath", "//pkg/log", "//pkg/refs", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", @@ -68,7 +68,6 @@ go_test( "//pkg/sentry/fsimpl/testutil", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", "@com_github_google_go-cmp//cmp:go_default_library", diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 6418de0a3..c1215b70a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -19,9 +19,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -39,7 +39,7 @@ type DynamicBytesFile struct { InodeNotDirectory InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks data vfs.DynamicBytesSource } @@ -86,7 +86,7 @@ type DynamicBytesFD struct { } // Init initializes a DynamicBytesFD. -func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *lock.FileLocks, flags uint32) error { +func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil { return err @@ -135,3 +135,13 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return syserror.EPERM } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 33a5968ca..5f7853a2a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -19,10 +19,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -57,7 +57,7 @@ type GenericDirectoryFD struct { // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its // dentry. -func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { +func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) { fd := &GenericDirectoryFD{} if err := fd.Init(children, locks, opts); err != nil { return nil, err @@ -71,7 +71,7 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre // Init initializes a GenericDirectoryFD. Use it when overriding // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the // correct implementation. -func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *lock.FileLocks, opts *vfs.OpenOptions) error { +func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error { if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 { // Can't open directories for writing. return syserror.EISDIR @@ -235,3 +235,13 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode return inode.SetStat(ctx, fd.filesystem(), creds, opts) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 0e4927215..650bd7b88 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -557,7 +556,7 @@ type StaticDirectory struct { InodeNoDynamicLookup OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks } var _ Inode = (*StaticDirectory)(nil) diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index 6749facf7..dc407eb1d 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -103,7 +102,7 @@ type readonlyDir struct { kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks dentry kernfs.Dentry } @@ -133,7 +132,7 @@ type dir struct { kernfs.InodeNoDynamicLookup kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem dentry kernfs.Dentry diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index f9413bbdd..8cf5b35d3 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -29,11 +29,11 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e660d0e2c..e11a3ff19 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -35,9 +35,9 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -415,7 +415,7 @@ type dentry struct { devMinor uint32 ino uint64 - locks lock.FileLocks + locks vfs.FileLocks } // newDentry creates a new dentry. The dentry initially has no references; it @@ -610,3 +610,13 @@ func (fd *fileDescription) filesystem() *filesystem { func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD index c618dbe6c..5950a2d59 100644 --- a/pkg/sentry/fsimpl/pipefs/BUILD +++ b/pkg/sentry/fsimpl/pipefs/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/kernel/pipe", "//pkg/sentry/kernel/time", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index e4dabaa33..dd7eaf4a8 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -82,7 +81,7 @@ type inode struct { kernfs.InodeNotSymlink kernfs.InodeNoopRefCount - locks lock.FileLocks + locks vfs.FileLocks pipe *pipe.VFSPipe ino uint64 diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD index 351ba4ee9..6014138ff 100644 --- a/pkg/sentry/fsimpl/proc/BUILD +++ b/pkg/sentry/fsimpl/proc/BUILD @@ -22,6 +22,7 @@ go_library( "//pkg/log", "//pkg/refs", "//pkg/safemem", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsbridge", "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/inet", @@ -35,7 +36,6 @@ go_library( "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", "//pkg/tcpip/header", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index e2cdb7ee9..36a89540c 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -24,7 +24,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -38,7 +37,7 @@ type subtasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem task *kernel.Task diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 44078a765..8bb2b0ce1 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -39,7 +38,7 @@ type taskInode struct { kernfs.InodeAttrs kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks task *kernel.Task } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index ef6c1d04f..7debdb07a 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -54,7 +53,7 @@ func taskFDExists(t *kernel.Task, fd int32) bool { } type fdDir struct { - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem task *kernel.Task diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index e5eaa91cd..ba4405026 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -30,7 +31,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) @@ -777,7 +777,7 @@ type namespaceInode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink - locks lock.FileLocks + locks vfs.FileLocks } var _ kernfs.Inode = (*namespaceInode)(nil) @@ -830,3 +830,13 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err func (fd *namespaceFD) Release() { fd.inode.DecRef() } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 58c8b9d05..2f214d0c2 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -44,7 +43,7 @@ type tasksInode struct { kernfs.OrderedChildren kernfs.AlwaysValid - locks lock.FileLocks + locks vfs.FileLocks fs *filesystem pidns *kernel.PIDNamespace diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD index 237f17def..a741e2bb6 100644 --- a/pkg/sentry/fsimpl/sys/BUILD +++ b/pkg/sentry/fsimpl/sys/BUILD @@ -15,7 +15,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserror", ], ) diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index b84463d3a..fe02f7ee9 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -25,7 +25,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserror" ) @@ -101,7 +100,7 @@ type dir struct { kernfs.InodeDirectoryNoNewChildren kernfs.OrderedChildren - locks lock.FileLocks + locks vfs.FileLocks dentry kernfs.Dentry } diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 062321cbc..e73732a6b 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -62,7 +62,6 @@ go_library( "//pkg/sentry/uniqueid", "//pkg/sentry/usage", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sentry/vfs/memxattr", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go index 64e1c40ad..146c7fdfe 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go @@ -138,48 +138,37 @@ func TestLocks(t *testing.T) { } defer cleanup() - var ( - uid1 lock.UniqueID - uid2 lock.UniqueID - // Non-blocking. - block lock.Blocker - ) - - uid1 = 123 - uid2 = 456 - - if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, block); err != nil { + uid1 := 123 + uid2 := 456 + if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want) } if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil { t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err) } - if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, block); err != nil { + if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil { t.Fatalf("fd.Impl().LockBSD failed: err = %v", err) } - rng1 := lock.LockRange{0, 1} - rng2 := lock.LockRange{1, 2} - - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng2, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, rng1, block); err != nil { + if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil { t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err) } - if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, rng1, block), syserror.ErrWouldBlock; got != want { + if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want { t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want) } - if err := fd.Impl().UnlockPOSIX(ctx, uid1, rng1); err != nil { + if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil { t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err) } } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 71a7522af..d0a3e1a5c 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -36,11 +36,11 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs/memxattr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -310,7 +310,7 @@ type inode struct { ctime int64 // nanoseconds mtime int64 // nanoseconds - locks lock.FileLocks + locks vfs.FileLocks // Inotify watches for this inode. watches vfs.Watches @@ -761,9 +761,20 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with // FMODE_READ | FMODE_WRITE. var fd regularFileFD + fd.Init(&inode.locks) flags := uint32(linux.O_RDWR) if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return &fd.vfsfd, nil } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 48911240f..4b7d234a4 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" ) // FDFlags define flags for an individual descriptor. @@ -148,7 +149,12 @@ func (f *FDTable) drop(file *fs.File) { // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { - // TODO(gvisor.dev/issue/1480): Release locks. + // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the + // entire file. + err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + if err != nil && err != syserror.ENOLCK { + panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) + } // Generate inotify events. ev := uint32(linux.IN_CLOSE_NOWRITE) @@ -157,7 +163,7 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { } file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) - // Drop the table reference. + // Drop the table's reference. file.DecRef() } diff --git a/pkg/sentry/kernel/pipe/BUILD b/pkg/sentry/kernel/pipe/BUILD index 0db546b98..449643118 100644 --- a/pkg/sentry/kernel/pipe/BUILD +++ b/pkg/sentry/kernel/pipe/BUILD @@ -26,8 +26,8 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index c0e9ee1f4..a4519363f 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -20,8 +20,8 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -63,12 +63,12 @@ func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe { // Preconditions: statusFlags should not contain an open access mode. func (vp *VFSPipe) ReaderWriterPair(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription) { // Connected pipes share the same locks. - locks := &lock.FileLocks{} + locks := &vfs.FileLocks{} return vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks), vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks) } // Open opens the pipe represented by vp. -func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) (*vfs.FileDescription, error) { +func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) { vp.mu.Lock() defer vp.mu.Unlock() @@ -130,7 +130,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s } // Preconditions: vp.mu must be held. -func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *lock.FileLocks) *vfs.FileDescription { +func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) *vfs.FileDescription { fd := &VFSPipeFD{ pipe: &vp.pipe, } @@ -451,3 +451,13 @@ func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFr } return n, err } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *VFSPipeFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *VFSPipeFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/hostinet/BUILD b/pkg/sentry/socket/hostinet/BUILD index 60c9896fc..ff81ea6e6 100644 --- a/pkg/sentry/socket/hostinet/BUILD +++ b/pkg/sentry/socket/hostinet/BUILD @@ -26,6 +26,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/hostfd", "//pkg/sentry/inet", @@ -34,7 +35,6 @@ go_library( "//pkg/sentry/socket", "//pkg/sentry/socket/control", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip/stack", diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index 027add1fd..ad5f64799 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -21,12 +21,12 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fdnotifier" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/hostfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -61,7 +61,7 @@ func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol in fd: fd, }, } - s.LockFD.Init(&lock.FileLocks{}) + s.LockFD.Init(&vfs.FileLocks{}) if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil { return nil, syserr.FromError(err) } @@ -134,6 +134,16 @@ func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs return int64(n), err } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *socketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *socketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + type socketProviderVFS2 struct { family int } diff --git a/pkg/sentry/socket/netlink/BUILD b/pkg/sentry/socket/netlink/BUILD index 420e573c9..d5ca3ac56 100644 --- a/pkg/sentry/socket/netlink/BUILD +++ b/pkg/sentry/socket/netlink/BUILD @@ -20,6 +20,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", @@ -29,7 +30,6 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index 8bfee5193..dbcd8b49a 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -18,12 +18,12 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -78,7 +78,7 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV sendBufferSize: defaultSendBufferSize, }, } - fd.LockFD.Init(&lock.FileLocks{}) + fd.LockFD.Init(&vfs.FileLocks{}) return fd, nil } @@ -140,3 +140,13 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD index 0f592ecc3..ea6ebd0e2 100644 --- a/pkg/sentry/socket/netstack/BUILD +++ b/pkg/sentry/socket/netstack/BUILD @@ -28,6 +28,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/inet", "//pkg/sentry/kernel", @@ -37,7 +38,6 @@ go_library( "//pkg/sentry/socket/netfilter", "//pkg/sentry/unimpl", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserr", "//pkg/syserror", diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index 1412a4810..d65a89316 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -19,13 +19,13 @@ import ( "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -66,7 +66,7 @@ func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, qu protocol: protocol, }, } - s.LockFD.Init(&lock.FileLocks{}) + s.LockFD.Init(&vfs.FileLocks{}) vfsfd := &s.vfsfd if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{ DenyPRead: true, @@ -318,3 +318,13 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} diff --git a/pkg/sentry/socket/unix/BUILD b/pkg/sentry/socket/unix/BUILD index 7d4cc80fe..cca5e70f1 100644 --- a/pkg/sentry/socket/unix/BUILD +++ b/pkg/sentry/socket/unix/BUILD @@ -21,6 +21,7 @@ go_library( "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", "//pkg/sentry/fsimpl/sockfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/time", @@ -29,7 +30,6 @@ go_library( "//pkg/sentry/socket/netstack", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/vfs", - "//pkg/sentry/vfs/lock", "//pkg/syserr", "//pkg/syserror", "//pkg/tcpip", diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 8c32371a2..ff2149250 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" @@ -26,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" @@ -53,7 +53,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) mnt := t.Kernel().SocketMount() d := sockfs.NewDentry(t.Credentials(), mnt) - fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &lock.FileLocks{}) + fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{}) if err != nil { return nil, syserr.FromError(err) } @@ -62,7 +62,7 @@ func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) // NewFileDescription creates and returns a socket file description // corresponding to the given mount and dentry. -func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *lock.FileLocks) (*vfs.FileDescription, error) { +func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) { // You can create AF_UNIX, SOCK_RAW sockets. They're the same as // SOCK_DGRAM and don't require CAP_NET_RAW. if stype == linux.SOCK_RAW { @@ -300,6 +300,16 @@ func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []by return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (s *SocketVFS2) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return s.Locks().LockPOSIX(ctx, &s.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (s *SocketVFS2) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return s.Locks().UnlockPOSIX(ctx, &s.vfsfd, uid, start, length, whence) +} + // providerVFS2 is a unix domain socket provider for VFS2. type providerVFS2 struct{} diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index f9ccb303c..f5eaa076b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -17,10 +17,12 @@ package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) @@ -167,8 +169,44 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err + case linux.F_SETLK, linux.F_SETLKW: + return 0, nil, posixLock(t, args, file, cmd) default: // TODO(gvisor.dev/issue/2920): Everything else is not yet supported. return 0, nil, syserror.EINVAL } } + +func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, cmd int32) error { + // Copy in the lock request. + flockAddr := args[2].Pointer() + var flock linux.Flock + if _, err := t.CopyIn(flockAddr, &flock); err != nil { + return err + } + + var blocker lock.Blocker + if cmd == linux.F_SETLKW { + blocker = t + } + + switch flock.Type { + case linux.F_RDLCK: + if !file.IsReadable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.ReadLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_WRLCK: + if !file.IsWritable() { + return syserror.EBADF + } + return file.LockPOSIX(t, t.FDTable(), lock.WriteLock, uint64(flock.Start), uint64(flock.Len), flock.Whence, blocker) + + case linux.F_UNLCK: + return file.UnlockPOSIX(t, t.FDTable(), uint64(flock.Start), uint64(flock.Len), flock.Whence) + + default: + return syserror.EINVAL + } +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 16d9f3a28..642769e7c 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -44,6 +44,7 @@ go_library( "filesystem_impl_util.go", "filesystem_type.go", "inotify.go", + "lock.go", "mount.go", "mount_unsafe.go", "options.go", @@ -72,7 +73,6 @@ go_library( "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/uniqueid", - "//pkg/sentry/vfs/lock", "//pkg/sync", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 13c48824e..e0538ea53 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -438,14 +438,10 @@ type FileDescriptionImpl interface { UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error + LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. - // - // TODO(gvisor.dev/issue/1480): POSIX-style file locking - UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error + UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error } // Dirent holds the information contained in struct linux_dirent64. @@ -764,3 +760,13 @@ func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, func (fd *FileDescription) UnlockBSD(ctx context.Context) error { return fd.impl.UnlockBSD(ctx, fd) } + +// LockPOSIX locks a POSIX-style file range lock. +func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error { + return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block) +} + +// UnlockPOSIX unlocks a POSIX-style file range lock. +func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error { + return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence) +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index af7213dfd..1e66997ce 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -23,7 +23,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -369,14 +368,19 @@ func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.M // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). type LockFD struct { - locks *lock.FileLocks + locks *FileLocks } // Init initializes fd with FileLocks to use. -func (fd *LockFD) Init(locks *lock.FileLocks) { +func (fd *LockFD) Init(locks *FileLocks) { fd.locks = locks } +// Locks returns the locks associated with this file. +func (fd *LockFD) Locks() *FileLocks { + return fd.locks +} + // LockBSD implements vfs.FileDescriptionImpl.LockBSD. func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { return fd.locks.LockBSD(uid, t, block) @@ -388,17 +392,6 @@ func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return nil } -// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - return fd.locks.LockPOSIX(uid, t, rng, block) -} - -// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { - fd.locks.UnlockPOSIX(uid, rng) - return nil -} - // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. type NoLockFD struct{} @@ -414,11 +407,11 @@ func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. -func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { +func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return syserror.ENOLCK } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. -func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, rng fslock.LockRange) error { +func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { return syserror.ENOLCK } diff --git a/pkg/sentry/vfs/lock.go b/pkg/sentry/vfs/lock.go new file mode 100644 index 000000000..6c7583a81 --- /dev/null +++ b/pkg/sentry/vfs/lock.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package lock provides POSIX and BSD style file locking for VFS2 file +// implementations. +// +// The actual implementations can be found in the lock package under +// sentry/fs/lock. +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) +// and flock(2) respectively in Linux. It can be embedded into various file +// implementations for VFS2 that support locking. +// +// Note that in Linux these two types of locks are _not_ cooperative, because +// race and deadlock conditions make merging them prohibitive. We do the same +// and keep them oblivious to each other. +type FileLocks struct { + // bsd is a set of BSD-style advisory file wide locks, see flock(2). + bsd fslock.Locks + + // posix is a set of POSIX-style regional advisory locks, see fcntl(2). + posix fslock.Locks +} + +// LockBSD tries to acquire a BSD-style lock on the entire file. +func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { + if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockBSD releases a BSD-style lock on the entire file. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { + fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) +} + +// LockPOSIX tries to acquire a POSIX-style lock on a file region. +func (fl *FileLocks) LockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + if fl.posix.LockRegion(uid, t, rng, block) { + return nil + } + return syserror.ErrWouldBlock +} + +// UnlockPOSIX releases a POSIX-style lock on a file region. +// +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (fl *FileLocks) UnlockPOSIX(ctx context.Context, fd *FileDescription, uid fslock.UniqueID, start, length uint64, whence int16) error { + rng, err := computeRange(ctx, fd, start, length, whence) + if err != nil { + return err + } + fl.posix.UnlockRegion(uid, rng) + return nil +} + +func computeRange(ctx context.Context, fd *FileDescription, start uint64, length uint64, whence int16) (fslock.LockRange, error) { + var off int64 + switch whence { + case linux.SEEK_SET: + off = 0 + case linux.SEEK_CUR: + // Note that Linux does not hold any mutexes while retrieving the file + // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. + curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) + if err != nil { + return fslock.LockRange{}, err + } + off = curOff + case linux.SEEK_END: + stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) + if err != nil { + return fslock.LockRange{}, err + } + off = int64(stat.Size) + default: + return fslock.LockRange{}, syserror.EINVAL + } + + return fslock.ComputeRange(int64(start), int64(length), off) +} diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD deleted file mode 100644 index d9ab063b7..000000000 --- a/pkg/sentry/vfs/lock/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "lock", - srcs = ["lock.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/fs/lock", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go deleted file mode 100644 index 724dfe743..000000000 --- a/pkg/sentry/vfs/lock/lock.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package lock provides POSIX and BSD style file locking for VFS2 file -// implementations. -// -// The actual implementations can be found in the lock package under -// sentry/fs/lock. -package lock - -import ( - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/syserror" -) - -// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) -// and flock(2) respectively in Linux. It can be embedded into various file -// implementations for VFS2 that support locking. -// -// Note that in Linux these two types of locks are _not_ cooperative, because -// race and deadlock conditions make merging them prohibitive. We do the same -// and keep them oblivious to each other. -type FileLocks struct { - // bsd is a set of BSD-style advisory file wide locks, see flock(2). - bsd fslock.Locks - - // posix is a set of POSIX-style regional advisory locks, see fcntl(2). - posix fslock.Locks -} - -// LockBSD tries to acquire a BSD-style lock on the entire file. -func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockBSD releases a BSD-style lock on the entire file. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. -func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { - fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) -} - -// LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - if fl.posix.LockRegion(uid, t, rng, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockPOSIX releases a POSIX-style lock on a file region. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. -func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { - fl.posix.UnlockRegion(uid, rng) -} diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 96044928e..078e4a284 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -791,6 +791,7 @@ cc_binary( deps = [ ":socket_test_util", "//test/util:cleanup", + "//test/util:epoll_util", "//test/util:eventfd_util", "//test/util:fs_util", "@com_google_absl//absl/base:core_headers", diff --git a/test/syscalls/linux/fcntl.cc b/test/syscalls/linux/fcntl.cc index 35e8a4ff3..25bef2522 100644 --- a/test/syscalls/linux/fcntl.cc +++ b/test/syscalls/linux/fcntl.cc @@ -191,45 +191,85 @@ TEST(FcntlTest, SetFlags) { EXPECT_EQ(rflags, expected); } -TEST_F(FcntlLockTest, SetLockBadFd) { +void TestLock(int fd, short lock_type = F_RDLCK) { // NOLINT, type in flock struct flock fl; - fl.l_type = F_WRLCK; + fl.l_type = lock_type; fl.l_whence = SEEK_SET; fl.l_start = 0; - // len 0 has a special meaning: lock all bytes despite how - // large the file grows. + // len 0 locks all bytes despite how large the file grows. fl.l_len = 0; - EXPECT_THAT(fcntl(-1, F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); + EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallSucceeds()); } -TEST_F(FcntlLockTest, SetLockPipe) { - int fds[2]; - ASSERT_THAT(pipe(fds), SyscallSucceeds()); - +void TestLockBadFD(int fd, + short lock_type = F_RDLCK) { // NOLINT, type in flock struct flock fl; - fl.l_type = F_WRLCK; + fl.l_type = lock_type; fl.l_whence = SEEK_SET; fl.l_start = 0; - // Same as SetLockBadFd, but doesn't matter, we expect this to fail. + // len 0 locks all bytes despite how large the file grows. fl.l_len = 0; - EXPECT_THAT(fcntl(fds[0], F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); - EXPECT_THAT(close(fds[0]), SyscallSucceeds()); - EXPECT_THAT(close(fds[1]), SyscallSucceeds()); + EXPECT_THAT(fcntl(fd, F_SETLK, &fl), SyscallFailsWithErrno(EBADF)); } +TEST_F(FcntlLockTest, SetLockBadFd) { TestLockBadFD(-1); } + TEST_F(FcntlLockTest, SetLockDir) { auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0666)); + auto fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_RDONLY, 0000)); + TestLock(fd.get()); +} - struct flock fl; - fl.l_type = F_RDLCK; - fl.l_whence = SEEK_SET; - fl.l_start = 0; - // Same as SetLockBadFd. - fl.l_len = 0; +TEST_F(FcntlLockTest, SetLockSymlink) { + // TODO(gvisor.dev/issue/2782): Replace with IsRunningWithVFS1() when O_PATH + // is supported. + SKIP_IF(IsRunningOnGvisor()); - EXPECT_THAT(fcntl(fd.get(), F_SETLK, &fl), SyscallSucceeds()); + auto file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + auto symlink = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(GetAbsoluteTestTmpdir(), file.path())); + + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(symlink.path(), O_RDONLY | O_PATH, 0000)); + TestLockBadFD(fd.get()); +} + +TEST_F(FcntlLockTest, SetLockProc) { + auto fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/proc/self/status", O_RDONLY, 0000)); + TestLock(fd.get()); +} + +TEST_F(FcntlLockTest, SetLockPipe) { + SKIP_IF(IsRunningWithVFS1()); + + int fds[2]; + ASSERT_THAT(pipe(fds), SyscallSucceeds()); + + TestLock(fds[0]); + TestLockBadFD(fds[0], F_WRLCK); + + TestLock(fds[1], F_WRLCK); + TestLockBadFD(fds[1]); + + EXPECT_THAT(close(fds[0]), SyscallSucceeds()); + EXPECT_THAT(close(fds[1]), SyscallSucceeds()); +} + +TEST_F(FcntlLockTest, SetLockSocket) { + SKIP_IF(IsRunningWithVFS1()); + + int sock = socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_THAT(sock, SyscallSucceeds()); + + struct sockaddr_un addr = + ASSERT_NO_ERRNO_AND_VALUE(UniqueUnixAddr(true /* abstract */, AF_UNIX)); + ASSERT_THAT( + bind(sock, reinterpret_cast(&addr), sizeof(addr)), + SyscallSucceeds()); + + TestLock(sock); + EXPECT_THAT(close(sock), SyscallSucceeds()); } TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) { @@ -241,8 +281,7 @@ TEST_F(FcntlLockTest, SetLockBadOpenFlagsWrite) { fl0.l_type = F_WRLCK; fl0.l_whence = SEEK_SET; fl0.l_start = 0; - // Same as SetLockBadFd. - fl0.l_len = 0; + fl0.l_len = 0; // Lock all file // Expect that setting a write lock using a read only file descriptor // won't work. @@ -704,7 +743,7 @@ TEST_F(FcntlLockTest, SetWriteLockThenBlockingWriteLock) { << "Exited with code: " << status; } -// This test will veirfy that blocking works as expected when another process +// This test will verify that blocking works as expected when another process // holds a read lock when obtaining a write lock. This test will hold the lock // for some amount of time and then wait for the second process to send over the // socket_fd the amount of time it was blocked for before the lock succeeded. @@ -1109,8 +1148,7 @@ int main(int argc, char** argv) { fl.l_start = absl::GetFlag(FLAGS_child_setlock_start); fl.l_len = absl::GetFlag(FLAGS_child_setlock_len); - // Test the fcntl, no need to log, the error is unambiguously - // from fcntl at this point. + // Test the fcntl. int err = 0; int ret = 0; @@ -1123,6 +1161,8 @@ int main(int argc, char** argv) { if (ret == -1 && errno != 0) { err = errno; + std::cerr << "CHILD lock " << setlock_on << " failed " << err + << std::endl; } // If there is a socket fd let's send back the time in microseconds it took -- cgit v1.2.3 From e5d97cbcc1e64185b8fab1cf563c8754edd2e52e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 17 Jun 2020 11:32:23 -0700 Subject: Refactor host.canMap. Simplify the canMap check. We do not have plans to allow mmap for anything beyond regular files, so we can just inline canMap() as a simple file mode check. Updates #1672. PiperOrigin-RevId: 316929654 --- pkg/sentry/fsimpl/host/host.go | 4 +++- pkg/sentry/fsimpl/host/util.go | 10 ---------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 7906242c9..2b084860e 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -91,7 +91,9 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) isTTY: opts.IsTTY, wouldBlock: wouldBlock(uint32(fileType)), seekable: seekable, - canMap: canMap(uint32(fileType)), + // NOTE(b/38213152): Technically, some obscure char devices can be memory + // mapped, but we only allow regular files. + canMap: fileType == linux.S_IFREG, } i.pf.inode = i diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go index 2bc757b1a..412bdb2eb 100644 --- a/pkg/sentry/fsimpl/host/util.go +++ b/pkg/sentry/fsimpl/host/util.go @@ -49,16 +49,6 @@ func wouldBlock(fileType uint32) bool { return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK } -// canMap returns true if a file with fileType is allowed to be memory mapped. -// This is ported over from VFS1, but it's probably not the best way for us -// to check if a file can be memory mapped. -func canMap(fileType uint32) bool { - // TODO(gvisor.dev/issue/1672): Also allow "special files" to be mapped (see fs/host:canMap()). - // - // TODO(b/38213152): Some obscure character devices can be mapped. - return fileType == syscall.S_IFREG -} - // isBlockError checks if an error is EAGAIN or EWOULDBLOCK. // If so, they can be transformed into syserror.ErrWouldBlock. func isBlockError(err error) bool { -- cgit v1.2.3 From 6d806ee7198422973a2e4efa9b539de7792b933f Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Wed, 17 Jun 2020 12:32:59 -0700 Subject: Remove various uses of 'blacklist' Updates #2972 PiperOrigin-RevId: 316942245 --- pkg/sentry/fs/host/tty.go | 6 +++--- pkg/sentry/fsimpl/host/tty.go | 6 +++--- test/runner/runner.go | 6 +++--- tools/nogo/matchers.go | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index cb91355ab..82a02fcb2 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -308,9 +308,9 @@ func (t *TTYFileOperations) checkChange(ctx context.Context, sig linux.Signal) e task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but - // tty_check_change is more of a blacklist of cases than a - // whitelist, and is surprisingly permissive. Allowing the - // change seems most appropriate. + // tty_check_change only blocks specific cases and is + // surprisingly permissive. Allowing the change seems + // appropriate. return nil } diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 0fbc543b1..4ee9270cc 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -326,9 +326,9 @@ func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) task := kernel.TaskFromContext(ctx) if task == nil { // No task? Linux does not have an analog for this case, but - // tty_check_change is more of a blacklist of cases than a - // whitelist, and is surprisingly permissive. Allowing the - // change seems most appropriate. + // tty_check_change only blocks specific cases and is + // surprisingly permissive. Allowing the change seems + // appropriate. return nil } diff --git a/test/runner/runner.go b/test/runner/runner.go index f8baf61b0..5456e46a6 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -391,12 +391,12 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { } } -// filterEnv returns an environment with the blacklisted variables removed. -func filterEnv(env, blacklist []string) []string { +// filterEnv returns an environment with the excluded variables removed. +func filterEnv(env, exclude []string) []string { var out []string for _, kv := range env { ok := true - for _, k := range blacklist { + for _, k := range exclude { if strings.HasPrefix(kv, k+"=") { ok = false break diff --git a/tools/nogo/matchers.go b/tools/nogo/matchers.go index bc5772303..32f099925 100644 --- a/tools/nogo/matchers.go +++ b/tools/nogo/matchers.go @@ -89,7 +89,7 @@ func (r resultExcluded) ShouldReport(d analysis.Diagnostic, _ *token.FileSet) bo return false } } - return true // Not blacklisted. + return true // Not excluded. } // andMatcher is a composite matcher. -- cgit v1.2.3 From 6e0c170522279ca72119b17c41e2e1dc93c49d6a Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 17 Jun 2020 21:21:08 -0700 Subject: Implement Sync() to directories Updates #1035, #1199 PiperOrigin-RevId: 317028108 --- pkg/sentry/fsimpl/gofer/directory.go | 5 +++++ pkg/sentry/fsimpl/host/host.go | 3 ++- pkg/sentry/fsimpl/overlay/directory.go | 22 ++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 5 ----- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 6 ++++++ 5 files changed, 35 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index b98218753..480510b2a 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -299,3 +299,8 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in return 0, syserror.EINVAL } } + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *directoryFD) Sync(ctx context.Context) error { + return fd.dentry().handle.sync(ctx) +} diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 2b084860e..a3a312edb 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -690,7 +690,8 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // Sync implements FileDescriptionImpl. func (f *fileDescription) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData optimization, so we always sync everything. + // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData + // optimization, so we always sync everything. return unix.Fsync(f.inode.hostFD) } diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index 6f47167d3..f5c2462a5 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -263,3 +263,25 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in return 0, syserror.EINVAL } } + +// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper +// layer, if there is one. The lower layer doesn't need to sync because it +// never changes. +func (fd *directoryFD) Sync(ctx context.Context) error { + d := fd.dentry() + if !d.isCopiedUp() { + return nil + } + vfsObj := d.fs.vfsfs.VirtualFilesystem() + pop := vfs.PathOperation{ + Root: d.upperVD, + Start: d.upperVD, + } + upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) + if err != nil { + return err + } + err = upperFD.Sync(ctx) + upperFD.DecRef() + return err +} diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 77447b32c..bfd9c5995 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -360,11 +360,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( return offset, nil } -// Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *regularFileFD) Sync(ctx context.Context) error { - return nil -} - // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index d0a3e1a5c..a94333ee0 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -778,3 +778,9 @@ func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) } + +// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all +// filesystem state is in-memory. +func (*fileDescription) Sync(context.Context) error { + return nil +} -- cgit v1.2.3 From 3970c127434817304f67a2ad192cbe8094ad3353 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Thu, 18 Jun 2020 09:02:14 -0700 Subject: Remove various uses of 'whitelist' Updates #2972 PiperOrigin-RevId: 317113059 --- pkg/bpf/interpreter_test.go | 2 +- pkg/cpuid/cpuid_arm64.go | 5 +++-- pkg/cpuid/cpuid_x86.go | 7 +++---- pkg/seccomp/seccomp_rules.go | 4 ++-- pkg/sentry/fs/filesystems.go | 14 -------------- pkg/sentry/fsimpl/host/host.go | 5 +++-- pkg/sentry/socket/hostinet/socket.go | 8 ++++---- pkg/sentry/vfs/README.md | 2 -- test/syscalls/linux/xattr.cc | 5 +++-- tools/nogo/matchers.go | 27 ++++++++++++++++----------- 10 files changed, 35 insertions(+), 44 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/bpf/interpreter_test.go b/pkg/bpf/interpreter_test.go index 547921d0a..c85d786b9 100644 --- a/pkg/bpf/interpreter_test.go +++ b/pkg/bpf/interpreter_test.go @@ -767,7 +767,7 @@ func TestSimpleFilter(t *testing.T) { expectedRet: 0, }, { - desc: "Whitelisted syscall is allowed", + desc: "Allowed syscall is indeed allowed", seccompData: seccompData{nr: 231 /* __NR_exit_group */, arch: 0xc000003e}, expectedRet: 0x7fff0000, }, diff --git a/pkg/cpuid/cpuid_arm64.go b/pkg/cpuid/cpuid_arm64.go index 08381c1c0..ac7bb6774 100644 --- a/pkg/cpuid/cpuid_arm64.go +++ b/pkg/cpuid/cpuid_arm64.go @@ -312,8 +312,9 @@ func HostFeatureSet() *FeatureSet { } } -// Reads bogomips from host /proc/cpuinfo. Must run before whitelisting. -// This value is used to create the fake /proc/cpuinfo from a FeatureSet. +// Reads bogomips from host /proc/cpuinfo. Must run before syscall filter +// installation. This value is used to create the fake /proc/cpuinfo from a +// FeatureSet. func initCPUInfo() { cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") if err != nil { diff --git a/pkg/cpuid/cpuid_x86.go b/pkg/cpuid/cpuid_x86.go index 562f8f405..17a89c00d 100644 --- a/pkg/cpuid/cpuid_x86.go +++ b/pkg/cpuid/cpuid_x86.go @@ -1057,9 +1057,9 @@ func HostFeatureSet() *FeatureSet { } } -// Reads max cpu frequency from host /proc/cpuinfo. Must run before -// whitelisting. This value is used to create the fake /proc/cpuinfo from a -// FeatureSet. +// Reads max cpu frequency from host /proc/cpuinfo. Must run before syscall +// filter installation. This value is used to create the fake /proc/cpuinfo +// from a FeatureSet. func initCPUFreq() { cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") if err != nil { @@ -1106,7 +1106,6 @@ func initFeaturesFromString() { } func init() { - // initCpuFreq must be run before whitelists are enabled. initCPUFreq() initFeaturesFromString() } diff --git a/pkg/seccomp/seccomp_rules.go b/pkg/seccomp/seccomp_rules.go index 06308cd29..a52dc1b4e 100644 --- a/pkg/seccomp/seccomp_rules.go +++ b/pkg/seccomp/seccomp_rules.go @@ -56,7 +56,7 @@ func (a AllowValue) String() (s string) { return fmt.Sprintf("%#x ", uintptr(a)) } -// Rule stores the whitelist of syscall arguments. +// Rule stores the allowed syscall arguments. // // For example: // rule := Rule { @@ -82,7 +82,7 @@ func (r Rule) String() (s string) { return } -// SyscallRules stores a map of OR'ed whitelist rules indexed by the syscall number. +// SyscallRules stores a map of OR'ed argument rules indexed by the syscall number. // If the 'Rules' is empty, we treat it as any argument is allowed. // // For example: diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go index 084da2a8d..d41f30bbb 100644 --- a/pkg/sentry/fs/filesystems.go +++ b/pkg/sentry/fs/filesystems.go @@ -87,20 +87,6 @@ func RegisterFilesystem(f Filesystem) { filesystems.registered[f.Name()] = f } -// UnregisterFilesystem removes a file system from the global set. To keep the -// file system set compatible with save/restore, UnregisterFilesystem must be -// called before save/restore methods. -// -// For instance, packages may unregister their file system after it is mounted. -// This makes sense for pseudo file systems that should not be visible or -// mountable. See whitelistfs in fs/host/fs.go for one example. -func UnregisterFilesystem(name string) { - filesystems.mu.Lock() - defer filesystems.mu.Unlock() - - delete(filesystems.registered, name) -} - // FindFilesystem returns a Filesystem registered at name or (nil, false) if name // is not a file system type that can be found in /proc/filesystems. func FindFilesystem(name string) (Filesystem, bool) { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index a3a312edb..43a173bc9 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -476,8 +476,9 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) } - // TODO(gvisor.dev/issue/1672): Whitelist specific file types here, so that - // we don't allow importing arbitrary file types without proper support. + // TODO(gvisor.dev/issue/1672): Allow only specific file types here, so + // that we don't allow importing arbitrary file types without proper + // support. if i.isTTY { fd := &TTYFileDescription{ fileDescription: fileDescription{inode: i}, diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index c11e82c10..a92aed2c9 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -324,7 +324,7 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr return nil, syserr.ErrInvalidArgument } - // Whitelist options and constrain option length. + // Only allow known and safe options. optlen := getSockOptLen(t, level, name) switch level { case linux.SOL_IP: @@ -369,7 +369,7 @@ func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr // SetSockOpt implements socket.Socket.SetSockOpt. func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error { - // Whitelist options and constrain option length. + // Only allow known and safe options. optlen := setSockOptLen(t, level, name) switch level { case linux.SOL_IP: @@ -415,7 +415,7 @@ func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt [] // RecvMsg implements socket.Socket.RecvMsg. func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { - // Whitelist flags. + // Only allow known and safe flags. // // FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary // messages that gvisor/pkg/tcpip/transport/unix doesn't understand. Kill the @@ -537,7 +537,7 @@ func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags // SendMsg implements socket.Socket.SendMsg. func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { - // Whitelist flags. + // Only allow known and safe flags. if flags&^(syscall.MSG_DONTWAIT|syscall.MSG_EOR|syscall.MSG_FASTOPEN|syscall.MSG_MORE|syscall.MSG_NOSIGNAL) != 0 { return 0, syserr.ErrInvalidArgument } diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md index 66f3105bd..4b9faf2ea 100644 --- a/pkg/sentry/vfs/README.md +++ b/pkg/sentry/vfs/README.md @@ -169,8 +169,6 @@ This construction, which is essentially a type-safe analogue to Linux's - binder, which is similarly far too incomplete to use. - - whitelistfs, which we are already actively attempting to remove. - - Save/restore. For instance, it is unclear if the current implementation of the `state` package supports the inheritance pattern described above. diff --git a/test/syscalls/linux/xattr.cc b/test/syscalls/linux/xattr.cc index 3231732ec..cbcf08451 100644 --- a/test/syscalls/linux/xattr.cc +++ b/test/syscalls/linux/xattr.cc @@ -73,9 +73,10 @@ TEST_F(XattrTest, XattrLargeName) { std::string name = "user."; name += std::string(XATTR_NAME_MAX - name.length(), 'a'); - // An xattr should be whitelisted before it can be accessed--do not allow - // arbitrary xattrs to be read/written in gVisor. if (!IsRunningOnGvisor()) { + // In gVisor, access to xattrs is controlled with an explicit list of + // allowed names. This name isn't going to be configured to allow access, so + // don't test it. EXPECT_THAT(setxattr(path, name.c_str(), nullptr, 0, /*flags=*/0), SyscallSucceeds()); EXPECT_THAT(getxattr(path, name.c_str(), nullptr, 0), diff --git a/tools/nogo/matchers.go b/tools/nogo/matchers.go index 32f099925..57a250501 100644 --- a/tools/nogo/matchers.go +++ b/tools/nogo/matchers.go @@ -27,10 +27,15 @@ type matcher interface { ShouldReport(d analysis.Diagnostic, fs *token.FileSet) bool } -// pathRegexps excludes explicit paths. +// pathRegexps filters explicit paths. type pathRegexps struct { - expr []*regexp.Regexp - whitelist bool + expr []*regexp.Regexp + + // include, if true, indicates that paths matching any regexp in expr + // match. + // + // If false, paths matching no regexps in expr match. + include bool } // buildRegexps builds a list of regular expressions. @@ -49,33 +54,33 @@ func (p *pathRegexps) ShouldReport(d analysis.Diagnostic, fs *token.FileSet) boo fullPos := fs.Position(d.Pos).String() for _, path := range p.expr { if path.MatchString(fullPos) { - return p.whitelist + return p.include } } - return !p.whitelist + return !p.include } // internalExcluded excludes specific internal paths. func internalExcluded(paths ...string) *pathRegexps { return &pathRegexps{ - expr: buildRegexps(internalPrefix, paths...), - whitelist: false, + expr: buildRegexps(internalPrefix, paths...), + include: false, } } // excludedExcluded excludes specific external paths. func externalExcluded(paths ...string) *pathRegexps { return &pathRegexps{ - expr: buildRegexps(externalPrefix, paths...), - whitelist: false, + expr: buildRegexps(externalPrefix, paths...), + include: false, } } // internalMatches returns a path matcher for internal packages. func internalMatches() *pathRegexps { return &pathRegexps{ - expr: buildRegexps(internalPrefix, ".*"), - whitelist: true, + expr: buildRegexps(internalPrefix, ".*"), + include: true, } } -- cgit v1.2.3 From 408f3d2cd64cae6b2f76a940c76236e9841c095f Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 18 Jun 2020 22:00:56 -0700 Subject: Fix vfs2 tmpfs link permission checks. Updates #2923. PiperOrigin-RevId: 317246916 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 16 ++++++++++------ pkg/sentry/vfs/permissions.go | 31 +++++++++++++++++++++++++++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/link.cc | 15 +++++++++++++-- test/util/test_util.h | 1 + 5 files changed, 56 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 72399b321..ac359cf7b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -237,18 +237,22 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EXDEV } d := vd.Dentry().Impl().(*dentry) - if d.inode.isDir() { + i := d.inode + if i.isDir() { return syserror.EPERM } - if d.inode.nlink == 0 { + if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + return err + } + if i.nlink == 0 { return syserror.ENOENT } - if d.inode.nlink == maxLinks { + if i.nlink == maxLinks { return syserror.EMLINK } - d.inode.incLinksLocked() - d.inode.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) - parentDir.insertChildLocked(fs.newDentry(d.inode), name) + i.incLinksLocked() + i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) + parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f9647f90e..afe2be8d7 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -94,6 +94,37 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linu return syserror.EACCES } +// MayLink determines whether creating a hard link to a file with the given +// mode, kuid, and kgid is permitted. +// +// This corresponds to Linux's fs/namei.c:may_linkat. +func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + // Source inode owner can hardlink all they like; otherwise, it must be a + // safe source. + if CanActAsOwner(creds, kuid) { + return nil + } + + // Only regular files can be hard linked. + if mode.FileType() != linux.S_IFREG { + return syserror.EPERM + } + + // Setuid files should not get pinned to the filesystem. + if mode&linux.S_ISUID != 0 { + return syserror.EPERM + } + + // Executable setgid files should not get pinned to the filesystem, but we + // don't support S_IXGRP anyway. + + // Hardlinking to unreadable or unwritable sources is dangerous. + if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { + return syserror.EPERM + } + return nil +} + // AccessTypesForOpenFlags returns the access types required to open a file // with the given OpenOptions.Flags. Note that this is NOT the same thing as // the set of accesses permitted for the opened file: diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 1638a11c7..65a6a7f37 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -305,6 +305,7 @@ syscall_test( add_overlay = True, test = "//test/syscalls/linux:link_test", use_tmpfs = True, # gofer needs CAP_DAC_READ_SEARCH to use AT_EMPTY_PATH with linkat(2) + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/link.cc b/test/syscalls/linux/link.cc index e74fa2ed5..544681168 100644 --- a/test/syscalls/linux/link.cc +++ b/test/syscalls/linux/link.cc @@ -79,8 +79,13 @@ TEST(LinkTest, PermissionDenied) { // Make the file "unsafe" to link by making it only readable, but not // writable. - const auto oldfile = + const auto unwriteable_file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0400)); + const std::string special_path = NewTempAbsPath(); + ASSERT_THAT(mkfifo(special_path.c_str(), 0666), SyscallSucceeds()); + const auto setuid_file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileMode(0666 | S_ISUID)); + const std::string newname = NewTempAbsPath(); // Do setuid in a separate thread so that after finishing this test, the @@ -97,8 +102,14 @@ TEST(LinkTest, PermissionDenied) { EXPECT_THAT(syscall(SYS_setuid, absl::GetFlag(FLAGS_scratch_uid)), SyscallSucceeds()); - EXPECT_THAT(link(oldfile.path().c_str(), newname.c_str()), + EXPECT_THAT(link(unwriteable_file.path().c_str(), newname.c_str()), + SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(link(special_path.c_str(), newname.c_str()), SyscallFailsWithErrno(EPERM)); + if (!IsRunningWithVFS1()) { + EXPECT_THAT(link(setuid_file.path().c_str(), newname.c_str()), + SyscallFailsWithErrno(EPERM)); + } }); } diff --git a/test/util/test_util.h b/test/util/test_util.h index 8e3245b27..e635827e6 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -220,6 +220,7 @@ constexpr char kKVM[] = "kvm"; bool IsRunningOnGvisor(); const std::string GvisorPlatform(); bool IsRunningWithHostinet(); +// TODO(gvisor.dev/issue/1624): Delete once VFS1 is gone. bool IsRunningWithVFS1(); #ifdef __linux__ -- cgit v1.2.3 From 46957ed24f21396683cf9aff13fa0cd3086ea466 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 05:55:35 -0700 Subject: Fix synthetic file bugs in gofer fs. Always check if a synthetic file already exists at a location before creating a file there, and do not try to delete synthetic gofer files from the remote fs. This fixes runsc_ptrace socket tests that create/unlink synthetic, named socket files. Updates #2923. PiperOrigin-RevId: 317293648 --- pkg/sentry/fsimpl/gofer/filesystem.go | 19 +++++++++++-------- test/syscalls/BUILD | 15 +++++++++++++++ 2 files changed, 26 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 3c467e313..21eb976cb 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -374,13 +374,16 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir return nil } if fs.opts.interop == InteropModeShared { - // The existence of a dentry at name would be inconclusive because the - // file it represents may have been deleted from the remote filesystem, - // so we would need to make an RPC to revalidate the dentry. Just - // attempt the file creation RPC instead. If a file does exist, the RPC - // will fail with EEXIST like we would have. If the RPC succeeds, and a - // stale dentry exists, the dentry will fail revalidation next time - // it's used. + if child := parent.children[name]; child != nil && child.isSynthetic() { + return syserror.EEXIST + } + // The existence of a non-synthetic dentry at name would be inconclusive + // because the file it represents may have been deleted from the remote + // filesystem, so we would need to make an RPC to revalidate the dentry. + // Just attempt the file creation RPC instead. If a file does exist, the + // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a + // stale dentry exists, the dentry will fail revalidation next time it's + // used. return createInRemoteDir(parent, name) } if child := parent.children[name]; child != nil { @@ -518,7 +521,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b if child == nil { return syserror.ENOENT } - } else { + } else if child == nil || !child.isSynthetic() { err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 65a6a7f37..7b0248a39 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -57,6 +57,7 @@ syscall_test( size = "large", add_overlay = True, test = "//test/syscalls/linux:bind_test", + vfs2 = "True", ) syscall_test( @@ -700,6 +701,7 @@ syscall_test( size = "medium", add_overlay = True, test = "//test/syscalls/linux:socket_filesystem_non_blocking_test", + vfs2 = "True", ) syscall_test( @@ -707,12 +709,14 @@ syscall_test( add_overlay = True, shard_count = 50, test = "//test/syscalls/linux:socket_filesystem_test", + vfs2 = "True", ) syscall_test( size = "large", shard_count = 50, test = "//test/syscalls/linux:socket_inet_loopback_test", + vfs2 = "True", ) syscall_test( @@ -721,6 +725,7 @@ syscall_test( # Takes too long for TSAN. Creates a lot of TCP sockets. tags = ["nogotsan"], test = "//test/syscalls/linux:socket_inet_loopback_nogotsan_test", + vfs2 = "True", ) syscall_test( @@ -796,6 +801,7 @@ syscall_test( syscall_test( test = "//test/syscalls/linux:socket_blocking_local_test", + vfs2 = "True", ) syscall_test( @@ -805,6 +811,7 @@ syscall_test( syscall_test( test = "//test/syscalls/linux:socket_non_stream_blocking_local_test", + vfs2 = "True", ) syscall_test( @@ -815,6 +822,7 @@ syscall_test( syscall_test( size = "large", test = "//test/syscalls/linux:socket_stream_blocking_local_test", + vfs2 = "True", ) syscall_test( @@ -826,11 +834,13 @@ syscall_test( syscall_test( size = "medium", test = "//test/syscalls/linux:socket_stream_local_test", + vfs2 = "True", ) syscall_test( size = "medium", test = "//test/syscalls/linux:socket_stream_nonblock_local_test", + vfs2 = "True", ) syscall_test( @@ -838,6 +848,7 @@ syscall_test( size = "enormous", shard_count = 5, test = "//test/syscalls/linux:socket_unix_dgram_local_test", + vfs2 = "True", ) syscall_test( @@ -859,11 +870,13 @@ syscall_test( size = "enormous", shard_count = 5, test = "//test/syscalls/linux:socket_unix_seqpacket_local_test", + vfs2 = "True", ) syscall_test( size = "medium", test = "//test/syscalls/linux:socket_unix_stream_test", + vfs2 = "True", ) syscall_test( @@ -881,6 +894,7 @@ syscall_test( syscall_test( size = "medium", test = "//test/syscalls/linux:socket_unix_unbound_filesystem_test", + vfs2 = "True", ) syscall_test( @@ -894,6 +908,7 @@ syscall_test( size = "large", shard_count = 50, test = "//test/syscalls/linux:socket_unix_unbound_stream_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From 6b69b955d7613ff391984661a7269eabc86020e3 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 06:37:40 -0700 Subject: Support all seek options in gofer specialFileFD.Seek. Updates #2923. PiperOrigin-RevId: 317298186 --- pkg/sentry/fsimpl/gofer/regular_file.go | 16 ++++++++++++---- pkg/sentry/fsimpl/gofer/special_file.go | 19 +++++-------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 0d10cf7ac..f7a15a2ed 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -489,15 +489,24 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() + newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) + if err != nil { + return 0, err + } + fd.off = newOffset + return newOffset, nil +} + +// Calculate the new offset for a seek operation on a regular file. +func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { switch whence { case linux.SEEK_SET: // Use offset as specified. case linux.SEEK_CUR: - offset += fd.off + offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. - d := fd.dentry() - if fd.filesystem().opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { if err := d.updateFromGetattr(ctx); err != nil { return 0, err } @@ -525,7 +534,6 @@ func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) ( if offset < 0 { return 0, syserror.EINVAL } - fd.off = offset return offset, nil } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index e6e29b329..a92008208 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -221,21 +221,12 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( } fd.mu.Lock() defer fd.mu.Unlock() - switch whence { - case linux.SEEK_SET: - // Use offset as given. - case linux.SEEK_CUR: - offset += fd.off - default: - // SEEK_END, SEEK_DATA, and SEEK_HOLE aren't supported since it's not - // clear that file size is even meaningful for these files. - return 0, syserror.EINVAL - } - if offset < 0 { - return 0, syserror.EINVAL + newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) + if err != nil { + return 0, err } - fd.off = offset - return offset, nil + fd.off = newOffset + return newOffset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. -- cgit v1.2.3 From f40d023ad6f8c19898ca105842a88961b3c2994c Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 08:44:26 -0700 Subject: Don't adjust parent link count if we replace a child dir with another. Updates #2923. PiperOrigin-RevId: 317314460 --- pkg/sentry/fsimpl/gofer/filesystem.go | 3 ++- test/syscalls/BUILD | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 21eb976cb..5501781ac 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1196,7 +1196,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.cachedMetadataAuthoritative() { newParent.dirents = nil newParent.touchCMtime() - if renamed.isDir() { + if renamed.isDir() && (replaced == nil || !replaced.isDir()) { + // Increase the link count if we did not replace another directory. newParent.incLinks() } } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 7b0248a39..131f09ab4 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -572,6 +572,7 @@ syscall_test( size = "medium", add_overlay = True, test = "//test/syscalls/linux:rename_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From a609fff9d1516a095341c2016ec36f952550a46f Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 10:18:35 -0700 Subject: Fix vfs2 handling of preadv2/pwritev2 flags. Check for unsupported flags, and silently support RWF_HIPRI by doing nothing. From pkg/abi/linux/file.go: "gVisor does not implement the RWF_HIPRI feature, but the flag is accepted as a valid flag argument for preadv2/pwritev2." Updates #2923. PiperOrigin-RevId: 317330631 --- pkg/sentry/fsimpl/gofer/regular_file.go | 9 +++++++-- pkg/sentry/fsimpl/gofer/special_file.go | 8 ++++++-- pkg/sentry/fsimpl/tmpfs/regular_file.go | 12 ++++++++++++ test/syscalls/BUILD | 1 + 4 files changed, 26 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index f7a15a2ed..404a452c4 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -72,7 +72,9 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -123,9 +125,12 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } + limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a92008208..a016cbae1 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -129,7 +129,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if fd.seekable && offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -173,7 +175,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if fd.seekable && offset < 0 { return 0, syserror.EINVAL } - if opts.Flags != 0 { + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index bfd9c5995..b805aadd0 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -279,6 +279,12 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs if offset < 0 { return 0, syserror.EINVAL } + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + if dst.NumBytes() == 0 { return 0, nil } @@ -304,6 +310,12 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off if offset < 0 { return 0, syserror.EINVAL } + + // Check that flags are supported. Silently ignore RWF_HIPRI. + if opts.Flags&^linux.RWF_HIPRI != 0 { + return 0, syserror.EOPNOTSUPP + } + srclen := src.NumBytes() if srclen == 0 { return 0, nil diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 131f09ab4..9555dc308 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -463,6 +463,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:preadv2_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From 8655fb72482e179923987759f378543b2f489f08 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 13:35:02 -0700 Subject: Fix vfs2 proc/self/fd dirent iteration. Make proc/self/fd iteration work properly. Also, the comment on kernfs.Inode.IterDirents did not accurately reflect how parameters should be used/were used in kernfs.Inode impls other than fdDir. Updates #2923. PiperOrigin-RevId: 317370325 --- pkg/sentry/fsimpl/kernfs/kernfs.go | 8 ++++---- pkg/sentry/fsimpl/proc/task_fds.go | 3 +-- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index bbee8ccda..07a9dc830 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -425,10 +425,10 @@ type inodeDynamicLookup interface { // IterDirents is used to iterate over dynamically created entries. It invokes // cb on each entry in the directory represented by the FileDescription. // 'offset' is the offset for the entire IterDirents call, which may include - // results from the caller. 'relOffset' is the offset inside the entries - // returned by this IterDirents invocation. In other words, - // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, - // while 'relOffset' is the place where iteration should start from. + // results from the caller (e.g. "." and ".."). 'relOffset' is the offset + // inside the entries returned by this IterDirents invocation. In other words, + // 'offset' should be used to calculate each vfs.Dirent.NextOff as well as + // the return value, while 'relOffset' is the place to start iteration. IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index 7debdb07a..fea29e5f0 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -64,7 +64,7 @@ type fdDir struct { } // IterDirents implements kernfs.inodeDynamicLookup. -func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, absOffset, relOffset int64) (int64, error) { +func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { @@ -72,7 +72,6 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, abs } }) - offset := absOffset + relOffset typ := uint8(linux.DT_REG) if i.produceSymlink { typ = linux.DT_LNK -- cgit v1.2.3 From ad9f4691741cfada0ae09f73053d6195d43465ae Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 19 Jun 2020 14:39:31 -0700 Subject: Fix bugs in vfs2 to make symlink tests pass. - Return ENOENT if target path is empty. - Make sure open(2) with O_CREAT|O_EXCL returns EEXIST when necessary. - Correctly update atime in tmpfs using touchATime(). Updates #2923. PiperOrigin-RevId: 317382655 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 7 +++++-- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ac359cf7b..637d84e04 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -79,7 +79,7 @@ afterSymlink: } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // Symlink traversal updates access time. - atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds()) + child.inode.touchAtime(rp.Mount()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } @@ -372,6 +372,9 @@ afterTrailingSymlink: parentDir.inode.touchCMtime() return fd, nil } + if mustCreate { + return nil, syserror.EEXIST + } // Is the file mounted over? if err := rp.CheckMount(&child.vfsd); err != nil { return nil, err @@ -379,7 +382,7 @@ afterTrailingSymlink: // Do we need to resolve a trailing symlink? if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { // Symlink traversal updates access time. - atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds()) + child.inode.touchAtime(rp.Mount()) if err := rp.HandleSymlink(symlink.target); err != nil { return nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 46d3e189c..5dac77e4d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -313,6 +313,9 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } + if len(target) == 0 { + return syserror.ENOENT + } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { return err -- cgit v1.2.3 From 00928d142dd580c44a392e8e51246b543dc4f957 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sun, 21 Jun 2020 21:46:57 -0700 Subject: Fix vfs2 extended attributes. Correct behavior when given zero size arguments and trying to set user.* xattrs on files other than regular files or directories. Updates #2923. PiperOrigin-RevId: 317590409 --- pkg/sentry/fsimpl/gofer/gofer.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index d8ae475ed..43c8153a4 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1206,7 +1206,7 @@ func (d *dentry) setDeleted() { // We only support xattrs prefixed with "user." (see b/148380782). Currently, // there is no need to expose any other xattrs through a gofer. func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { - if d.file.isNil() { + if d.file.isNil() || !d.userXattrSupported() { return nil, nil } xattrMap, err := d.file.listXattr(ctx, size) @@ -1232,6 +1232,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { return "", syserror.EOPNOTSUPP } + if !d.userXattrSupported() { + return "", syserror.ENODATA + } return d.file.getXattr(ctx, opts.Name, opts.Size) } @@ -1245,6 +1248,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) { return syserror.EOPNOTSUPP } + if !d.userXattrSupported() { + return syserror.EPERM + } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } @@ -1258,9 +1264,19 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { return syserror.EOPNOTSUPP } + if !d.userXattrSupported() { + return syserror.EPERM + } return d.file.removeXattr(ctx, name) } +// Extended attributes in the user.* namespace are only supported for regular +// files and directories. +func (d *dentry) userXattrSupported() bool { + filetype := linux.S_IFMT & atomic.LoadUint32(&d.mode) + return filetype == linux.S_IFREG || filetype == linux.S_IFDIR +} + // Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with -- cgit v1.2.3 From 4573e7d863d59d59c6a4f72f396f72b0f6458cb2 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 22 Jun 2020 11:38:25 -0700 Subject: Check for invalid trailing / when traversing path in gofer OpenAt. Updates #2923. PiperOrigin-RevId: 317700049 --- pkg/sentry/fsimpl/gofer/filesystem.go | 8 +++++--- test/syscalls/BUILD | 1 + test/syscalls/linux/open.cc | 6 ++++++ 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 5501781ac..f065c4bad 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -767,15 +767,17 @@ afterTrailingSymlink: parent.dirMu.Unlock() return fd, err } + parent.dirMu.Unlock() if err != nil { - parent.dirMu.Unlock() return nil, err } - // Open existing child or follow symlink. - parent.dirMu.Unlock() if mustCreate { return nil, syserror.EEXIST } + if !child.isDir() && rp.MustBeDir() { + return nil, syserror.ENOTDIR + } + // Open existing child or follow symlink. if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 46a98a4cd..f94c383ae 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -181,6 +181,7 @@ syscall_test( size = "medium", add_overlay = True, test = "//test/syscalls/linux:exec_binary_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 670c0284b..bb7d108e8 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -439,6 +439,12 @@ TEST_F(OpenTest, CanTruncateWithStrangePermissions) { EXPECT_THAT(close(fd), SyscallSucceeds()); } +TEST_F(OpenTest, OpenNonDirectoryWithTrailingSlash) { + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const std::string bad_path = file.path() + "/"; + EXPECT_THAT(open(bad_path.c_str(), O_RDONLY), SyscallFailsWithErrno(ENOTDIR)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 38d7b2fe5630a8f3169cfef7703921c4bc4056c2 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Mon, 22 Jun 2020 21:29:31 -0700 Subject: Only allow regular files, sockets, pipes, and char devices to be imported. PiperOrigin-RevId: 317796028 --- pkg/sentry/fsimpl/host/host.go | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 43a173bc9..a65543028 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -462,7 +462,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND - if fileType == syscall.S_IFSOCK { + switch fileType { + case syscall.S_IFSOCK: if i.isTTY { log.Warningf("cannot use host socket fd %d as TTY", i.hostFD) return nil, syserror.ENOTTY @@ -474,31 +475,33 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u } // Currently, we only allow Unix sockets to be imported. return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks) - } - // TODO(gvisor.dev/issue/1672): Allow only specific file types here, so - // that we don't allow importing arbitrary file types without proper - // support. - if i.isTTY { - fd := &TTYFileDescription{ - fileDescription: fileDescription{inode: i}, - termios: linux.DefaultSlaveTermios, + case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR: + if i.isTTY { + fd := &TTYFileDescription{ + fileDescription: fileDescription{inode: i}, + termios: linux.DefaultSlaveTermios, + } + fd.LockFD.Init(&i.locks) + vfsfd := &fd.vfsfd + if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { + return nil, err + } + return vfsfd, nil } + + fd := &fileDescription{inode: i} fd.LockFD.Init(&i.locks) vfsfd := &fd.vfsfd if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return vfsfd, nil - } - fd := &fileDescription{inode: i} - fd.LockFD.Init(&i.locks) - vfsfd := &fd.vfsfd - if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil { - return nil, err + default: + log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType) + return nil, syserror.EPERM } - return vfsfd, nil } // fileDescription is embedded by host fd implementations of FileDescriptionImpl. -- cgit v1.2.3 From 0c628c3152a727fff287a98897d83ee45ad990e5 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 23 Jun 2020 16:11:31 -0700 Subject: Support inotify in vfs2 gofer fs. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because there is no inode structure stored in the sandbox, inotify watches must be held on the dentry. This would be an issue in the presence of hard links, where multiple dentries would need to share the same set of watches, but in VFS2, we do not support the internal creation of hard links on gofer fs. As a result, we make the assumption that every dentry corresponds to a unique inode. Furthermore, dentries can be cached and then evicted, even if the underlying file has not be deleted. We must prevent this from occurring if there are any watches that would be lost. Note that if the dentry was deleted or invalidated (d.vfsd.IsDead()), we should still destroy it along with its watches. Additionally, when a dentry’s last watch is removed, we cache it if it also has zero references. This way, the dentry can eventually be evicted from memory if it is no longer needed. This is accomplished with a new dentry method, OnZeroWatches(), which is called by Inotify.RmWatch and Inotify.Release. Note that it must be called after all inotify locks are released to avoid violating lock order. Stress tests are added to make sure that inotify operations don't deadlock with gofer.OnZeroWatches. Updates #1479. PiperOrigin-RevId: 317958034 --- pkg/sentry/fsimpl/ext/dentry.go | 11 +- pkg/sentry/fsimpl/gofer/directory.go | 1 + pkg/sentry/fsimpl/gofer/filesystem.go | 59 ++++++- pkg/sentry/fsimpl/gofer/gofer.go | 73 +++++++-- pkg/sentry/fsimpl/kernfs/kernfs.go | 7 +- pkg/sentry/fsimpl/overlay/overlay.go | 5 + pkg/sentry/fsimpl/tmpfs/directory.go | 4 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 12 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 19 ++- pkg/sentry/syscalls/linux/vfs2/inotify.go | 7 +- pkg/sentry/vfs/anonfs.go | 7 +- pkg/sentry/vfs/dentry.go | 32 +++- pkg/sentry/vfs/inotify.go | 108 +++++++++---- pkg/sentry/vfs/vfs.go | 3 + test/syscalls/linux/BUILD | 1 + test/syscalls/linux/inotify.cc | 251 +++++++++++++++++++++++++++++- 16 files changed, 523 insertions(+), 77 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 6bd1a9fc6..55902322a 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -63,12 +63,17 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +// TODO(b/134676337): Implement inotify. +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // -// TODO(gvisor.dev/issue/1479): Implement inotify. +// TODO(b/134676337): Implement inotify. func (d *dentry) Watches() *vfs.Watches { return nil } + +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +// +// TODO(b/134676337): Implement inotify. +func (d *dentry) OnZeroWatches() {} diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 480510b2a..5d83fe363 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -138,6 +138,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } + d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index f065c4bad..0321fd384 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -371,6 +371,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } parent.touchCMtime() parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) return nil } if fs.opts.interop == InteropModeShared { @@ -400,6 +405,11 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir } parent.touchCMtime() parent.dirents = nil + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) return nil } @@ -530,6 +540,18 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return err } } + + // Generate inotify events for rmdir or unlink. + if dir { + parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) + } else { + var cw *vfs.Watches + if child != nil { + cw = &child.watches + } + vfs.InotifyRemoveChild(cw, &parent.watches, name) + } + if child != nil { vfsObj.CommitDeleteDentry(&child.vfsd) child.setDeleted() @@ -1018,6 +1040,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } + d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) return childVFSFD, nil } @@ -1203,6 +1226,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParent.incLinks() } } + vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) return nil } @@ -1215,12 +1239,21 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + if err := d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) return err } - return d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()) + fs.renameMuRUnlockAndCheckCaching(&ds) + + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + d.InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil } // StatAt implements vfs.FilesystemImpl.StatAt. @@ -1344,24 +1377,38 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) return err } - return d.setxattr(ctx, rp.Credentials(), &opts) + if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + fs.renameMuRUnlockAndCheckCaching(&ds) + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) + return err + } + if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { + fs.renameMuRUnlockAndCheckCaching(&ds) return err } - return d.removexattr(ctx, rp.Credentials(), name) + fs.renameMuRUnlockAndCheckCaching(&ds) + + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 43c8153a4..c6c284ff3 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -665,6 +665,9 @@ type dentry struct { pipe *pipe.VFSPipe locks vfs.FileLocks + + // Inotify watches for this dentry. + watches vfs.Watches } // dentryAttrMask returns a p9.AttrMask enabling all attributes used by the @@ -947,6 +950,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } else { atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) } + // Restore mask bits that we cleared earlier. + stat.Mask |= linux.STATX_ATIME } if setLocalMtime { if stat.Mtime.Nsec == linux.UTIME_NOW { @@ -954,6 +959,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } else { atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) } + // Restore mask bits that we cleared earlier. + stat.Mask |= linux.STATX_MTIME } atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { @@ -1051,15 +1058,34 @@ func (d *dentry) decRefLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { + if d.isDir() { + events |= linux.IN_ISDIR + } + + d.fs.renameMu.RLock() + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.watches.NotifyWithExclusions(d.name, events, cookie, et, d.isDeleted()) + } + d.watches.Notify("", events, cookie, et) + d.fs.renameMu.RUnlock() +} // Watches implements vfs.DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *dentry) Watches() *vfs.Watches { - return nil + return &d.watches +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +// +// If no watches are left on this dentry and it has no references, cache it. +func (d *dentry) OnZeroWatches() { + if atomic.LoadInt64(&d.refs) == 0 { + d.fs.renameMu.Lock() + d.checkCachingLocked() + d.fs.renameMu.Unlock() + } } // checkCachingLocked should be called after d's reference count becomes 0 or it @@ -1093,6 +1119,9 @@ func (d *dentry) checkCachingLocked() { // Deleted and invalidated dentries with zero references are no longer // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { + if d.isDeleted() { + d.watches.HandleDeletion() + } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- @@ -1101,6 +1130,14 @@ func (d *dentry) checkCachingLocked() { d.destroyLocked() return } + // If d still has inotify watches and it is not deleted or invalidated, we + // cannot cache it and allow it to be evicted. Otherwise, we will lose its + // watches, even if a new dentry is created for the same file in the future. + // Note that the size of d.watches cannot concurrently transition from zero + // to non-zero, because adding a watch requires holding a reference on d. + if d.watches.Size() > 0 { + return + } // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.cachedDentries.Remove(d) @@ -1277,7 +1314,7 @@ func (d *dentry) userXattrSupported() bool { return filetype == linux.S_IFREG || filetype == linux.S_IFDIR } -// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1422,7 +1459,13 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()) + if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()); err != nil { + return err + } + if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { + fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent) + } + return nil } // Listxattr implements vfs.FileDescriptionImpl.Listxattr. @@ -1437,12 +1480,22 @@ func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOption // Setxattr implements vfs.FileDescriptionImpl.Setxattr. func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error { - return fd.dentry().setxattr(ctx, auth.CredentialsFromContext(ctx), &opts) + d := fd.dentry() + if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { + return err + } + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // Removexattr implements vfs.FileDescriptionImpl.Removexattr. func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { - return fd.dentry().removexattr(ctx, auth.CredentialsFromContext(ctx), name) + d := fd.dentry() + if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { + return err + } + d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + return nil } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 07a9dc830..55349f2a3 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -228,7 +228,7 @@ func (d *Dentry) destroy() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) {} +func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // @@ -237,6 +237,11 @@ func (d *Dentry) Watches() *vfs.Watches { return nil } +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *Dentry) OnZeroWatches() {} + // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on // its own isn't sufficient to insert a child into a directory. InsertChild diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e11a3ff19..e720d4825 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -528,6 +528,11 @@ func (d *dentry) Watches() *vfs.Watches { return nil } +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *dentry) OnZeroWatches() {} + // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 913b8a6c5..b172abc15 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -79,7 +79,6 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() - child.unlinked = true } type directoryFD struct { @@ -107,13 +106,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs := fd.filesystem() dir := fd.inode().impl.(*directory) + defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() defer fs.mu.RUnlock() dir.iterMu.Lock() defer dir.iterMu.Unlock() - fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 637d84e04..85d8e37b2 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -638,14 +638,16 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - defer fs.mu.RUnlock() d, err := resolveLocked(rp) if err != nil { + fs.mu.RUnlock() return err } if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + fs.mu.RUnlock() return err } + fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ev, 0, vfs.InodeEvent) @@ -788,14 +790,16 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() - defer fs.mu.RUnlock() d, err := resolveLocked(rp) if err != nil { + fs.mu.RUnlock() return err } if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil { + fs.mu.RUnlock() return err } + fs.mu.RUnlock() d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil @@ -804,14 +808,16 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - defer fs.mu.RUnlock() d, err := resolveLocked(rp) if err != nil { + fs.mu.RUnlock() return err } if err := d.inode.removexattr(rp.Credentials(), name); err != nil { + fs.mu.RUnlock() return err } + fs.mu.RUnlock() d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index a94333ee0..a85bfc968 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -215,11 +215,6 @@ type dentry struct { // filesystem.mu. name string - // unlinked indicates whether this dentry has been unlinked from its parent. - // It is only set to true on an unlink operation, and never set from true to - // false. unlinked is protected by filesystem.mu. - unlinked bool - // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry @@ -259,18 +254,20 @@ func (d *dentry) DecRef() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } + d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - // Note that d.parent or d.name may be stale if there is a concurrent - // rename operation. Inotify does not provide consistency guarantees. - d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.unlinked) + // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.vfsd.IsDead()) } d.inode.watches.Notify("", events, cookie, et) + d.inode.fs.mu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. @@ -278,6 +275,9 @@ func (d *dentry) Watches() *vfs.Watches { return &d.inode.watches } +// OnZeroWatches implements vfs.Dentry.OnZeroWatches. +func (d *dentry) OnZeroWatches() {} + // inode represents a filesystem object. type inode struct { // fs is the owning filesystem. fs is immutable. @@ -336,7 +336,6 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth i.ctime = now i.mtime = now // i.nlink initialized by caller - i.watches = vfs.Watches{} i.impl = impl } diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 7d50b6a16..74e5287b7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -116,8 +116,11 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern } defer d.DecRef() - fd = ino.AddWatch(d.Dentry(), mask) - return uintptr(fd), nil, err + fd, err = ino.AddWatch(d.Dentry(), mask) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil } // InotifyRmWatch implements the inotify_rm_watch() syscall. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index b7c6b60b8..746af03ec 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -301,7 +301,7 @@ func (d *anonDentry) DecRef() { // InotifyWithParent implements DentryImpl.InotifyWithParent. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventType) {} +func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. // @@ -309,3 +309,8 @@ func (d *anonDentry) InotifyWithParent(events uint32, cookie uint32, et EventTyp func (d *anonDentry) Watches() *Watches { return nil } + +// OnZeroWatches implements Dentry.OnZeroWatches. +// +// TODO(gvisor.dev/issue/1479): Implement inotify. +func (d *anonDentry) OnZeroWatches() {} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 24af13eb1..cea3e6955 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -113,12 +113,29 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events uint32, cookie uint32, et EventType) + InotifyWithParent(events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file // share the same watches. + // + // Watches may return nil if the dentry belongs to a FilesystemImpl that + // does not support inotify. If an implementation returns a non-nil watch + // set, it must always return a non-nil watch set. Likewise, if an + // implementation returns a nil watch set, it must always return a nil watch + // set. + // + // The caller does not need to hold a reference on the dentry. Watches() *Watches + + // OnZeroWatches is called whenever the number of watches on a dentry drops + // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage + // dentry lifetime. + // + // The caller does not need to hold a reference on the dentry. OnZeroWatches + // may acquire inotify locks, so to prevent deadlock, no inotify locks should + // be held by the caller. + OnZeroWatches() } // IncRef increments d's reference count. @@ -149,17 +166,26 @@ func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } -// InotifyWithParent notifies all watches on the inodes for this dentry and +// InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. -func (d *Dentry) InotifyWithParent(events uint32, cookie uint32, et EventType) { +func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { d.impl.InotifyWithParent(events, cookie, et) } // Watches returns the set of inotify watches associated with d. +// +// Watches will return nil if d belongs to a FilesystemImpl that does not +// support inotify. func (d *Dentry) Watches() *Watches { return d.impl.Watches() } +// OnZeroWatches performs cleanup tasks whenever the number of watches on a +// dentry drops to zero. +func (d *Dentry) OnZeroWatches() { + d.impl.OnZeroWatches() +} + // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 7fa7d2d0c..36e7a3767 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -49,9 +49,6 @@ const ( // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements FileDescriptionImpl. // -// Lock ordering: -// Inotify.mu -> Watches.mu -> Inotify.evMu -// // +stateify savable type Inotify struct { vfsfd FileDescription @@ -122,17 +119,31 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. func (i *Inotify) Release() { + var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.handleDeletion from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all // references to it below. i.mu.Lock() - defer i.mu.Unlock() for _, w := range i.watches { // Remove references to the watch from the watches set on the target. We // don't need to worry about the references from i.watches, since this // file description is about to be destroyed. - w.set.Remove(i.id) + d := w.target + ws := d.Watches() + // Watchable dentries should never return a nil watch set. + if ws == nil { + panic("Cannot remove watch from an unwatchable dentry") + } + ws.Remove(i.id) + if ws.Size() == 0 { + ds = append(ds, d) + } + } + i.mu.Unlock() + + for _, d := range ds { + d.OnZeroWatches() } } @@ -272,20 +283,19 @@ func (i *Inotify) queueEvent(ev *Event) { // newWatchLocked creates and adds a new watch to target. // -// Precondition: i.mu must be locked. -func (i *Inotify) newWatchLocked(target *Dentry, mask uint32) *Watch { - targetWatches := target.Watches() +// Precondition: i.mu must be locked. ws must be the watch set for target d. +func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { w := &Watch{ - owner: i, - wd: i.nextWatchIDLocked(), - set: targetWatches, - mask: mask, + owner: i, + wd: i.nextWatchIDLocked(), + target: d, + mask: mask, } // Hold the watch in this inotify instance as well as the watch set on the // target. i.watches[w.wd] = w - targetWatches.Add(w) + ws.Add(w) return w } @@ -312,7 +322,9 @@ func (i *Inotify) handleDeletion(w *Watch) { // AddWatch constructs a new inotify watch and adds it to the target. It // returns the watch descriptor returned by inotify_add_watch(2). -func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { +// +// The caller must hold a reference on target. +func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to @@ -320,8 +332,14 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { i.mu.Lock() defer i.mu.Unlock() + ws := target.Watches() + if ws == nil { + // While Linux supports inotify watches on all filesystem types, watches on + // filesystems like kernfs are not generally useful, so we do not. + return 0, syserror.EPERM + } // Does the target already have a watch from this inotify instance? - if existing := target.Watches().Lookup(i.id); existing != nil { + if existing := ws.Lookup(i.id); existing != nil { newmask := mask if mask&linux.IN_MASK_ADD != 0 { // "Add (OR) events to watch mask for this pathname if it already @@ -329,12 +347,12 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) int32 { newmask |= atomic.LoadUint32(&existing.mask) } atomic.StoreUint32(&existing.mask, newmask) - return existing.wd + return existing.wd, nil } // No existing watch, create a new watch. - w := i.newWatchLocked(target, mask) - return w.wd + w := i.newWatchLocked(target, ws, mask) + return w.wd, nil } // RmWatch looks up an inotify watch for the given 'wd' and configures the @@ -353,9 +371,19 @@ func (i *Inotify) RmWatch(wd int32) error { delete(i.watches, wd) // Remove the watch from the watch target. - w.set.Remove(w.OwnerID()) + ws := w.target.Watches() + // AddWatch ensures that w.target has a non-nil watch set. + if ws == nil { + panic("Watched dentry cannot have nil watch set") + } + ws.Remove(w.OwnerID()) + remaining := ws.Size() i.mu.Unlock() + if remaining == 0 { + w.target.OnZeroWatches() + } + // Generate the event for the removal. i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) @@ -374,6 +402,13 @@ type Watches struct { ws map[uint64]*Watch } +// Size returns the number of watches held by w. +func (w *Watches) Size() int { + w.mu.Lock() + defer w.mu.Unlock() + return len(w.ws) +} + // Lookup returns the watch owned by an inotify instance with the given id. // Returns nil if no such watch exists. // @@ -459,10 +494,6 @@ func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et Ev func (w *Watches) HandleDeletion() { w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) - // TODO(gvisor.dev/issue/1479): This doesn't work because maps are not copied - // by value. Ideally, we wouldn't have this circular locking so we can just - // notify of IN_DELETE_SELF in the same loop below. - // // We can't hold w.mu while calling watch.handleDeletion to preserve lock // ordering w.r.t to the owner inotify instances. Instead, atomically move // the watches map into a local variable so we can iterate over it safely. @@ -495,9 +526,8 @@ type Watch struct { // Descriptor for this watch. This is unique across an inotify instance. wd int32 - // set is the watch set containing this watch. It belongs to the target file - // of this watch. - set *Watches + // target is a dentry representing the watch target. Its watch set contains this watch. + target *Dentry // Events being monitored via this watch. Must be accessed with atomic // memory operations. @@ -606,7 +636,7 @@ func (e *Event) setName(name string) { func (e *Event) sizeOf() int { s := inotifyEventBaseSize + int(e.len) if s < inotifyEventBaseSize { - panic("overflow") + panic("Overflowed event size") } return s } @@ -676,11 +706,15 @@ func InotifyEventFromStatMask(mask uint32) uint32 { } // InotifyRemoveChild sends the appriopriate notifications to the watch sets of -// the child being removed and its parent. +// the child being removed and its parent. Note that unlike most pairs of +// parent/child notifications, the child is notified first in this case. func InotifyRemoveChild(self, parent *Watches, name string) { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) - // TODO(gvisor.dev/issue/1479): implement IN_EXCL_UNLINK. + if self != nil { + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) + } + if parent != nil { + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) + } } // InotifyRename sends the appriopriate notifications to the watch sets of the @@ -691,8 +725,14 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, dirEv = linux.IN_ISDIR } cookie := uniqueid.InotifyCookie(ctx) - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) + if oldParent != nil { + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) + } + if newParent != nil { + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) + } // Somewhat surprisingly, self move events do not have a cookie. - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) + if renamed != nil { + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) + } } diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 9acca8bc7..58c7ad778 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -24,6 +24,9 @@ // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // EpollInstance.mu +// Inotify.mu +// Watches.mu +// Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 078e4a284..7282d675e 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -974,6 +974,7 @@ cc_binary( "//test/util:thread_util", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", ], ) diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 1d1a7171d..13e096e9c 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -29,6 +29,7 @@ #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" +#include "absl/synchronization/mutex.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "test/util/epoll_util.h" @@ -1305,7 +1306,7 @@ TEST(Inotify, UtimesGeneratesAttribEvent) { const int wd = ASSERT_NO_ERRNO_AND_VALUE( InotifyAddWatch(fd.get(), root.path(), IN_ALL_EVENTS)); - struct timeval times[2] = {{1, 0}, {2, 0}}; + const struct timeval times[2] = {{1, 0}, {2, 0}}; EXPECT_THAT(futimes(file1_fd.get(), times), SyscallSucceeds()); const std::vector events = @@ -2018,7 +2019,7 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { Event(IN_MODIFY, wd, Basename(file.path())), })); - struct timeval times[2] = {{1, 0}, {2, 0}}; + const struct timeval times[2] = {{1, 0}, {2, 0}}; ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds()); events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); @@ -2030,6 +2031,252 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); } +// This test helps verify that the lock order of filesystem and inotify locks +// is respected when inotify instances and watch targets are concurrently being +// destroyed. +TEST(InotifyTest, InotifyAndTargetDestructionDoNotDeadlock_NoRandomSave) { + const DisableSave ds; // Too many syscalls. + + // A file descriptor protected by a mutex. This ensures that while a + // descriptor is in use, it cannot be closed and reused for a different file + // description. + struct atomic_fd { + int fd; + absl::Mutex mu; + }; + + // Set up initial inotify instances. + constexpr int num_fds = 3; + std::vector fds(num_fds); + for (int i = 0; i < num_fds; i++) { + int fd; + ASSERT_THAT(fd = inotify_init1(IN_NONBLOCK), SyscallSucceeds()); + fds[i].fd = fd; + } + + // Set up initial watch targets. + std::vector paths; + for (int i = 0; i < 3; i++) { + paths.push_back(NewTempAbsPath()); + ASSERT_THAT(mknod(paths[i].c_str(), S_IFREG | 0600, 0), SyscallSucceeds()); + } + + constexpr absl::Duration runtime = absl::Seconds(4); + + // Constantly replace each inotify instance with a new one. + auto replace_fds = [&] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + for (auto& afd : fds) { + int new_fd; + ASSERT_THAT(new_fd = inotify_init1(IN_NONBLOCK), SyscallSucceeds()); + absl::MutexLock l(&afd.mu); + ASSERT_THAT(close(afd.fd), SyscallSucceeds()); + afd.fd = new_fd; + for (auto& p : paths) { + // inotify_add_watch may fail if the file at p was deleted. + ASSERT_THAT(inotify_add_watch(afd.fd, p.c_str(), IN_ALL_EVENTS), + AnyOf(SyscallSucceeds(), SyscallFailsWithErrno(ENOENT))); + } + } + sched_yield(); + } + }; + + std::list ts; + for (int i = 0; i < 3; i++) { + ts.emplace_back(replace_fds); + } + + // Constantly replace each watch target with a new one. + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + for (auto& p : paths) { + ASSERT_THAT(unlink(p.c_str()), SyscallSucceeds()); + ASSERT_THAT(mknod(p.c_str(), S_IFREG | 0600, 0), SyscallSucceeds()); + } + sched_yield(); + } +} + +// This test helps verify that the lock order of filesystem and inotify locks +// is respected when adding/removing watches occurs concurrently with the +// removal of their targets. +TEST(InotifyTest, AddRemoveUnlinkDoNotDeadlock_NoRandomSave) { + const DisableSave ds; // Too many syscalls. + + // Set up inotify instances. + constexpr int num_fds = 3; + std::vector fds(num_fds); + for (int i = 0; i < num_fds; i++) { + ASSERT_THAT(fds[i] = inotify_init1(IN_NONBLOCK), SyscallSucceeds()); + } + + // Set up initial watch targets. + std::vector paths; + for (int i = 0; i < 3; i++) { + paths.push_back(NewTempAbsPath()); + ASSERT_THAT(mknod(paths[i].c_str(), S_IFREG | 0600, 0), SyscallSucceeds()); + } + + constexpr absl::Duration runtime = absl::Seconds(1); + + // Constantly add/remove watches for each inotify instance/watch target pair. + auto add_remove_watches = [&] { + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + for (int fd : fds) { + for (auto& p : paths) { + // Do not assert on inotify_add_watch and inotify_rm_watch. They may + // fail if the file at p was deleted. inotify_add_watch may also fail + // if another thread beat us to adding a watch. + const int wd = inotify_add_watch(fd, p.c_str(), IN_ALL_EVENTS); + if (wd > 0) { + inotify_rm_watch(fd, wd); + } + } + } + sched_yield(); + } + }; + + std::list ts; + for (int i = 0; i < 15; i++) { + ts.emplace_back(add_remove_watches); + } + + // Constantly replace each watch target with a new one. + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + for (auto& p : paths) { + ASSERT_THAT(unlink(p.c_str()), SyscallSucceeds()); + ASSERT_THAT(mknod(p.c_str(), S_IFREG | 0600, 0), SyscallSucceeds()); + } + sched_yield(); + } +} + +// This test helps verify that the lock order of filesystem and inotify locks +// is respected when many inotify events and filesystem operations occur +// simultaneously. +TEST(InotifyTest, NotifyNoDeadlock_NoRandomSave) { + const DisableSave ds; // Too many syscalls. + + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + const std::string dir = parent.path(); + + // mu protects file, which will change on rename. + absl::Mutex mu; + std::string file = NewTempAbsPathInDir(dir); + ASSERT_THAT(mknod(file.c_str(), 0644 | S_IFREG, 0), SyscallSucceeds()); + + const absl::Duration runtime = absl::Milliseconds(300); + + // Add/remove watches on dir and file. + ScopedThread add_remove_watches([&] { + const FileDescriptor ifd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + int dir_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(ifd.get(), dir, IN_ALL_EVENTS)); + int file_wd; + { + absl::ReaderMutexLock l(&mu); + file_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(ifd.get(), file, IN_ALL_EVENTS)); + } + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + ASSERT_THAT(inotify_rm_watch(ifd.get(), file_wd), SyscallSucceeds()); + ASSERT_THAT(inotify_rm_watch(ifd.get(), dir_wd), SyscallSucceeds()); + dir_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(ifd.get(), dir, IN_ALL_EVENTS)); + { + absl::ReaderMutexLock l(&mu); + file_wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(ifd.get(), file, IN_ALL_EVENTS)); + } + sched_yield(); + } + }); + + // Modify attributes on dir and file. + ScopedThread stats([&] { + int fd, dir_fd; + { + absl::ReaderMutexLock l(&mu); + ASSERT_THAT(fd = open(file.c_str(), O_RDONLY), SyscallSucceeds()); + } + ASSERT_THAT(dir_fd = open(dir.c_str(), O_RDONLY | O_DIRECTORY), + SyscallSucceeds()); + const struct timeval times[2] = {{1, 0}, {2, 0}}; + + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + { + absl::ReaderMutexLock l(&mu); + EXPECT_THAT(utimes(file.c_str(), times), SyscallSucceeds()); + } + EXPECT_THAT(futimes(fd, times), SyscallSucceeds()); + EXPECT_THAT(utimes(dir.c_str(), times), SyscallSucceeds()); + EXPECT_THAT(futimes(dir_fd, times), SyscallSucceeds()); + sched_yield(); + } + }); + + // Modify extended attributes on dir and file. + ScopedThread xattrs([&] { + // TODO(gvisor.dev/issue/1636): Support extended attributes in runsc gofer. + if (!IsRunningOnGvisor()) { + int fd; + { + absl::ReaderMutexLock l(&mu); + ASSERT_THAT(fd = open(file.c_str(), O_RDONLY), SyscallSucceeds()); + } + + const char* name = "user.test"; + int val = 123; + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + { + absl::ReaderMutexLock l(&mu); + ASSERT_THAT( + setxattr(file.c_str(), name, &val, sizeof(val), /*flags=*/0), + SyscallSucceeds()); + ASSERT_THAT(removexattr(file.c_str(), name), SyscallSucceeds()); + } + + ASSERT_THAT(fsetxattr(fd, name, &val, sizeof(val), /*flags=*/0), + SyscallSucceeds()); + ASSERT_THAT(fremovexattr(fd, name), SyscallSucceeds()); + sched_yield(); + } + } + }); + + // Read and write file's contents. Read and write dir's entries. + ScopedThread read_write([&] { + int fd; + { + absl::ReaderMutexLock l(&mu); + ASSERT_THAT(fd = open(file.c_str(), O_RDWR), SyscallSucceeds()); + } + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + int val = 123; + ASSERT_THAT(write(fd, &val, sizeof(val)), SyscallSucceeds()); + ASSERT_THAT(read(fd, &val, sizeof(val)), SyscallSucceeds()); + TempPath new_file = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir)); + ASSERT_NO_ERRNO(ListDir(dir, false)); + new_file.reset(); + sched_yield(); + } + }); + + // Rename file. + for (auto start = absl::Now(); absl::Now() - start < runtime;) { + const std::string new_path = NewTempAbsPathInDir(dir); + { + absl::WriterMutexLock l(&mu); + ASSERT_THAT(rename(file.c_str(), new_path.c_str()), SyscallSucceeds()); + file = new_path; + } + sched_yield(); + } +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 65a587dedf1a30b3614a66532d2b448026b9c540 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 23 Jun 2020 18:31:53 -0700 Subject: Complete inotify IN_EXCL_UNLINK implementation in VFS2. Events were only skipped on parent directories after their children were unlinked; events on the unlinked file itself need to be skipped as well. As a result, all Watches.Notify() calls need to know whether the dentry where the call came from was unlinked. Updates #1479. PiperOrigin-RevId: 317979476 --- pkg/sentry/fsimpl/gofer/filesystem.go | 8 ++-- pkg/sentry/fsimpl/gofer/gofer.go | 4 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 8 ++-- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 10 +++-- pkg/sentry/vfs/inotify.go | 39 +++++++--------- test/syscalls/linux/inotify.cc | 84 ++++++++++++++++++++++++++--------- 6 files changed, 95 insertions(+), 58 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 0321fd384..d253c996c 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -375,7 +375,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) + parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if fs.opts.interop == InteropModeShared { @@ -409,7 +409,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) + parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -543,7 +543,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. if dir { - parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) + parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { @@ -1040,7 +1040,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) + d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index c6c284ff3..b0b6d8c64 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1066,9 +1066,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.watches.NotifyWithExclusions(d.name, events, cookie, et, d.isDeleted()) + d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify("", events, cookie, et) + d.watches.Notify("", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 85d8e37b2..f766b7ce2 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent) + parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } @@ -251,7 +251,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } i.incLinksLocked() - i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent) + i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) @@ -368,7 +368,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent) + parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } @@ -625,7 +625,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent) + parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. child.inode.decLinksLocked() child.inode.decLinksLocked() diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index a85bfc968..d7f4f0779 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -259,14 +259,16 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { events |= linux.IN_ISDIR } + // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates + // that d was deleted. + deleted := d.vfsd.IsDead() + d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates - // that d was deleted. - d.parent.inode.watches.NotifyWithExclusions(d.name, events, cookie, et, d.vfsd.IsDead()) + d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted) } - d.inode.watches.Notify("", events, cookie, et) + d.inode.watches.Notify("", events, cookie, et, deleted) d.inode.fs.mu.RUnlock() } diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 36e7a3767..6043a0566 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -467,21 +467,16 @@ func (w *Watches) Remove(id uint64) { delete(w.ws, id) } -// Notify queues a new event with all watches in this set. -func (w *Watches) Notify(name string, events, cookie uint32, et EventType) { - w.NotifyWithExclusions(name, events, cookie, et, false) -} - -// NotifyWithExclusions queues a new event with watches in this set. Watches -// with IN_EXCL_UNLINK are skipped if the event is coming from a child that -// has been unlinked. -func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et EventType, unlinked bool) { +// Notify queues a new event with watches in this set. Watches with +// IN_EXCL_UNLINK are skipped if the event is coming from a child that has been +// unlinked. +func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { // N.B. We don't defer the unlocks because Notify is in the hot path of // all IO operations, and the defer costs too much for small IO // operations. w.mu.RLock() for _, watch := range w.ws { - if unlinked && watch.ExcludeUnlinkedChildren() && et == PathEvent { + if unlinked && watch.ExcludeUnlinked() && et == PathEvent { continue } watch.Notify(name, events, cookie) @@ -492,7 +487,7 @@ func (w *Watches) NotifyWithExclusions(name string, events, cookie uint32, et Ev // HandleDeletion is called when the watch target is destroyed to emit // the appropriate events. func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent) + w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // We can't hold w.mu while calling watch.handleDeletion to preserve lock // ordering w.r.t to the owner inotify instances. Instead, atomically move @@ -539,14 +534,12 @@ func (w *Watch) OwnerID() uint64 { return w.owner.id } -// ExcludeUnlinkedChildren indicates whether the watched object should continue -// to be notified of events of its children after they have been unlinked, e.g. -// for an open file descriptor. +// ExcludeUnlinked indicates whether the watched object should continue to be +// notified of events originating from a path that has been unlinked. // -// TODO(gvisor.dev/issue/1479): Implement IN_EXCL_UNLINK. -// We can do this by keeping track of the set of unlinked children in Watches -// to skip notification. -func (w *Watch) ExcludeUnlinkedChildren() bool { +// For example, if "foo/bar" is opened and then unlinked, operations on the +// open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. +func (w *Watch) ExcludeUnlinked() bool { return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 } @@ -710,10 +703,10 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // parent/child notifications, the child is notified first in this case. func InotifyRemoveChild(self, parent *Watches, name string) { if self != nil { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent) + self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) } if parent != nil { - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent) + parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) } } @@ -726,13 +719,13 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, } cookie := uniqueid.InotifyCookie(ctx) if oldParent != nil { - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent) + oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) } if newParent != nil { - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent) + newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) } // Somewhat surprisingly, self move events do not have a cookie. if renamed != nil { - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent) + renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) } } diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 13e096e9c..731c7046c 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1840,9 +1840,7 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( TempPath::CreateFileWith(dir.path(), "123", TempPath::kDefaultFileMode)); - - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); const FileDescriptor inotify_fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); @@ -1855,7 +1853,7 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { int val = 0; ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); - const std::vector events = + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); EXPECT_THAT(events, Are({ Event(IN_ATTRIB, file_wd), @@ -1865,6 +1863,15 @@ TEST(Inotify, IncludeUnlinkedFile_NoRandomSave) { Event(IN_MODIFY, dir_wd, Basename(file.path())), Event(IN_MODIFY, file_wd), })); + + fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_CLOSE_WRITE, dir_wd, Basename(file.path())), + Event(IN_CLOSE_WRITE, file_wd), + Event(IN_DELETE_SELF, file_wd), + Event(IN_IGNORED, file_wd), + })); } // Watches created with IN_EXCL_UNLINK will stop emitting events on fds for @@ -1881,13 +1888,14 @@ TEST(Inotify, ExcludeUnlink_NoRandomSave) { const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dir.path())); - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); const FileDescriptor inotify_fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), file.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); // Unlink the child, which should cause further operations on the open file // descriptor to be ignored. @@ -1895,14 +1903,28 @@ TEST(Inotify, ExcludeUnlink_NoRandomSave) { int val = 0; ASSERT_THAT(write(fd.get(), &val, sizeof(val)), SyscallSucceeds()); ASSERT_THAT(read(fd.get(), &val, sizeof(val)), SyscallSucceeds()); - const std::vector events = + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); - EXPECT_THAT(events, Are({Event(IN_DELETE, wd, Basename(file.path()))})); + EXPECT_THAT(events, Are({ + Event(IN_ATTRIB, file_wd), + Event(IN_DELETE, dir_wd, Basename(file.path())), + })); + + fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + ASSERT_THAT(events, Are({ + Event(IN_DELETE_SELF, file_wd), + Event(IN_IGNORED, file_wd), + })); } // We need to disable S/R because there are filesystems where we cannot re-open // fds to an unlinked file across S/R, e.g. gofer-backed filesytems. TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) { + // TODO(gvisor.dev/issue/1624): This test fails on VFS1. Remove once VFS1 is + // deleted. + SKIP_IF(IsRunningWithVFS1()); + const DisableSave ds; const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); @@ -1912,20 +1934,29 @@ TEST(Inotify, ExcludeUnlinkDirectory_NoRandomSave) { const FileDescriptor inotify_fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const FileDescriptor fd = + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(dirPath.c_str(), O_RDONLY | O_DIRECTORY)); - const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + const int parent_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( inotify_fd.get(), parent.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + const int self_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); // Unlink the dir, and then close the open fd. ASSERT_THAT(rmdir(dirPath.c_str()), SyscallSucceeds()); dir.reset(); - const std::vector events = + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); // No close event should appear. ASSERT_THAT(events, - Are({Event(IN_DELETE | IN_ISDIR, wd, Basename(dirPath))})); + Are({Event(IN_DELETE | IN_ISDIR, parent_wd, Basename(dirPath))})); + + fd.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + ASSERT_THAT(events, Are({ + Event(IN_DELETE_SELF, self_wd), + Event(IN_IGNORED, self_wd), + })); } // If "dir/child" and "dir/child2" are links to the same file, and "dir/child" @@ -1989,10 +2020,6 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); - const FileDescriptor inotify_fd = - ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - const int wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( - inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); // NOTE(b/157163751): Create another link before unlinking. This is needed for // the gofer filesystem in gVisor, where open fds will not work once the link @@ -2006,6 +2033,13 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { ASSERT_THAT(rc, SyscallSucceeds()); } + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + const int dir_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), dir.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + const int file_wd = ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch( + inotify_fd.get(), file.path(), IN_ALL_EVENTS | IN_EXCL_UNLINK)); + // Even after unlinking, inode-level operations will trigger events regardless // of IN_EXCL_UNLINK. ASSERT_THAT(unlink(file.path().c_str()), SyscallSucceeds()); @@ -2015,20 +2049,28 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); EXPECT_THAT(events, Are({ - Event(IN_DELETE, wd, Basename(file.path())), - Event(IN_MODIFY, wd, Basename(file.path())), + Event(IN_ATTRIB, file_wd), + Event(IN_DELETE, dir_wd, Basename(file.path())), + Event(IN_MODIFY, dir_wd, Basename(file.path())), + Event(IN_MODIFY, file_wd), })); const struct timeval times[2] = {{1, 0}, {2, 0}}; ASSERT_THAT(futimes(fd.get(), times), SyscallSucceeds()); events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); - EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); + EXPECT_THAT(events, Are({ + Event(IN_ATTRIB, dir_wd, Basename(file.path())), + Event(IN_ATTRIB, file_wd), + })); // S/R is disabled on this entire test due to behavior with unlink; it must // also be disabled after this point because of fchmod. ASSERT_THAT(fchmod(fd.get(), 0777), SyscallSucceeds()); events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); - EXPECT_THAT(events, Are({Event(IN_ATTRIB, wd, Basename(file.path()))})); + EXPECT_THAT(events, Are({ + Event(IN_ATTRIB, dir_wd, Basename(file.path())), + Event(IN_ATTRIB, file_wd), + })); } // This test helps verify that the lock order of filesystem and inotify locks -- cgit v1.2.3 From 2189e0a660e8355167bee4939fffeb57f0312b5d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 23 Jun 2020 19:22:45 -0700 Subject: Clean up hostfs TODOs. This CL does a handful of things: - Support O_DSYNC, O_SYNC - Support O_APPEND and document an unavoidable race condition - Ignore O_DIRECT; we probably don't want to allow applications to set O_DIRECT on the host fd itself. - Leave a TODO for supporting O_NONBLOCK, which is a simple fix once RWF_NOWAIT is supported. - Get rid of caching TODO; force_page_cache is not configurable for host fs in vfs1 or vfs2 after whitelist fs was removed. - For the remaining TODOs, link to more specific bugs. Fixes #1672. PiperOrigin-RevId: 317985269 --- pkg/sentry/fsimpl/host/host.go | 47 ++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index a65543028..d5e73ddae 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -459,8 +459,9 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u fileType := s.Mode & linux.FileTypeMask // Constrain flags to a subset we can handle. - // TODO(gvisor.dev/issue/1672): implement behavior corresponding to these allowed flags. - flags &= syscall.O_ACCMODE | syscall.O_DIRECT | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND + // + // TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls. + flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND switch fileType { case syscall.S_IFSOCK: @@ -568,7 +569,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts } return n, err } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. + f.offsetMu.Lock() n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags) f.offset += n @@ -577,7 +578,7 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts } func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select preadv2 flags. + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if flags != 0 { return 0, syserror.EOPNOTSUPP } @@ -589,41 +590,58 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off // PWrite implements FileDescriptionImpl. func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - i := f.inode - if !i.seekable { + if !f.inode.seekable { return 0, syserror.ESPIPE } - return writeToHostFD(ctx, i.hostFD, src, offset, opts.Flags) + return f.writeToHostFD(ctx, src, offset, opts.Flags) } // Write implements FileDescriptionImpl. func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { i := f.inode if !i.seekable { - n, err := writeToHostFD(ctx, i.hostFD, src, -1, opts.Flags) + n, err := f.writeToHostFD(ctx, src, -1, opts.Flags) if isBlockError(err) { err = syserror.ErrWouldBlock } return n, err } - // TODO(gvisor.dev/issue/1672): Cache pages, when forced to do so. - // TODO(gvisor.dev/issue/1672): Write to end of file and update offset if O_APPEND is set on this file. + f.offsetMu.Lock() - n, err := writeToHostFD(ctx, i.hostFD, src, f.offset, opts.Flags) + // NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if + // another process modifies the host file between retrieving the file size + // and writing to the host fd. This is an unavoidable race condition because + // we cannot enforce synchronization on the host. + if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + var s syscall.Stat_t + if err := syscall.Fstat(i.hostFD, &s); err != nil { + f.offsetMu.Unlock() + return 0, err + } + f.offset = s.Size + } + n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags) f.offset += n f.offsetMu.Unlock() return n, err } -func writeToHostFD(ctx context.Context, hostFD int, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { - // TODO(gvisor.dev/issue/1672): Support select pwritev2 flags. +func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) { + hostFD := f.inode.hostFD + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if flags != 0 { return 0, syserror.EOPNOTSUPP } writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) n, err := src.CopyInTo(ctx, writer) hostfd.PutReadWriterAt(writer) + // NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC. + if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + if syncErr := unix.Fsync(hostFD); syncErr != nil { + return int64(n), syncErr + } + } return int64(n), err } @@ -694,8 +712,7 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i // Sync implements FileDescriptionImpl. func (f *fileDescription) Sync(context.Context) error { - // TODO(gvisor.dev/issue/1672): Currently we do not support the SyncData - // optimization, so we always sync everything. + // TODO(gvisor.dev/issue/1897): Currently, we always sync everything. return unix.Fsync(f.inode.hostFD) } -- cgit v1.2.3 From 399c52888db609296fd1341ed0daa994ad2d02b0 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 23 Jun 2020 20:04:15 -0700 Subject: Resolve remaining inotify TODOs. Also refactor HandleDeletion(). Updates #1479. PiperOrigin-RevId: 317989000 --- pkg/sentry/fsimpl/kernfs/kernfs.go | 8 +++----- pkg/sentry/vfs/anonfs.go | 8 +++----- pkg/sentry/vfs/filesystem.go | 2 -- pkg/sentry/vfs/inotify.go | 42 ++++++++++++++++++-------------------- 4 files changed, 26 insertions(+), 34 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 55349f2a3..596de1edf 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -227,19 +227,17 @@ func (d *Dentry) destroy() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *Dentry) Watches() *vfs.Watches { return nil } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *Dentry) OnZeroWatches() {} // InsertChild inserts child into the vfs dentry cache with the given name under diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 746af03ec..641e3e502 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -300,17 +300,15 @@ func (d *anonDentry) DecRef() { // InotifyWithParent implements DentryImpl.InotifyWithParent. // -// TODO(gvisor.dev/issue/1479): Implement inotify. +// Although Linux technically supports inotify on pseudo filesystems (inotify +// is implemented at the vfs layer), it is not particularly useful. It is left +// unimplemented until someone actually needs it. func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *anonDentry) Watches() *Watches { return nil } // OnZeroWatches implements Dentry.OnZeroWatches. -// -// TODO(gvisor.dev/issue/1479): Implement inotify. func (d *anonDentry) OnZeroWatches() {} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 1edd584c9..6bb9ca180 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -524,8 +524,6 @@ type FilesystemImpl interface { // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl. PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error - - // TODO(gvisor.dev/issue/1479): inotify_add_watch() } // PrependPathAtVFSRootError is returned by implementations of diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 6043a0566..35960394c 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -120,6 +120,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // watches and frees all resources for an inotify instance. func (i *Inotify) Release() { var ds []*Dentry + // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.handleDeletion from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all @@ -307,19 +308,6 @@ func (i *Inotify) nextWatchIDLocked() int32 { return i.nextWatchMinusOne } -// handleDeletion handles the deletion of the target of watch w. It removes w -// from i.watches and a watch removal event is generated. -func (i *Inotify) handleDeletion(w *Watch) { - i.mu.Lock() - _, found := i.watches[w.wd] - delete(i.watches, w.wd) - i.mu.Unlock() - - if found { - i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) - } -} - // AddWatch constructs a new inotify watch and adds it to the target. It // returns the watch descriptor returned by inotify_add_watch(2). // @@ -484,8 +472,9 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin w.mu.RUnlock() } -// HandleDeletion is called when the watch target is destroyed to emit -// the appropriate events. +// HandleDeletion is called when the watch target is destroyed. Clear the +// watch set, detach watches from the inotify instances they belong to, and +// generate the appropriate events. func (w *Watches) HandleDeletion() { w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) @@ -505,9 +494,23 @@ func (w *Watches) HandleDeletion() { w.ws = nil w.mu.Unlock() + // Remove each watch from its owner's watch set, and generate a corresponding + // watch removal event. for _, watch := range ws { - // TODO(gvisor.dev/issue/1479): consider refactoring this. - watch.handleDeletion() + i := watch.owner + i.mu.Lock() + _, found := i.watches[watch.wd] + delete(i.watches, watch.wd) + + // Release mutex before notifying waiters because we don't control what + // they can do. + i.mu.Unlock() + + // If watch was not found, it was removed from the inotify instance before + // we could get to it, in which case we should not generate an event. + if found { + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + } } } @@ -559,11 +562,6 @@ func (w *Watch) Notify(name string, events uint32, cookie uint32) { w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) } -// handleDeletion handles the deletion of w's target. -func (w *Watch) handleDeletion() { - w.owner.handleDeletion(w) -} - // Event represents a struct inotify_event from linux. // // +stateify savable -- cgit v1.2.3 From 58880bf551f4fdaeecf0a355816f1af353ef81b6 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 24 Jun 2020 16:21:53 -0700 Subject: Port /dev/net/tun device to VFS2. Updates #2912 #1035 PiperOrigin-RevId: 318162565 --- pkg/sentry/devices/tundev/BUILD | 23 +++++ pkg/sentry/devices/tundev/tundev.go | 178 +++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 16 ++- runsc/boot/BUILD | 1 + runsc/boot/vfs.go | 11 +- test/syscalls/BUILD | 1 + 6 files changed, 227 insertions(+), 3 deletions(-) create mode 100644 pkg/sentry/devices/tundev/BUILD create mode 100644 pkg/sentry/devices/tundev/tundev.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/devices/tundev/BUILD b/pkg/sentry/devices/tundev/BUILD new file mode 100644 index 000000000..71c59287c --- /dev/null +++ b/pkg/sentry/devices/tundev/BUILD @@ -0,0 +1,23 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "tundev", + srcs = ["tundev.go"], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/arch", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/socket/netstack", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/tcpip/link/tun", + "//pkg/usermem", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go new file mode 100644 index 000000000..dfbd069af --- /dev/null +++ b/pkg/sentry/devices/tundev/tundev.go @@ -0,0 +1,178 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tundev implements the /dev/net/tun device. +package tundev + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/inet" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/socket/netstack" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/tcpip/link/tun" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + netTunDevMajor = 10 + netTunDevMinor = 200 +) + +// tunDevice implements vfs.Device for /dev/net/tun. +type tunDevice struct{} + +// Open implements vfs.Device.Open. +func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd := &tunFD{} + if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// tunFD implements vfs.FileDescriptionImpl for /dev/net/tun. +type tunFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + device tun.Device +} + +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + request := args[1].Uint() + data := args[2].Pointer() + + switch request { + case linux.TUNSETIFF: + t := kernel.TaskFromContext(ctx) + if t == nil { + panic("Ioctl should be called from a task context") + } + if !t.HasCapability(linux.CAP_NET_ADMIN) { + return 0, syserror.EPERM + } + stack, ok := t.NetworkContext().(*netstack.Stack) + if !ok { + return 0, syserror.EINVAL + } + + var req linux.IFReq + if _, err := usermem.CopyObjectIn(ctx, uio, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + flags := usermem.ByteOrder.Uint16(req.Data[:]) + return 0, fd.device.SetIff(stack.Stack, req.Name(), flags) + + case linux.TUNGETIFF: + var req linux.IFReq + + copy(req.IFName[:], fd.device.Name()) + + // Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately) when + // there is no sk_filter. See __tun_chr_ioctl() in net/drivers/tun.c. + flags := fd.device.Flags() | linux.IFF_NOFILTER + usermem.ByteOrder.PutUint16(req.Data[:], flags) + + _, err := usermem.CopyObjectOut(ctx, uio, data, &req, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *tunFD) Release() { + fd.device.Release() +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return fd.Read(ctx, dst, opts) +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + data, err := fd.device.Read() + if err != nil { + return 0, err + } + n, err := dst.CopyOut(ctx, data) + if n > 0 && n < len(data) { + // Not an error for partial copying. Packet truncated. + err = nil + } + return int64(n), err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return fd.Write(ctx, src, opts) +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + data := make([]byte, src.NumBytes()) + if _, err := src.CopyIn(ctx, data); err != nil { + return 0, err + } + return fd.device.Write(data) +} + +// Readiness implements watier.Waitable.Readiness. +func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask { + return fd.device.Readiness(mask) +} + +// EventRegister implements watier.Waitable.EventRegister. +func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.device.EventRegister(e, mask) +} + +// EventUnregister implements watier.Waitable.EventUnregister. +func (fd *tunFD) EventUnregister(e *waiter.Entry) { + fd.device.EventUnregister(e) +} + +// isNetTunSupported returns whether /dev/net/tun device is supported for s. +func isNetTunSupported(s inet.Stack) bool { + _, ok := s.(*netstack.Stack) + return ok +} + +// Register registers all devices implemented by this package in vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{}) +} + +// CreateDevtmpfsFiles creates device special files in dev representing all +// devices implemented by this package. +func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { + return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */) +} diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index 142ee53b0..d0e06cdc0 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -136,6 +136,8 @@ func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation { // CreateDeviceFile creates a device special file at the given pathname in the // devtmpfs instance accessed by the Accessor. func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { + actx := a.wrapContext(ctx) + mode := (linux.FileMode)(perms) switch kind { case vfs.BlockDevice: @@ -145,12 +147,24 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v default: panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind)) } + + // Create any parent directories. See + // devtmpfs.c:handle_create()=>path_create(). + for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() { + pop := a.pathOperationAt(it.String()) + if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + return fmt.Errorf("failed to create directory %q: %v", it.String(), err) + } + } + // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't // create, which it recognizes by storing a pointer to the kdevtmpfs struct // thread in struct inode::i_private. Accessor doesn't yet support deletion // of files at all, and probably won't as long as we don't need to support // kernel modules, so this is moot for now. - return a.vfsObj.MknodAt(a.wrapContext(ctx), a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ + return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ Mode: mode, DevMajor: major, DevMinor: minor, diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index b701c65d2..45d7f7d09 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -41,6 +41,7 @@ go_library( "//pkg/sentry/control", "//pkg/sentry/devices/memdev", "//pkg/sentry/devices/ttydev", + "//pkg/sentry/devices/tundev", "//pkg/sentry/fdimport", "//pkg/sentry/fs", "//pkg/sentry/fs/dev", diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 2fdddc719..9ba5e9cd9 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -27,6 +27,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/devices/memdev" "gvisor.dev/gvisor/pkg/sentry/devices/ttydev" + "gvisor.dev/gvisor/pkg/sentry/devices/tundev" "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" @@ -79,6 +80,9 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre if err := ttydev.Register(vfsObj); err != nil { return fmt.Errorf("registering ttydev: %w", err) } + if err := tundev.Register(vfsObj); err != nil { + return fmt.Errorf("registering tundev: %v", err) + } a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name) if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) @@ -89,10 +93,13 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre return fmt.Errorf("initializing userspace: %w", err) } if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating devtmpfs files: %w", err) + return fmt.Errorf("creating memdev devtmpfs files: %w", err) } if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil { - return fmt.Errorf("creating devtmpfs files: %w", err) + return fmt.Errorf("creating ttydev devtmpfs files: %w", err) + } + if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating tundev devtmpfs files: %v", err) } return nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 315af9efe..39c1f79ad 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1013,6 +1013,7 @@ syscall_test( syscall_test( test = "//test/syscalls/linux:tuntap_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From b5e814445a4db5df7f4f58027422a5dba97ea766 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 24 Jun 2020 19:20:58 -0700 Subject: Fix procfs bugs in vfs2. - Support writing on proc/[pid]/{uid,gid}map - Return EIO for writing to static files. Updates #2923. PiperOrigin-RevId: 318188503 --- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 4 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 + pkg/sentry/fsimpl/proc/task_files.go | 60 ++++++++++++++++++++++- pkg/sentry/vfs/file_description_impl_util.go | 2 +- pkg/sentry/vfs/file_description_impl_util_test.go | 8 +-- test/syscalls/BUILD | 2 + test/syscalls/linux/proc.cc | 49 +++++++++++++++++- 7 files changed, 117 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c1215b70a..6886b0876 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -101,12 +101,12 @@ func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) } -// Read implmenets vfs.FileDescriptionImpl.Read. +// Read implements vfs.FileDescriptionImpl.Read. func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) } -// PRead implmenets vfs.FileDescriptionImpl.PRead. +// PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 650bd7b88..53aec4918 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -293,6 +293,8 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut // inode numbers are immutable after node creation. // TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps. + // Also, STATX_SIZE will need some special handling, because read-only static + // files should return EIO for truncate operations. return nil } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index ba4405026..286c23f01 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -35,6 +35,10 @@ import ( "gvisor.dev/gvisor/pkg/usermem" ) +// "There is an (arbitrary) limit on the number of lines in the file. As at +// Linux 3.18, the limit is five lines." - user_namespaces(7) +const maxIDMapLines = 5 + // mm gets the kernel task's MemoryManager. No additional reference is taken on // mm here. This is safe because MemoryManager.destroy is required to leave the // MemoryManager in a state where it's still usable as a DynamicBytesSource. @@ -283,7 +287,8 @@ func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } -// idMapData implements vfs.DynamicBytesSource for /proc/[pid]/{gid_map|uid_map}. +// idMapData implements vfs.WritableDynamicBytesSource for +// /proc/[pid]/{gid_map|uid_map}. // // +stateify savable type idMapData struct { @@ -309,6 +314,59 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "In addition, the number of bytes written to the file must be less than + // the system page size, and the write must be performed at the start of + // the file ..." - user_namespaces(7) + srclen := src.NumBytes() + if srclen >= usermem.PageSize || offset != 0 { + return 0, syserror.EINVAL + } + b := make([]byte, srclen) + if _, err := src.CopyIn(ctx, b); err != nil { + return 0, err + } + + // Truncate from the first NULL byte. + var nul int64 + nul = int64(bytes.IndexByte(b, 0)) + if nul == -1 { + nul = srclen + } + b = b[:nul] + // Remove the last \n. + if nul >= 1 && b[nul-1] == '\n' { + b = b[:nul-1] + } + lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1) + if len(lines) > maxIDMapLines { + return 0, syserror.EINVAL + } + + entries := make([]auth.IDMapEntry, len(lines)) + for i, l := range lines { + var e auth.IDMapEntry + _, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length) + if err != nil { + return 0, syserror.EINVAL + } + entries[i] = e + } + var err error + if d.gids { + err = d.task.UserNamespace().SetGIDMap(ctx, entries) + } else { + err = d.task.UserNamespace().SetUIDMap(ctx, entries) + } + if err != nil { + return 0, err + } + + // On success, Linux's kernel/user_namespace.c:map_write() always returns + // count, even if fewer bytes were used. + return int64(srclen), nil +} + // mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps. // // +stateify savable diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 1e66997ce..3fec0d6d6 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -327,7 +327,7 @@ func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { - return 0, syserror.EINVAL + return 0, syserror.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 5061f6ac9..3b7e1c273 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -155,11 +155,11 @@ func TestGenCountFD(t *testing.T) { } // Write and PWrite fails. - if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } - if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) + if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EIO { + t.Errorf("Write: got err %v, wanted %v", err, syserror.EIO) } } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 39c1f79ad..23123006f 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -479,6 +479,7 @@ syscall_test( syscall_test( size = "medium", test = "//test/syscalls/linux:proc_test", + vfs2 = "True", ) syscall_test( @@ -498,6 +499,7 @@ syscall_test( syscall_test( test = "//test/syscalls/linux:proc_pid_uid_gid_map_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/proc.cc b/test/syscalls/linux/proc.cc index e77e9820e..d6b875dbf 100644 --- a/test/syscalls/linux/proc.cc +++ b/test/syscalls/linux/proc.cc @@ -754,8 +754,53 @@ TEST(ProcCpuinfo, RequiredFieldsArePresent) { } } -TEST(ProcCpuinfo, DeniesWrite) { - EXPECT_THAT(open("/proc/cpuinfo", O_WRONLY), SyscallFailsWithErrno(EACCES)); +TEST(ProcCpuinfo, DeniesWriteNonRoot) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER))); + + // Do setuid in a separate thread so that after finishing this test, the + // process can still open files the test harness created before starting this + // test. Otherwise, the files are created by root (UID before the test), but + // cannot be opened by the `uid` set below after the test. After calling + // setuid(non-zero-UID), there is no way to get root privileges back. + ScopedThread([&] { + // Use syscall instead of glibc setuid wrapper because we want this setuid + // call to only apply to this task. POSIX threads, however, require that all + // threads have the same UIDs, so using the setuid wrapper sets all threads' + // real UID. + // Also drops capabilities. + constexpr int kNobody = 65534; + EXPECT_THAT(syscall(SYS_setuid, kNobody), SyscallSucceeds()); + EXPECT_THAT(open("/proc/cpuinfo", O_WRONLY), SyscallFailsWithErrno(EACCES)); + // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in + // kernfs. + if (!IsRunningOnGvisor() || IsRunningWithVFS1()) { + EXPECT_THAT(truncate("/proc/cpuinfo", 123), + SyscallFailsWithErrno(EACCES)); + } + }); +} + +// With root privileges, it is possible to open /proc/cpuinfo with write mode, +// but all write operations will return EIO. +TEST(ProcCpuinfo, DeniesWriteRoot) { + // VFS1 does not behave differently for root/non-root. + SKIP_IF(IsRunningWithVFS1()); + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_FOWNER))); + + int fd; + EXPECT_THAT(fd = open("/proc/cpuinfo", O_WRONLY), SyscallSucceeds()); + if (fd > 0) { + EXPECT_THAT(write(fd, "x", 1), SyscallFailsWithErrno(EIO)); + EXPECT_THAT(pwrite(fd, "x", 1, 123), SyscallFailsWithErrno(EIO)); + } + // TODO(gvisor.dev/issue/1193): Properly support setting size attributes in + // kernfs. + if (!IsRunningOnGvisor() || IsRunningWithVFS1()) { + if (fd > 0) { + EXPECT_THAT(ftruncate(fd, 123), SyscallFailsWithErrno(EIO)); + } + EXPECT_THAT(truncate("/proc/cpuinfo", 123), SyscallFailsWithErrno(EIO)); + } } // Sanity checks that uptime is present. -- cgit v1.2.3 From a63db7d90303280de9431f369e5a9c8db351a9e8 Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Wed, 17 Jun 2020 16:23:27 -0400 Subject: Moved FUSE device under the fuse directory --- pkg/sentry/devices/miscdev/BUILD | 20 ------- pkg/sentry/devices/miscdev/fuse.go | 78 -------------------------- pkg/sentry/devices/miscdev/miscdev.go | 54 ------------------ pkg/sentry/fsimpl/fuse/BUILD | 19 +++++++ pkg/sentry/fsimpl/fuse/dev.go | 100 ++++++++++++++++++++++++++++++++++ runsc/boot/BUILD | 1 + runsc/boot/vfs.go | 7 +++ test/syscalls/linux/dev.cc | 3 +- 8 files changed, 129 insertions(+), 153 deletions(-) delete mode 100644 pkg/sentry/devices/miscdev/BUILD delete mode 100644 pkg/sentry/devices/miscdev/fuse.go delete mode 100644 pkg/sentry/devices/miscdev/miscdev.go create mode 100644 pkg/sentry/fsimpl/fuse/BUILD create mode 100644 pkg/sentry/fsimpl/fuse/dev.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/devices/miscdev/BUILD b/pkg/sentry/devices/miscdev/BUILD deleted file mode 100644 index aaa76c5d2..000000000 --- a/pkg/sentry/devices/miscdev/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -licenses(["notice"]) - -go_library( - name = "miscdev", - srcs = [ - "fuse.go", - "miscdev.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/fsimpl/devtmpfs", - "//pkg/sentry/vfs", - "//pkg/syserror", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/devices/miscdev/fuse.go b/pkg/sentry/devices/miscdev/fuse.go deleted file mode 100644 index d0a963191..000000000 --- a/pkg/sentry/devices/miscdev/fuse.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package miscdev - -import ( - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/vfs" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -const fuseDevMinor = 229 - -// fuseDevice implements vfs.Device for /dev/fuse. -type fuseDevice struct{} - -// Open implements vfs.Device.Open. -func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - var fd FUSEDeviceFile - if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ - UseDentryMetadata: true, - }); err != nil { - return nil, err - } - return &fd.vfsfd, nil -} - -// FUSEDeviceFile implements vfs.FileDescriptionImpl for /dev/fuse. -type FUSEDeviceFile struct { - vfsfd vfs.FileDescription - vfs.FileDescriptionDefaultImpl - vfs.DentryMetadataFileDescriptionImpl - vfs.NoLockFD - - // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue - // and deque requests, control synchronization and establish communication - // between the FUSE kernel module and the /dev/fuse character device. -} - -// Release implements vfs.FileDescriptionImpl.Release. -func (fd *FUSEDeviceFile) Release() {} - -// PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *FUSEDeviceFile) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ENOSYS -} - -// Read implements vfs.FileDescriptionImpl.Read. -func (fd *FUSEDeviceFile) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { - return 0, syserror.ENOSYS -} - -// PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *FUSEDeviceFile) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSYS -} - -// Write implements vfs.FileDescriptionImpl.Write. -func (fd *FUSEDeviceFile) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { - return 0, syserror.ENOSYS -} - -// Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *FUSEDeviceFile) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { - return 0, syserror.ENOSYS -} diff --git a/pkg/sentry/devices/miscdev/miscdev.go b/pkg/sentry/devices/miscdev/miscdev.go deleted file mode 100644 index 500d92ed9..000000000 --- a/pkg/sentry/devices/miscdev/miscdev.go +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package miscdev implements "misc" character devices, as implemented in Linux -// by drivers/char/misc.c and fs/fuse/dev.c. -package miscdev - -import ( - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" - "gvisor.dev/gvisor/pkg/sentry/vfs" -) - -// miscDevMajor is the major device number for devices defined in this package. -const miscDevMajor = linux.MISC_MAJOR - -// Register registers all devices implemented by this package in vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - for minor, dev := range map[uint32]vfs.Device{ - fuseDevMinor: fuseDevice{}, - } { - if err := vfsObj.RegisterDevice(vfs.CharDevice, miscDevMajor, minor, dev, &vfs.RegisterDeviceOptions{ - GroupName: "misc", - }); err != nil { - return err - } - } - return nil -} - -// CreateDevtmpfsFiles creates device special files in dev representing all -// devices implemented by this package. -func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error { - for minor, name := range map[uint32]string{ - fuseDevMinor: "fuse", - } { - if err := dev.CreateDeviceFile(ctx, name, vfs.CharDevice, miscDevMajor, minor, 0666 /* mode */); err != nil { - return err - } - } - return nil -} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD new file mode 100644 index 000000000..41567967d --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -0,0 +1,19 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "fuse", + srcs = [ + "dev.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/vfs", + "//pkg/syserror", + "//pkg/usermem", + ], +) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go new file mode 100644 index 000000000..f6a67d005 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -0,0 +1,100 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" +) + +const fuseDevMinor = 229 + +// fuseDevice implements vfs.Device for /dev/fuse. +type fuseDevice struct{} + +// Open implements vfs.Device.Open. +func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + var fd DeviceFD + if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ + UseDentryMetadata: true, + }); err != nil { + return nil, err + } + return &fd.vfsfd, nil +} + +// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. +type DeviceFD struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.DentryMetadataFileDescriptionImpl + vfs.NoLockFD + + // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue + // and deque requests, control synchronization and establish communication + // between the FUSE kernel module and the /dev/fuse character device. +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *DeviceFD) Release() {} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + return 0, syserror.ENOSYS +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ENOSYS +} + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 45d7f7d09..aad2a41de 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -55,6 +55,7 @@ go_library( "//pkg/sentry/fs/user", "//pkg/sentry/fsimpl/devpts", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/fuse", "//pkg/sentry/fsimpl/gofer", "//pkg/sentry/fsimpl/host", "//pkg/sentry/fsimpl/overlay", diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 9ba5e9cd9..0f5ad1e05 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/user" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse" "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" "gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay" "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" @@ -79,6 +80,9 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre } if err := ttydev.Register(vfsObj); err != nil { return fmt.Errorf("registering ttydev: %w", err) + + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering /dev/fuse: %w", err) } if err := tundev.Register(vfsObj); err != nil { return fmt.Errorf("registering tundev: %v", err) @@ -101,6 +105,9 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating tundev devtmpfs files: %v", err) } + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating devtmpfs fuse device file: %w", err) + } return nil } diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 82b115981..6be173c14 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -165,7 +165,8 @@ TEST(DevTest, WriteDevFuse) { } TEST(DevTest, TTYExists) { - SKIP_IF(!IsRunningWithVFS1()); + // Run test if running on VFS1 or on Linux. + SKIP_IF(!IsRunningWithVFS1() && IsRunningOnGvisor()); struct stat statbuf = {}; ASSERT_THAT(stat("/dev/tty", &statbuf), SyscallSucceeds()); -- cgit v1.2.3 From 54a31e219ca9d6086a367213a92d2a72ce3af07b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Fri, 26 Jun 2020 13:46:01 -0700 Subject: Support inotify IN_ONESHOT. Also, while we're here, make sure that gofer inotify events are generated when files are created in remote revalidating mode. Updates #1479. PiperOrigin-RevId: 318536354 --- pkg/sentry/fsimpl/gofer/filesystem.go | 10 +++- pkg/sentry/syscalls/linux/vfs2/inotify.go | 2 +- pkg/sentry/vfs/inotify.go | 84 +++++++++++++++++++++++-------- test/syscalls/linux/inotify.cc | 48 ++++++++++++++++-- 4 files changed, 115 insertions(+), 29 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index d253c996c..73bac738d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -389,7 +389,15 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // RPC will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time it's // used. - return createInRemoteDir(parent, name) + if err := createInRemoteDir(parent, name); err != nil { + return err + } + ev := linux.IN_CREATE + if dir { + ev |= linux.IN_ISDIR + } + parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + return nil } if child := parent.children[name]; child != nil { return syserror.EEXIST diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 74e5287b7..5d98134a5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -81,7 +81,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern // "EINVAL: The given event mask contains no valid events." // -- inotify_add_watch(2) - if validBits := mask & linux.ALL_INOTIFY_BITS; validBits == 0 { + if mask&linux.ALL_INOTIFY_BITS == 0 { return 0, nil, syserror.EINVAL } diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 35960394c..509034531 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -447,29 +447,51 @@ func (w *Watches) Remove(id uint64) { return } - if _, ok := w.ws[id]; !ok { - // While there's technically no problem with silently ignoring a missing - // watch, this is almost certainly a bug. - panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + // It is possible for w.Remove() to be called for the same watch multiple + // times. See the treatment of one-shot watches in Watches.Notify(). + if _, ok := w.ws[id]; ok { + delete(w.ws, id) } - delete(w.ws, id) } // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { - // N.B. We don't defer the unlocks because Notify is in the hot path of - // all IO operations, and the defer costs too much for small IO - // operations. + var hasExpired bool w.mu.RLock() for _, watch := range w.ws { if unlinked && watch.ExcludeUnlinked() && et == PathEvent { continue } - watch.Notify(name, events, cookie) + if watch.Notify(name, events, cookie) { + hasExpired = true + } + } + w.mu.RUnlock() + + if hasExpired { + w.cleanupExpiredWatches() + } +} + +// This function is relatively expensive and should only be called where there +// are expired watches. +func (w *Watches) cleanupExpiredWatches() { + // Because of lock ordering, we cannot acquire Inotify.mu for each watch + // owner while holding w.mu. As a result, store expired watches locally + // before removing. + var toRemove []*Watch + w.mu.RLock() + for _, watch := range w.ws { + if atomic.LoadInt32(&watch.expired) == 1 { + toRemove = append(toRemove, watch) + } } w.mu.RUnlock() + for _, watch := range toRemove { + watch.owner.RmWatch(watch.wd) + } } // HandleDeletion is called when the watch target is destroyed. Clear the @@ -478,16 +500,10 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin func (w *Watches) HandleDeletion() { w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) - // We can't hold w.mu while calling watch.handleDeletion to preserve lock - // ordering w.r.t to the owner inotify instances. Instead, atomically move - // the watches map into a local variable so we can iterate over it safely. - // - // Because of this however, it is possible for the watches' owners to reach - // this inode while the inode has no refs. This is still safe because the - // owners can only reach the inode until this function finishes calling - // watch.handleDeletion below and the inode is guaranteed to exist in the - // meantime. But we still have to be very careful not to rely on inode state - // that may have been already destroyed. + // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for + // the owner of each watch being deleted. Instead, atomically store the + // watches map in a local variable and set it to nil so we can iterate over + // it with the assurance that there will be no concurrent accesses. var ws map[uint64]*Watch w.mu.Lock() ws = w.ws @@ -519,17 +535,28 @@ func (w *Watches) HandleDeletion() { // +stateify savable type Watch struct { // Inotify instance which owns this watch. + // + // This field is immutable after creation. owner *Inotify // Descriptor for this watch. This is unique across an inotify instance. + // + // This field is immutable after creation. wd int32 // target is a dentry representing the watch target. Its watch set contains this watch. + // + // This field is immutable after creation. target *Dentry // Events being monitored via this watch. Must be accessed with atomic // memory operations. mask uint32 + + // expired is set to 1 to indicate that this watch is a one-shot that has + // already sent a notification and therefore can be removed. Must be accessed + // with atomic memory operations. + expired int32 } // OwnerID returns the id of the inotify instance that owns this watch. @@ -546,12 +573,20 @@ func (w *Watch) ExcludeUnlinked() bool { return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 } -// Notify queues a new event on this watch. -func (w *Watch) Notify(name string, events uint32, cookie uint32) { +// Notify queues a new event on this watch. Returns true if this is a one-shot +// watch that should be deleted, after this event was successfully queued. +func (w *Watch) Notify(name string, events uint32, cookie uint32) bool { + if atomic.LoadInt32(&w.expired) == 1 { + // This is a one-shot watch that is already in the process of being + // removed. This may happen if a second event reaches the watch target + // before this watch has been removed. + return false + } + mask := atomic.LoadUint32(&w.mask) if mask&events == 0 { // We weren't watching for this event. - return + return false } // Event mask should include bits matched from the watch plus all control @@ -560,6 +595,11 @@ func (w *Watch) Notify(name string, events uint32, cookie uint32) { effectiveMask := unmaskableBits | mask matchedEvents := effectiveMask & events w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) + if mask&linux.IN_ONESHOT != 0 { + atomic.StoreInt32(&w.expired, 1) + return true + } + return false } // Event represents a struct inotify_event from linux. diff --git a/test/syscalls/linux/inotify.cc b/test/syscalls/linux/inotify.cc index 731c7046c..bdb645c35 100644 --- a/test/syscalls/linux/inotify.cc +++ b/test/syscalls/linux/inotify.cc @@ -1485,20 +1485,26 @@ TEST(Inotify, DuplicateWatchReturnsSameWatchDescriptor) { TEST(Inotify, UnmatchedEventsAreDiscarded) { const TempPath root = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - const TempPath file1 = + TempPath file1 = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(root.path())); const FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); - ASSERT_NO_ERRNO_AND_VALUE(InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS)); + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(fd.get(), file1.path(), IN_ACCESS)); - const FileDescriptor file1_fd = + FileDescriptor file1_fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file1.path(), O_WRONLY)); - const std::vector events = - ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + std::vector events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); // We only asked for access events, the open event should be discarded. ASSERT_THAT(events, Are({})); + + // IN_IGNORED events are always generated, regardless of the mask. + file1_fd.reset(); + file1.reset(); + events = ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(fd.get())); + ASSERT_THAT(events, Are({Event(IN_IGNORED, wd)})); } TEST(Inotify, AddWatchWithInvalidEventMaskFails) { @@ -2073,6 +2079,38 @@ TEST(Inotify, ExcludeUnlinkInodeEvents_NoRandomSave) { })); } +TEST(Inotify, OneShot) { + // TODO(gvisor.dev/issue/1624): IN_ONESHOT not supported in VFS1. + SKIP_IF(IsRunningWithVFS1()); + + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFile()); + const FileDescriptor inotify_fd = + ASSERT_NO_ERRNO_AND_VALUE(InotifyInit1(IN_NONBLOCK)); + + const int wd = ASSERT_NO_ERRNO_AND_VALUE( + InotifyAddWatch(inotify_fd.get(), file.path(), IN_MODIFY | IN_ONESHOT)); + + // Open an fd, write to it, and then close it. + FileDescriptor fd = ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_WRONLY)); + ASSERT_THAT(write(fd.get(), "x", 1), SyscallSucceedsWithValue(1)); + fd.reset(); + + // We should get a single event followed by IN_IGNORED indicating removal + // of the one-shot watch. Prior activity (i.e. open) that is not in the mask + // should not trigger removal, and activity after removal (i.e. close) should + // not generate events. + std::vector events = + ASSERT_NO_ERRNO_AND_VALUE(DrainEvents(inotify_fd.get())); + EXPECT_THAT(events, Are({ + Event(IN_MODIFY, wd), + Event(IN_IGNORED, wd), + })); + + // The watch should already have been removed. + EXPECT_THAT(inotify_rm_watch(inotify_fd.get(), wd), + SyscallFailsWithErrno(EINVAL)); +} + // This test helps verify that the lock order of filesystem and inotify locks // is respected when inotify instances and watch targets are concurrently being // destroyed. -- cgit v1.2.3 From 02d552d07c4415978d2ce418fb16baf238d0ff78 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Sat, 27 Jun 2020 14:38:20 -0700 Subject: Support sticky bit in vfs2. Updates #2923. PiperOrigin-RevId: 318648128 --- pkg/sentry/fsimpl/gofer/filesystem.go | 65 ++++++++++++++++++++++++----- pkg/sentry/fsimpl/gofer/gofer.go | 4 ++ pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 4 ++ pkg/sentry/fsimpl/tmpfs/directory.go | 4 ++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 9 ++++ pkg/sentry/vfs/permissions.go | 14 +++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/sticky.cc | 57 +++++++++++++++++-------- 8 files changed, 131 insertions(+), 27 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 73bac738d..51082359d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -16,6 +16,7 @@ package gofer import ( "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -464,21 +465,61 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b defer mntns.DecRef() parent.dirMu.Lock() defer parent.dirMu.Unlock() + child, ok := parent.children[name] if ok && child == nil { return syserror.ENOENT } - // We only need a dentry representing the file at name if it can be a mount - // point. If child is nil, then it can't be a mount point. If child is - // non-nil but stale, the actual file can't be a mount point either; we - // detect this case by just speculatively calling PrepareDeleteDentry and - // only revalidating the dentry if that fails (indicating that the existing - // dentry is a mount point). + + sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0 + if sticky { + if !ok { + // If the sticky bit is set, we need to retrieve the child to determine + // whether removing it is allowed. + child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) + if err != nil { + return err + } + } else if child != nil && !child.cachedMetadataAuthoritative() { + // Make sure the dentry representing the file at name is up to date + // before examining its metadata. + child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) + if err != nil { + return err + } + } + if err := parent.mayDelete(rp.Credentials(), child); err != nil { + return err + } + } + + // If a child dentry exists, prepare to delete it. This should fail if it is + // a mount point. We detect mount points by speculatively calling + // PrepareDeleteDentry, which fails if child is a mount point. However, we + // may need to revalidate the file in this case to make sure that it has not + // been deleted or replaced on the remote fs, in which case the mount point + // will have disappeared. If calling PrepareDeleteDentry fails again on the + // up-to-date dentry, we can be sure that it is a mount point. + // + // Also note that if child is nil, then it can't be a mount point. if child != nil { + // Hold child.dirMu so we can check child.children and + // child.syntheticChildren. We don't access these fields until a bit later, + // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would + // create an inconsistent lock ordering between dentry.dirMu and + // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a + // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between + // PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock + // child.dirMu before calling PrepareDeleteDentry. child.dirMu.Lock() defer child.dirMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { - if parent.cachedMetadataAuthoritative() { + // We can skip revalidation in several cases: + // - We are not in InteropModeShared + // - The parent directory is synthetic, in which case the child must also + // be synthetic + // - We already updated the child during the sticky bit check above + if parent.cachedMetadataAuthoritative() || sticky { return err } child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) @@ -1100,7 +1141,8 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return err } } - if err := oldParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + creds := rp.Credentials() + if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() @@ -1115,12 +1157,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if renamed == nil { return syserror.ENOENT } + if err := oldParent.mayDelete(creds, renamed); err != nil { + return err + } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return syserror.EINVAL } if oldParent != newParent { - if err := renamed.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { + if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { return err } } @@ -1131,7 +1176,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } if oldParent != newParent { - if err := newParent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { + if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index b0b6d8c64..71c8d3ae1 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1003,6 +1003,10 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid))) +} + func dentryUIDFromP9UID(uid p9.UID) uint32 { if !uid.Ok() { return uint32(auth.OverflowUID) diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 53aec4918..4cb885d87 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -471,6 +471,8 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De if err := o.checkExistingLocked(name, child); err != nil { return err } + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. o.removeLocked(name) return nil } @@ -518,6 +520,8 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c if err := o.checkExistingLocked(oldname, child); err != nil { return nil, err } + + // TODO(gvisor.dev/issue/3027): Check sticky bit before removing. replaced := dst.replaceChildLocked(newname, child) return replaced, nil } diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index b172abc15..0a1ad4765 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -81,6 +81,10 @@ func (dir *directory) removeChildLocked(child *dentry) { dir.iterMu.Unlock() } +func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { + return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid))) +} + type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index f766b7ce2..71ac7b8e6 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -492,6 +492,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if !ok { return syserror.ENOENT } + if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { + return err + } // Note that we don't need to call rp.CheckMount(), since if renamed is a // mount point then we want to rename the mount point, not anything in the // mounted filesystem. @@ -606,6 +609,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if !ok { return syserror.ENOENT } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } childDir, ok := child.inode.impl.(*directory) if !ok { return syserror.ENOTDIR @@ -716,6 +722,9 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error if !ok { return syserror.ENOENT } + if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { + return err + } if child.inode.isDir() { return syserror.EISDIR } diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index afe2be8d7..9cb050597 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -230,6 +230,20 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return nil } +// CheckDeleteSticky checks whether the sticky bit is set on a directory with +// the given file mode, and if so, checks whether creds has permission to +// remove a file owned by childKUID from a directory with the given mode. +// CheckDeleteSticky is consistent with fs/linux.h:check_sticky(). +func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, childKUID auth.KUID) error { + if parentMode&linux.ModeSticky == 0 { + return nil + } + if CanActAsOwner(creds, childKUID) { + return nil + } + return syserror.EPERM +} + // CanActAsOwner returns true if creds can act as the owner of a file with the // given owning UID, consistent with Linux's // fs/inode.c:inode_owner_or_capable(). diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index c4fff0ac8..36c178e4a 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -942,6 +942,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:sticky_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/sticky.cc b/test/syscalls/linux/sticky.cc index 92eec0449..39f4fb801 100644 --- a/test/syscalls/linux/sticky.cc +++ b/test/syscalls/linux/sticky.cc @@ -40,11 +40,14 @@ namespace { TEST(StickyTest, StickyBitPermDenied) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); - auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); - const FileDescriptor dirfd = - ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY)); - ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(parent.path(), "some content", 0755)); + const TempPath dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirWith(parent.path(), 0755)); + const TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent.path(), file.path())); // Drop privileges and change IDs only in child thread, or else this parent // thread won't be able to open some log files after the test ends. @@ -62,18 +65,26 @@ TEST(StickyTest, StickyBitPermDenied) { syscall(SYS_setresuid, -1, absl::GetFlag(FLAGS_scratch_uid), -1), SyscallSucceeds()); - EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR), + std::string new_path = NewTempAbsPath(); + EXPECT_THAT(rename(file.path().c_str(), new_path.c_str()), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(unlink(file.path().c_str()), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(EPERM)); + EXPECT_THAT(unlink(link.path().c_str()), SyscallFailsWithErrno(EPERM)); }); } TEST(StickyTest, StickyBitSameUID) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); - auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); - std::string path = JoinPath(dir.path(), "NewDir"); - ASSERT_THAT(mkdir(path.c_str(), 0755), SyscallSucceeds()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(parent.path(), "some content", 0755)); + const TempPath dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirWith(parent.path(), 0755)); + const TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent.path(), file.path())); // Drop privileges and change IDs only in child thread, or else this parent // thread won't be able to open some log files after the test ends. @@ -89,18 +100,26 @@ TEST(StickyTest, StickyBitSameUID) { SyscallSucceeds()); // We still have the same EUID. - EXPECT_THAT(rmdir(path.c_str()), SyscallSucceeds()); + std::string new_path = NewTempAbsPath(); + EXPECT_THAT(rename(file.path().c_str(), new_path.c_str()), + SyscallSucceeds()); + EXPECT_THAT(unlink(new_path.c_str()), SyscallSucceeds()); + EXPECT_THAT(rmdir(dir.path().c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(link.path().c_str()), SyscallSucceeds()); }); } TEST(StickyTest, StickyBitCapFOWNER) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SETUID))); - auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); - EXPECT_THAT(chmod(dir.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); - const FileDescriptor dirfd = - ASSERT_NO_ERRNO_AND_VALUE(Open(dir.path(), O_DIRECTORY)); - ASSERT_THAT(mkdirat(dirfd.get(), "NewDir", 0755), SyscallSucceeds()); + const TempPath parent = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(chmod(parent.path().c_str(), 0777 | S_ISVTX), SyscallSucceeds()); + const TempPath file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(parent.path(), "some content", 0755)); + const TempPath dir = + ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDirWith(parent.path(), 0755)); + const TempPath link = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateSymlinkTo(parent.path(), file.path())); // Drop privileges and change IDs only in child thread, or else this parent // thread won't be able to open some log files after the test ends. @@ -117,8 +136,12 @@ TEST(StickyTest, StickyBitCapFOWNER) { SyscallSucceeds()); EXPECT_NO_ERRNO(SetCapability(CAP_FOWNER, true)); - EXPECT_THAT(unlinkat(dirfd.get(), "NewDir", AT_REMOVEDIR), + std::string new_path = NewTempAbsPath(); + EXPECT_THAT(rename(file.path().c_str(), new_path.c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(new_path.c_str()), SyscallSucceeds()); + EXPECT_THAT(rmdir(dir.path().c_str()), SyscallSucceeds()); + EXPECT_THAT(unlink(link.path().c_str()), SyscallSucceeds()); }); } } // namespace -- cgit v1.2.3 From c4bdd0118f5dc9090979697e31a18731968b4066 Mon Sep 17 00:00:00 2001 From: Bhasker Hariharan Date: Tue, 30 Jun 2020 19:00:13 -0700 Subject: Add missing newline in /sys/devices/systen/cpu/onine PiperOrigin-RevId: 319143410 --- pkg/sentry/fsimpl/sys/sys.go | 2 +- pkg/sentry/fsimpl/sys/sys_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index fe02f7ee9..01ce30a4d 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -138,7 +138,7 @@ type cpuFile struct { // Generate implements vfs.DynamicBytesSource.Generate. func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "0-%d", c.maxCores-1) + fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) return nil } diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 4b3602d47..242d5fd12 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -51,7 +51,7 @@ func TestReadCPUFile(t *testing.T) { k := kernel.KernelFromContext(s.Ctx) maxCPUCores := k.ApplicationCores() - expected := fmt.Sprintf("0-%d", maxCPUCores-1) + expected := fmt.Sprintf("0-%d\n", maxCPUCores-1) for _, fname := range []string{"online", "possible", "present"} { pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname)) -- cgit v1.2.3 From 20d571b0c181023cc02521ad746a2b6d91e6794d Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 30 Jun 2020 20:47:02 -0700 Subject: Allow O_DIRECT on vfs2 tmpfs files. Updates #2923. PiperOrigin-RevId: 319153792 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 4 ++-- pkg/sentry/vfs/file_description.go | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 71ac7b8e6..ce0dbb154 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -407,7 +407,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open case *regularFile: var fd regularFileFD fd.LockFD.Init(&d.inode.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } if opts.Flags&linux.O_TRUNC != 0 { @@ -423,7 +423,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open } var fd directoryFD fd.LockFD.Init(&d.inode.locks) - if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { + if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } return &fd.vfsfd, nil diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index cd1db14ac..eb5dfd7e2 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,8 +91,7 @@ type FileDescription struct { // FileDescriptionOptions contains options to FileDescription.Init(). type FileDescriptionOptions struct { - // If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is - // usually only the case if O_DIRECT would actually have an effect. + // If AllowDirectIO is true, allow O_DIRECT to be set on the file. AllowDirectIO bool // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE. -- cgit v1.2.3 From 43f5dd95a1c58a6e3260c31093bfc3f97885f4b0 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 30 Jun 2020 20:59:32 -0700 Subject: Fix index calculation for /proc/[pid]/cmdline. We were truncating buf using a index relative to the middle of the slice (i.e. where envv begins), but we need to calculate the index relative to the entire slice. Updates #2923. PiperOrigin-RevId: 319154950 --- pkg/sentry/fsimpl/proc/task_files.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 286c23f01..9af43b859 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -231,8 +231,9 @@ func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Linux will return envp up to and including the first NULL character, // so find it. - if end := bytes.IndexByte(buf.Bytes()[ar.Length():], 0); end != -1 { - buf.Truncate(end) + envStart := int(ar.Length()) + if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 { + buf.Truncate(envStart + nullIdx) } } @@ -300,7 +301,7 @@ type idMapData struct { var _ dynamicInode = (*idMapData)(nil) -// Generate implements vfs.DynamicBytesSource.Generate. +// Generate implements vfs.WritableDynamicBytesSource.Generate. func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { var entries []auth.IDMapEntry if d.gids { @@ -314,6 +315,7 @@ func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error { return nil } +// Write implements vfs.WritableDynamicBytesSource.Write. func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { // "In addition, the number of bytes written to the file must be less than // the system page size, and the write must be performed at the start of -- cgit v1.2.3 From 6a90c88b97481a6d81b05f09d5c8ed7158225dd5 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Wed, 1 Jul 2020 13:13:04 -0700 Subject: Port fallocate to VFS2. PiperOrigin-RevId: 319283715 --- pkg/p9/p9.go | 13 ++++++++ pkg/sentry/fsimpl/gofer/regular_file.go | 29 ++++++++++++++++ pkg/sentry/fsimpl/host/host.go | 10 ++++++ pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 5 +++ pkg/sentry/fsimpl/tmpfs/regular_file.go | 15 +++++++++ pkg/sentry/kernel/pipe/vfs.go | 5 +++ pkg/sentry/socket/hostinet/socket_vfs2.go | 7 +++- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 50 ++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/vfs2.go | 2 +- pkg/sentry/vfs/file_description.go | 4 +++ pkg/sentry/vfs/file_description_impl_util.go | 11 ++++++ pkg/sentry/vfs/inotify.go | 5 +++ test/syscalls/BUILD | 1 + test/syscalls/linux/BUILD | 5 +++ test/syscalls/linux/fallocate.cc | 45 +++++++++++++++++++++++++ 15 files changed, 205 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/p9/p9.go b/pkg/p9/p9.go index 28d851ff5..122c457d2 100644 --- a/pkg/p9/p9.go +++ b/pkg/p9/p9.go @@ -1091,6 +1091,19 @@ type AllocateMode struct { Unshare bool } +// ToAllocateMode returns an AllocateMode from a fallocate(2) mode. +func ToAllocateMode(mode uint64) AllocateMode { + return AllocateMode{ + KeepSize: mode&unix.FALLOC_FL_KEEP_SIZE != 0, + PunchHole: mode&unix.FALLOC_FL_PUNCH_HOLE != 0, + NoHideStale: mode&unix.FALLOC_FL_NO_HIDE_STALE != 0, + CollapseRange: mode&unix.FALLOC_FL_COLLAPSE_RANGE != 0, + ZeroRange: mode&unix.FALLOC_FL_ZERO_RANGE != 0, + InsertRange: mode&unix.FALLOC_FL_INSERT_RANGE != 0, + Unshare: mode&unix.FALLOC_FL_UNSHARE_RANGE != 0, + } +} + // ToLinux converts to a value compatible with fallocate(2)'s mode. func (a *AllocateMode) ToLinux() uint32 { rv := uint32(0) diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 404a452c4..faba73af2 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" @@ -67,6 +68,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { return d.handle.file.flush(ctx) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + + d := fd.dentry() + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + size := offset + length + + // Allocating a smaller size is a noop. + if size <= d.size { + return nil + } + + d.handleMu.Lock() + defer d.handleMu.Unlock() + + err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + if err != nil { + return err + } + d.size = size + if !d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + return nil +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index d5e73ddae..007b3332e 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -543,6 +543,16 @@ func (f *fileDescription) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl. +func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !f.inode.seekable { + return syserror.ESPIPE + } + + // TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds. + return syserror.EOPNOTSUPP +} + // PRead implements FileDescriptionImpl. func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { i := f.inode diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 5f7853a2a..ca8b8c63b 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -236,6 +236,11 @@ func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptio return inode.SetStat(ctx, fd.filesystem(), creds, opts) } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) +} + // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index b805aadd0..6691add96 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -274,6 +274,21 @@ func (fd *regularFileFD) Release() { // noop } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + f := fd.inode().impl.(*regularFile) + + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + oldSize := f.size + size := offset + length + if oldSize >= size { + return nil + } + _, err := f.truncateLocked(size) + return err +} + // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { if offset < 0 { diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index a4519363f..45d4c5fc1 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -200,6 +200,11 @@ func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask { } } +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ESPIPE +} + // EventRegister implements waiter.Waitable.EventRegister. func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.pipe.EventRegister(e, mask) diff --git a/pkg/sentry/socket/hostinet/socket_vfs2.go b/pkg/sentry/socket/hostinet/socket_vfs2.go index ad5f64799..8f192c62f 100644 --- a/pkg/sentry/socket/hostinet/socket_vfs2.go +++ b/pkg/sentry/socket/hostinet/socket_vfs2.go @@ -96,7 +96,12 @@ func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal return ioctl(ctx, s.fd, uio, args) } -// PRead implements vfs.FileDescriptionImpl. +// Allocate implements vfs.FileDescriptionImpl.Allocate. +func (s *socketVFS2) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + +// PRead implements vfs.FileDescriptionImpl.PRead. func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, syserror.ESPIPE } diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index 5dac77e4d..b12b5967b 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -239,6 +240,55 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd }) } +// Fallocate implements linux system call fallocate(2). +func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + mode := args[1].Uint64() + offset := args[2].Int64() + length := args[3].Int64() + + file := t.GetFileVFS2(fd) + + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + if !file.IsWritable() { + return 0, nil, syserror.EBADF + } + + if mode != 0 { + return 0, nil, syserror.ENOTSUP + } + + if offset < 0 || length <= 0 { + return 0, nil, syserror.EINVAL + } + + size := offset + length + + if size < 0 { + return 0, nil, syserror.EFBIG + } + + limit := limits.FromContext(t).Get(limits.FileSize).Cur + + if uint64(size) >= limit { + t.SendSignal(&arch.SignalInfo{ + Signo: int32(linux.SIGXFSZ), + Code: arch.SignalInfoUser, + }) + return 0, nil, syserror.EFBIG + } + + return 0, nil, file.Impl().Allocate(t, mode, uint64(offset), uint64(length)) + + // File length modified, generate notification. + // TODO(gvisor.dev/issue/1479): Reenable when Inotify is ported. + // file.Dirent.InotifyEvent(linux.IN_MODIFY, 0) +} + // Rmdir implements Linux syscall rmdir(2). func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go index 9e60c4a1c..8f497ecc7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go +++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go @@ -138,7 +138,7 @@ func Override() { s.Table[282] = syscalls.Supported("signalfd", Signalfd) s.Table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) s.Table[284] = syscalls.Supported("eventfd", Eventfd) - delete(s.Table, 285) // fallocate + s.Table[285] = syscalls.PartiallySupported("fallocate", Fallocate, "Not all options are supported.", nil) s.Table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) s.Table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) s.Table[288] = syscalls.Supported("accept4", Accept4) diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index eb5dfd7e2..0c42574db 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -354,6 +354,10 @@ type FileDescriptionImpl interface { // represented by the FileDescription. StatFS(ctx context.Context) (linux.Statfs, error) + // Allocate grows file represented by FileDescription to offset + length bytes. + // Only mode == 0 is supported currently. + Allocate(ctx context.Context, mode, offset, length uint64) error + // waiter.Waitable methods may be used to poll for I/O events. waiter.Waitable diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index 3fec0d6d6..6b8b4ad49 100644 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -56,6 +56,12 @@ func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, err return linux.Statfs{}, syserror.ENOSYS } +// Allocate implements FileDescriptionImpl.Allocate analogously to +// fallocate called on regular file, directory or FIFO in Linux. +func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.ENODEV +} + // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { @@ -158,6 +164,11 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) // implementations of non-directory I/O methods that return EISDIR. type DirectoryFileDescriptionDefaultImpl struct{} +// Allocate implements DirectoryFileDescriptionDefaultImpl.Allocate. +func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { + return syserror.EISDIR +} + // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 509034531..c2e21ac5f 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -148,6 +148,11 @@ func (i *Inotify) Release() { } } +// Allocate implements FileDescription.Allocate. +func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { + panic("Allocate should not be called on read-only inotify fds") +} + // EventRegister implements waiter.Waitable. func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { i.queue.EventRegister(e, mask) diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 88ed36b69..67645cc83 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -199,6 +199,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:fallocate_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 8c7d54b21..9e097c888 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -748,9 +748,14 @@ cc_binary( linkstatic = 1, deps = [ ":file_base", + ":socket_test_util", "//test/util:cleanup", + "//test/util:eventfd_util", "//test/util:file_descriptor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/time", gtest, + "//test/util:posix_error", "//test/util:temp_path", "//test/util:test_main", "//test/util:test_util", diff --git a/test/syscalls/linux/fallocate.cc b/test/syscalls/linux/fallocate.cc index 7819f4ac3..cabc2b751 100644 --- a/test/syscalls/linux/fallocate.cc +++ b/test/syscalls/linux/fallocate.cc @@ -15,16 +15,27 @@ #include #include #include +#include #include +#include +#include #include +#include #include #include #include +#include + #include "gtest/gtest.h" +#include "absl/strings/str_cat.h" +#include "absl/time/time.h" #include "test/syscalls/linux/file_base.h" +#include "test/syscalls/linux/socket_test_util.h" #include "test/util/cleanup.h" +#include "test/util/eventfd_util.h" #include "test/util/file_descriptor.h" +#include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" @@ -70,6 +81,12 @@ TEST_F(AllocateTest, Fallocate) { ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 39, 1), SyscallSucceeds()); ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); EXPECT_EQ(buf.st_size, 40); + + // Given length 0 should fail with EINVAL. + ASSERT_THAT(fallocate(test_file_fd_.get(), 0, 50, 0), + SyscallFailsWithErrno(EINVAL)); + ASSERT_THAT(fstat(test_file_fd_.get(), &buf), SyscallSucceeds()); + EXPECT_EQ(buf.st_size, 40); } TEST_F(AllocateTest, FallocateInvalid) { @@ -136,6 +153,34 @@ TEST_F(AllocateTest, FallocateRlimit) { ASSERT_THAT(sigprocmask(SIG_UNBLOCK, &new_mask, nullptr), SyscallSucceeds()); } +TEST_F(AllocateTest, FallocateOtherFDs) { + int fd; + ASSERT_THAT(fd = timerfd_create(CLOCK_MONOTONIC, 0), SyscallSucceeds()); + auto timer_fd = FileDescriptor(fd); + EXPECT_THAT(fallocate(timer_fd.get(), 0, 0, 10), + SyscallFailsWithErrno(ENODEV)); + + sigset_t mask; + sigemptyset(&mask); + ASSERT_THAT(fd = signalfd(-1, &mask, 0), SyscallSucceeds()); + auto sfd = FileDescriptor(fd); + EXPECT_THAT(fallocate(sfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + auto efd = + ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE)); + EXPECT_THAT(fallocate(efd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + auto sockfd = ASSERT_NO_ERRNO_AND_VALUE(Socket(AF_INET, SOCK_DGRAM, 0)); + EXPECT_THAT(fallocate(sockfd.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); + + int socks[2]; + ASSERT_THAT(socketpair(AF_UNIX, SOCK_STREAM, PF_UNIX, socks), + SyscallSucceeds()); + auto sock0 = FileDescriptor(socks[0]); + auto sock1 = FileDescriptor(socks[1]); + EXPECT_THAT(fallocate(sock0.get(), 0, 0, 10), SyscallFailsWithErrno(ENODEV)); +} + } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From 65d99855583a21b6ea511ea74aa52318d0a1e5b2 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 1 Jul 2020 17:09:26 -0700 Subject: Port vfs1 implementation of sync_file_range to vfs2. Currently, we always perform a full-file sync which could be extremely expensive for some applications. Although vfs1 did not fully support sync_file_range, there were some optimizations that allowed us skip some unnecessary write-outs. Updates #2923, #1897. PiperOrigin-RevId: 319324213 --- pkg/sentry/fsimpl/gofer/special_file.go | 5 +--- pkg/sentry/syscalls/linux/vfs2/sync.go | 42 +++++++++++++++++++++++++++------ test/syscalls/BUILD | 1 + 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a016cbae1..2b381af05 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -235,8 +235,5 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - if !fd.vfsfd.IsWritable() { - return nil - } - return fd.handle.sync(ctx) + return fd.dentry().syncSharedHandle(ctx) } diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 365250b0b..0d0ebf46a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -65,10 +65,8 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel nbytes := args[2].Int64() flags := args[3].Uint() - if offset < 0 { - return 0, nil, syserror.EINVAL - } - if nbytes < 0 { + // Check for negative values and overflow. + if offset < 0 || offset+nbytes < 0 { return 0, nil, syserror.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { @@ -81,7 +79,37 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel } defer file.DecRef() - // TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of - // [offset, offset+nbytes). - return 0, nil, file.Sync(t) + // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support + // is a full-file sync, i.e. fsync(2). As a result, there are severe + // limitations on how much we support sync_file_range: + // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even + // if the file size is changed. We do. + // - We always sync the entire file instead of [offset, offset+nbytes). + // - We do not support the use of WAIT_BEFORE without WAIT_AFTER. For + // correctness, we would have to perform a write-out every time WAIT_BEFORE + // was used, but this would be much more expensive than expected if there + // were no write-out operations in progress. + // - Whenever WAIT_AFTER is used, we sync the file. + // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will + // be synced anyway. If this flag is used without WAIT_AFTER, then it is + // safe (and less expensive) to do nothing, because the syscall will not + // wait for the write-out to complete--we only need to make sure that the + // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. + // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O + // errors or ENOSPC conditions and will return those to the caller, after + // clearing the EIO and ENOSPC flags in the address_space." We don't do + // this. + + if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && + flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { + t.Kernel().EmitUnimplementedEvent(t) + return 0, nil, syserror.ENOSYS + } + + if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { + if err := file.Sync(t); err != nil { + return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) + } + } + return 0, nil, nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 67645cc83..790190273 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -962,6 +962,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:sync_file_range_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From 3b26d2121e3afbbc005b5e919000d55a235b4bbe Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 1 Jul 2020 17:39:07 -0700 Subject: Remove maxSendBufferSize from vfs2. Complements cl/315991648. PiperOrigin-RevId: 319327853 --- pkg/sentry/fsimpl/host/socket.go | 9 --------- 1 file changed, 9 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index 38f1fbfba..fd16bd92d 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -47,11 +47,6 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor return ep, nil } -// maxSendBufferSize is the maximum host send buffer size allowed for endpoint. -// -// N.B. 8MB is the default maximum on Linux (2 * sysctl_wmem_max). -const maxSendBufferSize = 8 << 20 - // ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and // transport.Receiver. It is backed by a host fd that was imported at sentry // startup. This fd is shared with a hostfs inode, which retains ownership of @@ -114,10 +109,6 @@ func (c *ConnectedEndpoint) init() *syserr.Error { if err != nil { return syserr.FromError(err) } - if sndbuf > maxSendBufferSize { - log.Warningf("Socket send buffer too large: %d", sndbuf) - return syserr.ErrInvalidEndpointState - } c.stype = linux.SockType(stype) c.sndbuf = int64(sndbuf) -- cgit v1.2.3 From 52b44719d6e14ec299d0d953b4dc07a712b897fa Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 1 Jul 2020 19:47:51 -0700 Subject: [vfs2][gofer] Update file size to 0 on O_TRUNC Some Open:TruncateXxx syscall tests were failing because the file size was not being updated when the file was opened with O_TRUNC. Fixes Truncate tests in test/syscalls:open_test_runsc_ptrace_vfs2. Updates #2923 PiperOrigin-RevId: 319340127 --- pkg/sentry/fsimpl/gofer/filesystem.go | 35 ++++++++++++++++-- pkg/sentry/fsimpl/gofer/gofer.go | 69 ++++++++++++++++++----------------- pkg/sentry/fsimpl/gofer/time.go | 12 +++--- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- 4 files changed, 76 insertions(+), 42 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 51082359d..7bcc99b29 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -869,11 +869,22 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if err := d.checkPermissions(rp.Credentials(), ats); err != nil { return nil, err } + + trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG + if trunc { + // Lock metadataMu *while* we open a regular file with O_TRUNC because + // open(2) will change the file size on server. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + } + + var vfd *vfs.FileDescription + var err error mnt := rp.Mount() switch d.fileType() { case linux.S_IFREG: if !d.fs.opts.regularFilesUseSpecialFileFD { - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, opts.Flags&linux.O_TRUNC != 0); err != nil { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil { return nil, err } fd := ®ularFileFD{} @@ -883,7 +894,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf }); err != nil { return nil, err } - return &fd.vfsfd, nil + vfd = &fd.vfsfd } case linux.S_IFDIR: // Can't open directories with O_CREAT. @@ -923,7 +934,25 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) } } - return d.openSpecialFileLocked(ctx, mnt, opts) + + if vfd == nil { + if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil { + return nil, err + } + } + + if trunc { + // If no errors occured so far then update file size in memory. This + // step is required even if !d.cachedMetadataAuthoritative() because + // d.mappings has to be updated. + // d.metadataMu has already been acquired if trunc == true. + d.updateFileSizeLocked(0) + + if d.cachedMetadataAuthoritative() { + d.touchCMtimeLocked() + } + } + return vfd, err } func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 71c8d3ae1..709b5e496 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -794,9 +794,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { atomic.StoreUint32(&d.nlink, uint32(attr.NLink)) } if mask.Size { - d.dataMu.Lock() - atomic.StoreUint64(&d.size, attr.Size) - d.dataMu.Unlock() + d.updateFileSizeLocked(attr.Size) } d.metadataMu.Unlock() } @@ -964,39 +962,44 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } atomic.StoreInt64(&d.ctime, now) if stat.Mask&linux.STATX_SIZE != 0 { + d.updateFileSizeLocked(stat.Size) + } + return nil +} + +// Preconditions: d.metadataMu must be locked. +func (d *dentry) updateFileSizeLocked(newSize uint64) { + d.dataMu.Lock() + oldSize := d.size + d.size = newSize + // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings + // below. This allows concurrent calls to Read/Translate/etc. These + // functions synchronize with truncation by refusing to use cache + // contents beyond the new d.size. (We are still holding d.metadataMu, + // so we can't race with Write or another truncate.) + d.dataMu.Unlock() + if d.size < oldSize { + oldpgend, _ := usermem.PageRoundUp(oldSize) + newpgend, _ := usermem.PageRoundUp(d.size) + if oldpgend != newpgend { + d.mapsMu.Lock() + d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + d.mapsMu.Unlock() + } + // We are now guaranteed that there are no translations of + // truncated pages, and can remove them from the cache. Since + // truncated pages have been removed from the remote file, they + // should be dropped without being written back. d.dataMu.Lock() - oldSize := d.size - d.size = stat.Size - // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings - // below. This allows concurrent calls to Read/Translate/etc. These - // functions synchronize with truncation by refusing to use cache - // contents beyond the new d.size. (We are still holding d.metadataMu, - // so we can't race with Write or another truncate.) + d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) + d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) d.dataMu.Unlock() - if d.size < oldSize { - oldpgend, _ := usermem.PageRoundUp(oldSize) - newpgend, _ := usermem.PageRoundUp(d.size) - if oldpgend != newpgend { - d.mapsMu.Lock() - d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ - // Compare Linux's mm/truncate.c:truncate_setsize() => - // truncate_pagecache() => - // mm/memory.c:unmap_mapping_range(evencows=1). - InvalidatePrivate: true, - }) - d.mapsMu.Unlock() - } - // We are now guaranteed that there are no translations of - // truncated pages, and can remove them from the cache. Since - // truncated pages have been removed from the remote file, they - // should be dropped without being written back. - d.dataMu.Lock() - d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) - d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) - d.dataMu.Unlock() - } } - return nil } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 1d5aa82dc..0eef4e16e 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -36,7 +36,7 @@ func statxTimestampFromDentry(ns int64) linux.StatxTimestamp { } } -// Preconditions: fs.interop != InteropModeShared. +// Preconditions: d.cachedMetadataAuthoritative() == true. func (d *dentry) touchAtime(mnt *vfs.Mount) { if mnt.Flags.NoATime { return @@ -51,8 +51,8 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { mnt.EndWrite() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -60,8 +60,8 @@ func (d *dentry) touchCtime() { d.metadataMu.Unlock() } -// Preconditions: fs.interop != InteropModeShared. The caller has successfully -// called vfs.Mount.CheckBeginWrite(). +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// successfully called vfs.Mount.CheckBeginWrite(). func (d *dentry) touchCMtime() { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() @@ -70,6 +70,8 @@ func (d *dentry) touchCMtime() { d.metadataMu.Unlock() } +// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has +// locked d.metadataMu. func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ce0dbb154..ed40f6b52 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -410,7 +410,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { return nil, err } - if opts.Flags&linux.O_TRUNC != 0 { + if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { if _, err := impl.truncate(0); err != nil { return nil, err } -- cgit v1.2.3 From 514955c1a8f3927c928a57935d511ffbbf9c6f01 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 1 Jul 2020 21:04:43 -0700 Subject: [vfs2][gofer] Fix mmap syscall test. We were not invalidating mappings when the file size changed in shared mode. Enabled the syscall test for vfs2. Updates #2923 PiperOrigin-RevId: 319346569 --- pkg/sentry/fsimpl/gofer/gofer.go | 9 ++++++--- test/syscalls/BUILD | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 709b5e496..8e74e60a5 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -900,6 +900,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } d.metadataMu.Lock() defer d.metadataMu.Unlock() + if stat.Mask&linux.STATX_SIZE != 0 { + // The size needs to be changed even when + // !d.cachedMetadataAuthoritative() because d.mappings has to be + // updated. + d.updateFileSizeLocked(stat.Size) + } if !d.isSynthetic() { if stat.Mask != 0 { if err := d.file.setAttr(ctx, p9.SetAttrMask{ @@ -961,9 +967,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin stat.Mask |= linux.STATX_MTIME } atomic.StoreInt64(&d.ctime, now) - if stat.Mask&linux.STATX_SIZE != 0 { - d.updateFileSizeLocked(stat.Size) - } return nil } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 790190273..6fe397af9 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -357,6 +357,7 @@ syscall_test( size = "medium", shard_count = 5, test = "//test/syscalls/linux:mmap_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From 6c099d830091b3153e0dc39cf500dba441f45707 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 1 Jul 2020 22:03:14 -0700 Subject: Update preadv2/pwritev2 flag handling in vfs2. We do not support RWF_SYNC/RWF_DSYNC and probably shouldn't silently accept them, since the user may incorrectly believe that we are synchronizing I/O. Remove the pwritev2 test verifying that we support these flags. gvisor.dev/issue/2601 is the tracking bug for deciding which RWF_.* flags we need and supporting them. Updates #2923, #2601. PiperOrigin-RevId: 319351286 --- pkg/abi/linux/file.go | 5 +-- pkg/sentry/fsimpl/gofer/regular_file.go | 8 +++-- pkg/sentry/fsimpl/gofer/special_file.go | 8 +++-- pkg/sentry/fsimpl/host/host.go | 4 ++- pkg/sentry/fsimpl/tmpfs/regular_file.go | 14 +++++--- test/syscalls/BUILD | 1 + test/syscalls/linux/pwritev2.cc | 59 ++++++--------------------------- 7 files changed, 39 insertions(+), 60 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 055ac1d7c..e11ca2d62 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -191,8 +191,9 @@ var DirentType = abi.ValueSet{ // Values for preadv2/pwritev2. const ( - // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is - // accepted as a valid flag argument for preadv2/pwritev2. + // NOTE(b/120162627): gVisor does not implement the RWF_HIPRI feature, but + // the flag is accepted as a valid flag argument for preadv2/pwritev2 and + // silently ignored. RWF_HIPRI = 0x00000001 RWF_DSYNC = 0x00000002 RWF_SYNC = 0x00000004 diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index faba73af2..3d2d3530a 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -102,7 +102,9 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -155,7 +157,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 2b381af05..3c4e7e2e4 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -130,7 +130,9 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } @@ -176,7 +178,9 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. + // Check that flags are supported. + // + // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 007b3332e..1cd2982cb 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -588,8 +588,10 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts } func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) { + // Check that flags are supported. + // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. - if flags != 0 { + if flags&^linux.RWF_HIPRI != 0 { return 0, syserror.EOPNOTSUPP } reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags) diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 6691add96..1cdb46e6f 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -295,8 +295,11 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. - if opts.Flags&^linux.RWF_HIPRI != 0 { + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, syserror.EOPNOTSUPP } @@ -326,8 +329,11 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off return 0, syserror.EINVAL } - // Check that flags are supported. Silently ignore RWF_HIPRI. - if opts.Flags&^linux.RWF_HIPRI != 0 { + // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since + // all state is in-memory. + // + // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. + if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, syserror.EOPNOTSUPP } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 6fe397af9..7c4cd8192 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -531,6 +531,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:pwritev2_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/pwritev2.cc b/test/syscalls/linux/pwritev2.cc index 3fe5a600f..63b686c62 100644 --- a/test/syscalls/linux/pwritev2.cc +++ b/test/syscalls/linux/pwritev2.cc @@ -69,7 +69,7 @@ ssize_t pwritev2(unsigned long fd, const struct iovec* iov, } // This test is the base case where we call pwritev (no offset, no flags). -TEST(Writev2Test, TestBaseCall) { +TEST(Writev2Test, BaseCall) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( @@ -97,7 +97,7 @@ TEST(Writev2Test, TestBaseCall) { } // This test is where we call pwritev2 with a positive offset and no flags. -TEST(Pwritev2Test, TestValidPositiveOffset) { +TEST(Pwritev2Test, ValidPositiveOffset) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); std::string prefix(kBufSize, '0'); @@ -129,7 +129,7 @@ TEST(Pwritev2Test, TestValidPositiveOffset) { // This test is the base case where we call writev by using -1 as the offset. // The write should use the file offset, so the test increments the file offset // prior to call pwritev2. -TEST(Pwritev2Test, TestNegativeOneOffset) { +TEST(Pwritev2Test, NegativeOneOffset) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const std::string prefix = "00"; @@ -164,7 +164,7 @@ TEST(Pwritev2Test, TestNegativeOneOffset) { // pwritev2 requires if the RWF_HIPRI flag is passed, the fd must be opened with // O_DIRECT. This test implements a correct call with the RWF_HIPRI flag. -TEST(Pwritev2Test, TestCallWithRWF_HIPRI) { +TEST(Pwritev2Test, CallWithRWF_HIPRI) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( @@ -189,47 +189,8 @@ TEST(Pwritev2Test, TestCallWithRWF_HIPRI) { EXPECT_EQ(buf, content); } -// This test checks that pwritev2 can be called with valid flags -TEST(Pwritev2Test, TestCallWithValidFlags) { - SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); - - const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( - GetAbsoluteTestTmpdir(), "", TempPath::kDefaultFileMode)); - const FileDescriptor fd = - ASSERT_NO_ERRNO_AND_VALUE(Open(file.path(), O_RDWR)); - - std::vector content(kBufSize, '0'); - struct iovec iov; - iov.iov_base = content.data(); - iov.iov_len = content.size(); - - EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1, - /*offset=*/0, /*flags=*/RWF_DSYNC), - SyscallSucceedsWithValue(kBufSize)); - - std::vector buf(content.size()); - EXPECT_THAT(read(fd.get(), buf.data(), buf.size()), - SyscallSucceedsWithValue(buf.size())); - - EXPECT_EQ(buf, content); - - SetContent(content); - - EXPECT_THAT(pwritev2(fd.get(), &iov, /*iovcnt=*/1, - /*offset=*/0, /*flags=*/0x4), - SyscallSucceedsWithValue(kBufSize)); - - ASSERT_THAT(lseek(fd.get(), 0, SEEK_CUR), - SyscallSucceedsWithValue(content.size())); - - EXPECT_THAT(pread(fd.get(), buf.data(), buf.size(), /*offset=*/0), - SyscallSucceedsWithValue(buf.size())); - - EXPECT_EQ(buf, content); -} - // This test calls pwritev2 with a bad file descriptor. -TEST(Writev2Test, TestBadFile) { +TEST(Writev2Test, BadFile) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); ASSERT_THAT(pwritev2(/*fd=*/-1, /*iov=*/nullptr, /*iovcnt=*/0, /*offset=*/0, /*flags=*/0), @@ -237,7 +198,7 @@ TEST(Writev2Test, TestBadFile) { } // This test calls pwrite2 with an invalid offset. -TEST(Pwritev2Test, TestInvalidOffset) { +TEST(Pwritev2Test, InvalidOffset) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( @@ -255,7 +216,7 @@ TEST(Pwritev2Test, TestInvalidOffset) { SyscallFailsWithErrno(EINVAL)); } -TEST(Pwritev2Test, TestUnseekableFileValid) { +TEST(Pwritev2Test, UnseekableFileValid) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); int pipe_fds[2]; @@ -285,7 +246,7 @@ TEST(Pwritev2Test, TestUnseekableFileValid) { // Calling pwritev2 with a non-negative offset calls pwritev. Calling pwritev // with an unseekable file is not allowed. A pipe is used for an unseekable // file. -TEST(Pwritev2Test, TestUnseekableFileInValid) { +TEST(Pwritev2Test, UnseekableFileInvalid) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); int pipe_fds[2]; @@ -304,7 +265,7 @@ TEST(Pwritev2Test, TestUnseekableFileInValid) { EXPECT_THAT(close(pipe_fds[1]), SyscallSucceeds()); } -TEST(Pwritev2Test, TestReadOnlyFile) { +TEST(Pwritev2Test, ReadOnlyFile) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( @@ -323,7 +284,7 @@ TEST(Pwritev2Test, TestReadOnlyFile) { } // This test calls pwritev2 with an invalid flag. -TEST(Pwritev2Test, TestInvalidFlag) { +TEST(Pwritev2Test, InvalidFlag) { SKIP_IF(pwritev2(-1, nullptr, 0, 0, 0) < 0 && errno == ENOSYS); const TempPath file = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileWith( -- cgit v1.2.3 From bd43368f491a02b050cd9f87f69185dc74386f2b Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 6 Jul 2020 15:38:51 -0700 Subject: Add inode number to synthetic dentries Reserve the MSB from ino for synthetic dentries to prevent conflict with regular dentries. Log warning in case MSB is set for regular dentries. Updates #1487 PiperOrigin-RevId: 319869858 --- pkg/sentry/fsimpl/gofer/directory.go | 9 +++++---- pkg/sentry/fsimpl/gofer/filesystem.go | 9 ++++++--- pkg/sentry/fsimpl/gofer/gofer.go | 34 +++++++++++++++++++++++++++------- 3 files changed, 38 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 5d83fe363..8c7c8e1b3 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -85,6 +85,7 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { d2 := &dentry{ refs: 1, // held by d fs: d.fs, + ino: d.fs.nextSyntheticIno(), mode: uint32(opts.mode), uid: uint32(opts.kuid), gid: uint32(opts.kgid), @@ -184,13 +185,13 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { { Name: ".", Type: linux.DT_DIR, - Ino: d.ino, + Ino: uint64(d.ino), NextOff: 1, }, { Name: "..", Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), - Ino: parent.ino, + Ino: uint64(parent.ino), NextOff: 2, }, } @@ -226,7 +227,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { } dirent := vfs.Dirent{ Name: p9d.Name, - Ino: p9d.QID.Path, + Ino: uint64(inoFromPath(p9d.QID.Path)), NextOff: int64(len(dirents) + 1), } // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or @@ -259,7 +260,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { dirents = append(dirents, vfs.Dirent{ Name: child.name, Type: uint8(atomic.LoadUint32(&child.mode) >> 12), - Ino: child.ino, + Ino: uint64(child.ino), NextOff: int64(len(dirents) + 1), }) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 7bcc99b29..cd5f5049e 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -214,9 +214,8 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir return nil, err } if child != nil { - if !file.isNil() && qid.Path == child.ino { - // The file at this path hasn't changed. Just update cached - // metadata. + if !file.isNil() && inoFromPath(qid.Path) == child.ino { + // The file at this path hasn't changed. Just update cached metadata. file.close(ctx) child.updateFromP9Attrs(attrMask, &attr) return child, nil @@ -1499,3 +1498,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe defer fs.renameMu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } + +func (fs *filesystem) nextSyntheticIno() inodeNumber { + return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask) +} diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 8e74e60a5..2b83094cd 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -110,6 +110,26 @@ type filesystem struct { syncMu sync.Mutex syncableDentries map[*dentry]struct{} specialFileFDs map[*specialFileFD]struct{} + + // syntheticSeq stores a counter to used to generate unique inodeNumber for + // synthetic dentries. + syntheticSeq uint64 +} + +// inodeNumber represents inode number reported in Dirent.Ino. For regular +// dentries, it comes from QID.Path from the 9P server. Synthetic dentries +// have have their inodeNumber generated sequentially, with the MSB reserved to +// prevent conflicts with regular dentries. +type inodeNumber uint64 + +// Reserve MSB for synthetic mounts. +const syntheticInoMask = uint64(1) << 63 + +func inoFromPath(path uint64) inodeNumber { + if path&syntheticInoMask != 0 { + log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask) + } + return inodeNumber(path &^ syntheticInoMask) } type filesystemOptions struct { @@ -585,11 +605,11 @@ type dentry struct { // Cached metadata; protected by metadataMu and accessed using atomic // memory operations unless otherwise specified. metadataMu sync.Mutex - ino uint64 // immutable - mode uint32 // type is immutable, perms are mutable - uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic - gid uint32 // auth.KGID, but ... - blockSize uint32 // 0 if unknown + ino inodeNumber // immutable + mode uint32 // type is immutable, perms are mutable + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + blockSize uint32 // 0 if unknown // Timestamps, all nsecs from the Unix epoch. atime int64 mtime int64 @@ -704,7 +724,7 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma d := &dentry{ fs: fs, file: file, - ino: qid.Path, + ino: inoFromPath(qid.Path), mode: uint32(attr.Mode), uid: uint32(fs.opts.dfltuid), gid: uint32(fs.opts.dfltgid), @@ -846,7 +866,7 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) - stat.Ino = d.ino + stat.Ino = uint64(d.ino) stat.Size = atomic.LoadUint64(&d.size) // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. -- cgit v1.2.3 From 937912a4847cb1fd2acb79c0dd8fbc4b27695156 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Mon, 6 Jul 2020 17:24:25 -0700 Subject: Ensure sync is called for readonly file Calling sync on a readonly file flushes metadata that may have been modified, like last access time. Updates #1198 PiperOrigin-RevId: 319888290 --- pkg/sentry/fsimpl/gofer/handle.go | 5 +++++ pkg/sentry/fsimpl/gofer/regular_file.go | 25 ++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 724a3f1f7..8792ca4f2 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -126,11 +126,16 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } func (h *handle) sync(ctx context.Context) error { + // Handle most common case first. if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) err := syscall.Fsync(int(h.fd)) ctx.UninterruptibleSleepFinish(false) return err } + if h.file.isNil() { + // File hasn't been touched, there is nothing to sync. + return nil + } return h.file.fsync(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 3d2d3530a..a2f02d9c7 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -582,20 +582,19 @@ func (fd *regularFileFD) Sync(ctx context.Context) error { func (d *dentry) syncSharedHandle(ctx context.Context) error { d.handleMu.RLock() - if !d.handleWritable { - d.handleMu.RUnlock() - return nil - } - d.dataMu.Lock() - // Write dirty cached data to the remote file. - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) - d.dataMu.Unlock() - if err == nil { - // Sync the remote file. - err = d.handle.sync(ctx) + defer d.handleMu.RUnlock() + + if d.handleWritable { + d.dataMu.Lock() + // Write dirty cached data to the remote file. + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + d.dataMu.Unlock() + if err != nil { + return err + } } - d.handleMu.RUnlock() - return err + // Sync the remote file. + return d.handle.sync(ctx) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -- cgit v1.2.3 From e227450dc1ec1ef07b9a6a5983ce2f2e0dec22b1 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Mon, 6 Jul 2020 22:54:01 -0700 Subject: Call fdnotifier.UpdateFD() from fsimpl/gofer.specialFileFD. The fdnotifier package provides an API to a thread that continually epolls arbitrary host FDs. The set of events polled for each host FD is (intended to be) all events for which a waiter.Entry has expressed interest, as returned by waiter.Queue.Events() for the waiter.Queue registered to the given host FD. When the set of events changes (due to a change in the set of registered waiter.Entries), the mutator must call fdnotifier.UpdateFD() to recalculate the new event set and propagate it to the epoll FD. PiperOrigin-RevId: 319924719 --- pkg/sentry/fsimpl/gofer/special_file.go | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 3c4e7e2e4..c1e6b13e5 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -41,10 +41,10 @@ type specialFileFD struct { // file offset is significant, i.e. a regular file. seekable is immutable. seekable bool - // mayBlock is true if this file description represents a file for which - // queue may send I/O readiness events. mayBlock is immutable. - mayBlock bool - queue waiter.Queue + // haveQueue is true if this file description represents a file for which + // queue may send I/O readiness events. haveQueue is immutable. + haveQueue bool + queue waiter.Queue // If seekable is true, off is the file offset. off is protected by mu. mu sync.Mutex @@ -54,14 +54,14 @@ type specialFileFD struct { func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) { ftype := d.fileType() seekable := ftype == linux.S_IFREG - mayBlock := ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK + haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0 fd := &specialFileFD{ - handle: h, - seekable: seekable, - mayBlock: mayBlock, + handle: h, + seekable: seekable, + haveQueue: haveQueue, } fd.LockFD.Init(locks) - if mayBlock && h.fd >= 0 { + if haveQueue { if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil { return nil, err } @@ -70,7 +70,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, DenyPRead: !seekable, DenyPWrite: !seekable, }); err != nil { - if mayBlock && h.fd >= 0 { + if haveQueue { fdnotifier.RemoveFD(h.fd) } return nil, err @@ -80,7 +80,7 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, // Release implements vfs.FileDescriptionImpl.Release. func (fd *specialFileFD) Release() { - if fd.mayBlock && fd.handle.fd >= 0 { + if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } fd.handle.close(context.Background()) @@ -100,7 +100,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error { // Readiness implements waiter.Waitable.Readiness. func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { - if fd.mayBlock { + if fd.haveQueue { return fdnotifier.NonBlockingPoll(fd.handle.fd, mask) } return fd.fileDescription.Readiness(mask) @@ -108,8 +108,9 @@ func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { // EventRegister implements waiter.Waitable.EventRegister. func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { - if fd.mayBlock { + if fd.haveQueue { fd.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(fd.handle.fd) return } fd.fileDescription.EventRegister(e, mask) @@ -117,8 +118,9 @@ func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *specialFileFD) EventUnregister(e *waiter.Entry) { - if fd.mayBlock { + if fd.haveQueue { fd.queue.EventUnregister(e) + fdnotifier.UpdateFD(fd.handle.fd) return } fd.fileDescription.EventUnregister(e) -- cgit v1.2.3 From 10930189c3cd938e7526c55188ba2d814a7b8a43 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Tue, 7 Jul 2020 15:33:32 -0700 Subject: Fix mknod and inotify syscall test This change fixes a few things: - creating sockets using mknod(2) is supported via vfs2 - fsgofer can create regular files via mknod(2) - mode = 0 for mknod(2) will be interpreted as regular file in vfs2 as well Updates #2923 PiperOrigin-RevId: 320074267 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 13 ++++-- runsc/fsgofer/filter/config.go | 1 + runsc/fsgofer/fsgofer.go | 60 ++++++++++++++++++++++------ test/syscalls/BUILD | 2 + test/syscalls/linux/mknod.cc | 26 +++++++++++- 6 files changed, 84 insertions(+), 20 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ed40f6b52..a0f20c2d4 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -277,7 +277,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { - case 0, linux.S_IFREG: + case linux.S_IFREG: childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) case linux.S_IFIFO: childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode) diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index b12b5967b..6b14c2bef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -107,7 +107,7 @@ func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall addr := args[0].Pointer() mode := args[1].ModeT() dev := args[2].Uint() - return 0, nil, mknodat(t, linux.AT_FDCWD, addr, mode, dev) + return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) } // Mknodat implements Linux syscall mknodat(2). @@ -116,10 +116,10 @@ func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca addr := args[1].Pointer() mode := args[2].ModeT() dev := args[3].Uint() - return 0, nil, mknodat(t, dirfd, addr, mode, dev) + return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) } -func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error { +func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode, dev uint32) error { path, err := copyInPath(t, addr) if err != nil { return err @@ -129,9 +129,14 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint return err } defer tpop.Release() + + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + if mode.FileType() == 0 { + mode |= linux.ModeRegular + } major, minor := linux.DecodeDeviceID(dev) return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ - Mode: linux.FileMode(mode &^ t.FSContext().Umask()), + Mode: mode &^ linux.FileMode(t.FSContext().Umask()), DevMajor: uint32(major), DevMinor: minor, }) diff --git a/runsc/fsgofer/filter/config.go b/runsc/fsgofer/filter/config.go index 1dce36965..88814b83c 100644 --- a/runsc/fsgofer/filter/config.go +++ b/runsc/fsgofer/filter/config.go @@ -128,6 +128,7 @@ var allowedSyscalls = seccomp.SyscallRules{ syscall.SYS_MADVISE: {}, unix.SYS_MEMFD_CREATE: {}, /// Used by flipcall.PacketWindowAllocator.Init(). syscall.SYS_MKDIRAT: {}, + syscall.SYS_MKNODAT: {}, // Used by the Go runtime as a temporarily workaround for a Linux // 5.2-5.4 bug. // diff --git a/runsc/fsgofer/fsgofer.go b/runsc/fsgofer/fsgofer.go index 74977c313..b7521bda7 100644 --- a/runsc/fsgofer/fsgofer.go +++ b/runsc/fsgofer/fsgofer.go @@ -139,7 +139,7 @@ func (a *attachPoint) Attach() (p9.File, error) { return nil, fmt.Errorf("unable to open %q: %v", a.prefix, err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return nil, fmt.Errorf("unable to stat %q: %v", a.prefix, err) } @@ -352,7 +352,7 @@ func newFDMaybe(file *fd.FD) *fd.FD { return dup } -func stat(fd int) (syscall.Stat_t, error) { +func fstat(fd int) (syscall.Stat_t, error) { var stat syscall.Stat_t if err := syscall.Fstat(fd, &stat); err != nil { return syscall.Stat_t{}, err @@ -360,6 +360,14 @@ func stat(fd int) (syscall.Stat_t, error) { return stat, nil } +func stat(path string) (syscall.Stat_t, error) { + var stat syscall.Stat_t + if err := syscall.Stat(path, &stat); err != nil { + return syscall.Stat_t{}, err + } + return stat, nil +} + func fchown(fd int, uid p9.UID, gid p9.GID) error { return syscall.Fchownat(fd, "", int(uid), int(gid), linux.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW) } @@ -388,7 +396,7 @@ func (l *localFile) Open(flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { } } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { if newFile != l.file { newFile.Close() @@ -449,7 +457,7 @@ func (l *localFile) Create(name string, mode p9.OpenFlags, perm p9.FileMode, uid if err := fchown(child.FD(), uid, gid); err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } - stat, err := stat(child.FD()) + stat, err := fstat(child.FD()) if err != nil { return nil, nil, p9.QID{}, 0, extractErrno(err) } @@ -497,7 +505,7 @@ func (l *localFile) Mkdir(name string, perm p9.FileMode, uid p9.UID, gid p9.GID) if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -517,7 +525,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { return nil, nil, extractErrno(err) } - stat, err := stat(newFile.FD()) + stat, err := fstat(newFile.FD()) if err != nil { newFile.Close() return nil, nil, extractErrno(err) @@ -542,7 +550,7 @@ func (l *localFile) Walk(names []string) ([]p9.QID, p9.File, error) { if err != nil { return nil, nil, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { f.Close() return nil, nil, extractErrno(err) @@ -592,7 +600,7 @@ func (l *localFile) FSync() error { // GetAttr implements p9.File. func (l *localFile) GetAttr(_ p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { - stat, err := stat(l.file.FD()) + stat, err := fstat(l.file.FD()) if err != nil { return p9.QID{}, p9.AttrMask{}, p9.Attr{}, extractErrno(err) } @@ -880,7 +888,7 @@ func (l *localFile) Symlink(target, newName string, uid p9.UID, gid p9.GID) (p9. if err := fchown(f.FD(), uid, gid); err != nil { return p9.QID{}, extractErrno(err) } - stat, err := stat(f.FD()) + stat, err := fstat(f.FD()) if err != nil { return p9.QID{}, extractErrno(err) } @@ -907,13 +915,39 @@ func (l *localFile) Link(target p9.File, newName string) error { } // Mknod implements p9.File. -// -// Not implemented. -func (*localFile) Mknod(_ string, _ p9.FileMode, _ uint32, _ uint32, _ p9.UID, _ p9.GID) (p9.QID, error) { +func (l *localFile) Mknod(name string, mode p9.FileMode, _ uint32, _ uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + conf := l.attachPoint.conf + if conf.ROMount { + if conf.PanicOnWrite { + panic("attempt to write to RO mount") + } + return p9.QID{}, syscall.EROFS + } + + hostPath := path.Join(l.hostPath, name) + + // Return EEXIST if the file already exists. + if _, err := stat(hostPath); err == nil { + return p9.QID{}, syscall.EEXIST + } + // From mknod(2) man page: // "EPERM: [...] if the filesystem containing pathname does not support // the type of node requested." - return p9.QID{}, syscall.EPERM + if mode.FileType() != p9.ModeRegular { + return p9.QID{}, syscall.EPERM + } + + // Allow Mknod to create regular files. + if err := syscall.Mknod(hostPath, uint32(mode), 0); err != nil { + return p9.QID{}, err + } + + stat, err := stat(hostPath) + if err != nil { + return p9.QID{}, extractErrno(err) + } + return l.attachPoint.makeQID(stat), nil } // UnlinkAt implements p9.File. diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 7c4cd8192..28ef55945 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -283,6 +283,7 @@ syscall_test( size = "medium", add_overlay = False, # TODO(gvisor.dev/issue/317): enable when fixed. test = "//test/syscalls/linux:inotify_test", + vfs2 = "True", ) syscall_test( @@ -351,6 +352,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:mknod_test", + vfs2 = "True", ) syscall_test( diff --git a/test/syscalls/linux/mknod.cc b/test/syscalls/linux/mknod.cc index 4c45766c7..05dfb375a 100644 --- a/test/syscalls/linux/mknod.cc +++ b/test/syscalls/linux/mknod.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -39,7 +40,28 @@ TEST(MknodTest, RegularFile) { EXPECT_THAT(mknod(node1.c_str(), 0, 0), SyscallSucceeds()); } -TEST(MknodTest, MknodAtRegularFile) { +TEST(MknodTest, RegularFilePermissions) { + const std::string node = NewTempAbsPath(); + mode_t newUmask = 0077; + umask(newUmask); + + // Attempt to open file with mode 0777. Not specifying file type should create + // a regualar file. + mode_t perms = S_IRWXU | S_IRWXG | S_IRWXO; + EXPECT_THAT(mknod(node.c_str(), perms, 0), SyscallSucceeds()); + + // In the absence of a default ACL, the permissions of the created node are + // (mode & ~umask). -- mknod(2) + mode_t wantPerms = perms & ~newUmask; + struct stat st; + ASSERT_THAT(stat(node.c_str(), &st), SyscallSucceeds()); + ASSERT_EQ(st.st_mode & 0777, wantPerms); + + // "Zero file type is equivalent to type S_IFREG." - mknod(2) + ASSERT_EQ(st.st_mode & S_IFMT, S_IFREG); +} + +TEST(MknodTest, MknodAtFIFO) { const TempPath dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); const std::string fifo_relpath = NewTempRelPath(); const std::string fifo = JoinPath(dir.path(), fifo_relpath); @@ -72,7 +94,7 @@ TEST(MknodTest, MknodOnExistingPathFails) { TEST(MknodTest, UnimplementedTypesReturnError) { const std::string path = NewTempAbsPath(); - if (IsRunningOnGvisor()) { + if (IsRunningWithVFS1()) { ASSERT_THAT(mknod(path.c_str(), S_IFSOCK, 0), SyscallFailsWithErrno(EOPNOTSUPP)); } -- cgit v1.2.3 From efa2615eb008a642dc542176759dc560c5f48a2d Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Tue, 7 Jul 2020 21:35:47 -0700 Subject: [vfs2] Remove VFS1 usage in VDSO. Removed VDSO dependency on VFS1. Resolves #2921 PiperOrigin-RevId: 320122176 --- pkg/sentry/fsimpl/testutil/kernel.go | 2 +- pkg/sentry/loader/BUILD | 4 --- pkg/sentry/loader/elf.go | 15 +++++++-- pkg/sentry/loader/loader.go | 17 ---------- pkg/sentry/loader/vdso.go | 61 +++--------------------------------- runsc/boot/loader.go | 4 +-- 6 files changed, 19 insertions(+), 84 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index c16a36cdb..ecc81e6bf 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -73,7 +73,7 @@ func Boot() (*kernel.Kernel, error) { k.SetMemoryFile(mf) // Pass k as the platform since it is savable, unlike the actual platform. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } diff --git a/pkg/sentry/loader/BUILD b/pkg/sentry/loader/BUILD index c6aa65f28..34bdb0b69 100644 --- a/pkg/sentry/loader/BUILD +++ b/pkg/sentry/loader/BUILD @@ -30,9 +30,6 @@ go_library( "//pkg/rand", "//pkg/safemem", "//pkg/sentry/arch", - "//pkg/sentry/fs", - "//pkg/sentry/fs/anon", - "//pkg/sentry/fs/fsutil", "//pkg/sentry/fsbridge", "//pkg/sentry/kernel/auth", "//pkg/sentry/limits", @@ -45,6 +42,5 @@ go_library( "//pkg/syserr", "//pkg/syserror", "//pkg/usermem", - "//pkg/waiter", ], ) diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index 616fafa2c..ddeaff3db 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -90,14 +90,23 @@ type elfInfo struct { sharedObject bool } +// fullReader interface extracts the ReadFull method from fsbridge.File so that +// client code does not need to define an entire fsbridge.File when only read +// functionality is needed. +// +// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap +// vfs.FileDescription's PRead/Read instead. +type fullReader interface { + // ReadFull is the same as fsbridge.File.ReadFull. + ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) +} + // parseHeader parse the ELF header, verifying that this is a supported ELF // file and returning the ELF program headers. // // This is similar to elf.NewFile, except that it is more strict about what it // accepts from the ELF, and it doesn't parse unnecessary parts of the file. -// -// ctx may be nil if f does not need it. -func parseHeader(ctx context.Context, f fsbridge.File) (elfInfo, error) { +func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) { // Check ident first; it will tell us the endianness of the rest of the // structs. var ident [elf.EI_NIDENT]byte diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 88449fe95..e65ec4771 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -27,7 +27,6 @@ import ( "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" @@ -80,22 +79,6 @@ type LoadArgs struct { Features *cpuid.FeatureSet } -// readFull behaves like io.ReadFull for an *fs.File. -func readFull(ctx context.Context, f *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { - var total int64 - for dst.NumBytes() > 0 { - n, err := f.Preadv(ctx, dst, offset+total) - total += n - if err == io.EOF && total != 0 { - return total, io.ErrUnexpectedEOF - } else if err != nil { - return total, err - } - dst = dst.DropFirst64(n) - } - return total, nil -} - // openPath opens args.Filename and checks that it is valid for loading. // // openPath returns an *fs.Dirent and *fs.File for args.Filename, which is not diff --git a/pkg/sentry/loader/vdso.go b/pkg/sentry/loader/vdso.go index 165869028..05a294fe6 100644 --- a/pkg/sentry/loader/vdso.go +++ b/pkg/sentry/loader/vdso.go @@ -26,10 +26,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" - "gvisor.dev/gvisor/pkg/sentry/fs" - "gvisor.dev/gvisor/pkg/sentry/fs/anon" - "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" - "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" @@ -37,7 +33,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" - "gvisor.dev/gvisor/pkg/waiter" ) const vdsoPrelink = 0xffffffffff700000 @@ -55,52 +50,11 @@ func (f *fileContext) Value(key interface{}) interface{} { } } -// byteReader implements fs.FileOperations for reading from a []byte source. -type byteReader struct { - fsutil.FileNoFsync `state:"nosave"` - fsutil.FileNoIoctl `state:"nosave"` - fsutil.FileNoMMap `state:"nosave"` - fsutil.FileNoSplice `state:"nosave"` - fsutil.FileNoopFlush `state:"nosave"` - fsutil.FileNoopRelease `state:"nosave"` - fsutil.FileNotDirReaddir `state:"nosave"` - fsutil.FilePipeSeek `state:"nosave"` - fsutil.FileUseInodeUnstableAttr `state:"nosave"` - waiter.AlwaysReady `state:"nosave"` - +type byteFullReader struct { data []byte } -var _ fs.FileOperations = (*byteReader)(nil) - -// newByteReaderFile creates a fake file to read data from. -// -// TODO(gvisor.dev/issue/2921): Convert to VFS2. -func newByteReaderFile(ctx context.Context, data []byte) *fs.File { - // Create a fake inode. - inode := fs.NewInode( - ctx, - &fsutil.SimpleFileInode{}, - fs.NewPseudoMountSource(ctx), - fs.StableAttr{ - Type: fs.Anonymous, - DeviceID: anon.PseudoDevice.DeviceID(), - InodeID: anon.PseudoDevice.NextIno(), - BlockSize: usermem.PageSize, - }) - - // Use the fake inode to create a fake dirent. - dirent := fs.NewTransientDirent(inode) - defer dirent.DecRef() - - // Use the fake dirent to make a fake file. - flags := fs.FileFlags{Read: true, Pread: true} - return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{ - data: data, - }) -} - -func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { +func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { if offset < 0 { return 0, syserror.EINVAL } @@ -111,10 +65,6 @@ func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequ return int64(n), err } -func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { - panic("Write not supported") -} - // validateVDSO checks that the VDSO can be loaded by loadVDSO. // // VDSOs are special (see below). Since we are going to map the VDSO directly @@ -130,7 +80,7 @@ func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSeq // * PT_LOAD segments don't extend beyond the end of the file. // // ctx may be nil if f does not need it. -func validateVDSO(ctx context.Context, f fsbridge.File, size uint64) (elfInfo, error) { +func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) { info, err := parseHeader(ctx, f) if err != nil { log.Infof("Unable to parse VDSO header: %v", err) @@ -248,13 +198,12 @@ func getSymbolValueFromVDSO(symbol string) (uint64, error) { // PrepareVDSO validates the system VDSO and returns a VDSO, containing the // param page for updating by the kernel. -func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) { - vdsoFile := fsbridge.NewFSFile(newByteReaderFile(ctx, vdsoBin)) +func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) { + vdsoFile := &byteFullReader{data: vdsoBin} // First make sure the VDSO is valid. vdsoFile does not use ctx, so a // nil context can be passed. info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin))) - vdsoFile.DecRef() if err != nil { return nil, err } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index b5df1deb9..0940d2a6d 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -227,9 +227,7 @@ func New(args Args) (*Loader, error) { // Create VDSO. // // Pass k as the platform since it is savable, unlike the actual platform. - // - // FIXME(b/109889800): Use non-nil context. - vdso, err := loader.PrepareVDSO(nil, k) + vdso, err := loader.PrepareVDSO(k) if err != nil { return nil, fmt.Errorf("creating vdso: %v", err) } -- cgit v1.2.3 From c4815af9475cc4680c6d598d9c930de892c98aae Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 8 Jul 2020 17:10:35 -0700 Subject: Add shared mount hints to VFS2 Container restart test is disabled for VFS2 for now. Updates #1487 PiperOrigin-RevId: 320296401 --- pkg/sentry/fsimpl/testutil/kernel.go | 6 +- pkg/sentry/loader/loader.go | 4 +- pkg/sentry/vfs/vfs.go | 3 + runsc/boot/fs.go | 9 +- runsc/boot/loader.go | 8 +- runsc/boot/loader_test.go | 2 +- runsc/boot/vfs.go | 116 ++++++++++++++++---- runsc/container/multi_container_test.go | 184 +++++++++++++++----------------- 8 files changed, 208 insertions(+), 124 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index ecc81e6bf..e743e8114 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -62,6 +62,7 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("creating platform: %v", err) } + kernel.VFS2Enabled = true k := &kernel.Kernel{ Platform: plat, } @@ -103,11 +104,6 @@ func Boot() (*kernel.Kernel, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } - kernel.VFS2Enabled = true - - if err := k.VFS().Init(); err != nil { - return nil, fmt.Errorf("VFS init: %v", err) - } k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserMount: true, AllowUserList: true, diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index e65ec4771..986c7fb4d 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -221,14 +221,14 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V // Load the executable itself. loaded, ac, file, newArgv, err := loadExecutable(ctx, args) if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } defer file.DecRef() // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) if err != nil { - return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) + return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux()) } // Setup the heap. brk starts at the next page after the end of the diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 58c7ad778..522e27475 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -123,6 +123,9 @@ type VirtualFilesystem struct { // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. func (vfs *VirtualFilesystem) Init() error { + if vfs.mountpoints != nil { + panic("VFS already initialized") + } vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) vfs.devices = make(map[devTuple]*registeredDevice) vfs.anonBlockDevMinorNext = 1 diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index e83584b82..59639ba19 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -29,6 +29,7 @@ import ( _ "gvisor.dev/gvisor/pkg/sentry/fs/sys" _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" _ "gvisor.dev/gvisor/pkg/sentry/fs/tty" + "gvisor.dev/gvisor/pkg/sentry/vfs" specs "github.com/opencontainers/runtime-spec/specs-go" "gvisor.dev/gvisor/pkg/abi/linux" @@ -390,6 +391,10 @@ type mountHint struct { // root is the inode where the volume is mounted. For mounts with 'pod' share // the volume is mounted once and then bind mounted inside the containers. root *fs.Inode + + // vfsMount is the master mount for the volume. For mounts with 'pod' share + // the master volume is bind mounted inside the containers. + vfsMount *vfs.Mount } func (m *mountHint) setField(key, val string) error { @@ -571,9 +576,9 @@ func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hin // processHints processes annotations that container hints about how volumes // should be mounted (e.g. a volume shared between containers). It must be // called for the root container only. -func (c *containerMounter) processHints(conf *Config) error { +func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error { if conf.VFS2 { - return nil + return c.processHintsVFS2(conf, creds) } ctx := c.k.SupervisorContext() for _, hint := range c.hints.mounts { diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0940d2a6d..0c0423ab2 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -298,6 +298,12 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if kernel.VFS2Enabled { + if err := registerFilesystems(k); err != nil { + return nil, fmt.Errorf("registering filesystems: %w", err) + } + } + if err := adjustDirentCache(k); err != nil { return nil, err } @@ -559,7 +565,7 @@ func (l *Loader) run() error { l.startGoferMonitor(l.sandboxID, l.goferFDs) mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { return err } if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil { diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e448fd773..b723e4335 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -480,7 +480,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer loaderCleanup() mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) - if err := mntr.processHints(l.conf); err != nil { + if err := mntr.processHints(l.conf, l.rootProcArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index b68117867..6ee6fae04 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -43,7 +43,11 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { +func registerFilesystems(k *kernel.Kernel) error { + ctx := k.SupervisorContext() + creds := auth.NewRootCredentials(k.RootUserNamespace()) + vfsObj := k.VFS() + vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ AllowUserList: true, // TODO(b/29356795): Users may mount this once the terminals are in a @@ -113,9 +117,6 @@ func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, cre } func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { - if err := mntr.k.VFS().Init(); err != nil { - return fmt.Errorf("failed to initialize VFS: %w", err) - } mns, err := mntr.setupVFS2(ctx, conf, procArgs) if err != nil { return fmt.Errorf("failed to setupFS: %w", err) @@ -144,10 +145,6 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals rootCtx := procArgs.NewContext(c.k) - if err := registerFilesystems(rootCtx, c.k.VFS(), rootCreds); err != nil { - return nil, fmt.Errorf("register filesystems: %w", err) - } - mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds) if err != nil { return nil, fmt.Errorf("creating mount namespace: %w", err) @@ -182,8 +179,14 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, for i := range mounts { submount := &mounts[i] log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) - if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { - return err + if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() { + if err := c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint); err != nil { + return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err) + } + } else { + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, submount); err != nil { + return fmt.Errorf("mount submount %q: %w", submount.Destination, err) + } } } @@ -257,20 +260,18 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values // used for mounts. func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) { - var ( - fsName string - data []string - ) + fsName := m.Type + var data []string // Find filesystem name and FS specific data field. switch m.Type { case devpts.Name, devtmpfs.Name, proc.Name, sys.Name: - fsName = m.Type + // Nothing to do. + case nonefs: fsName = sys.Name - case tmpfs.Name: - fsName = m.Type + case tmpfs.Name: var err error data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...) if err != nil { @@ -279,10 +280,16 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF case bind: fsName = gofer.Name + if m.fd == 0 { + // Check that an FD was provided to fails fast. Technically FD=0 is valid, + // but unlikely to be correct in this context. + return "", nil, fmt.Errorf("9P mount requires a connection FD") + } data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */) default: log.Warningf("ignoring unknown filesystem type %q", m.Type) + return "", nil, nil } opts := &vfs.MountOptions{ @@ -322,7 +329,7 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s } _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) if err == nil { - // Mount point exists, nothing else to do. + log.Debugf("Mount point %q already exists", currentPath) return nil } if err != syserror.ENOENT { @@ -400,3 +407,76 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds return fmt.Errorf(`stating "/tmp" inside container: %w`, err) } } + +// processHintsVFS2 processes annotations that container hints about how volumes +// should be mounted (e.g. a volume shared between containers). It must be +// called for the root container only. +func (c *containerMounter) processHintsVFS2(conf *Config, creds *auth.Credentials) error { + ctx := c.k.SupervisorContext() + for _, hint := range c.hints.mounts { + // TODO(b/142076984): Only support tmpfs for now. Bind mounts require a + // common gofer to mount all shared volumes. + if hint.mount.Type != tmpfs.Name { + continue + } + + log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type) + mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds) + if err != nil { + return fmt.Errorf("mounting shared master %q: %v", hint.name, err) + } + hint.vfsMount = mnt + } + return nil +} + +// mountSharedMasterVFS2 mounts the master of a volume that is shared among +// containers in a pod. +func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) { + // Map mount type to filesystem name, and parse out the options that we are + // capable of dealing with. + mntFD := &mountAndFD{Mount: hint.mount} + fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD) + if err != nil { + return nil, err + } + if len(fsName) == 0 { + return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type) + } + return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts) +} + +// mountSharedSubmount binds mount to a previously mounted volume that is shared +// among containers in the same pod. +func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) error { + if err := source.checkCompatible(mount); err != nil { + return err + } + + _, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount}) + if err != nil { + return err + } + newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts) + if err != nil { + return err + } + defer newMnt.DecRef() + + root := mns.Root() + defer root.DecRef() + if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { + return err + } + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(mount.Destination), + } + if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil { + return err + } + log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name) + return nil +} diff --git a/runsc/container/multi_container_test.go b/runsc/container/multi_container_test.go index a27a01942..e189648f4 100644 --- a/runsc/container/multi_container_test.go +++ b/runsc/container/multi_container_test.go @@ -100,19 +100,20 @@ type execDesc struct { c *Container cmd []string want int - desc string + name string } -func execMany(execs []execDesc) error { +func execMany(t *testing.T, execs []execDesc) { for _, exec := range execs { - args := &control.ExecArgs{Argv: exec.cmd} - if ws, err := exec.c.executeSync(args); err != nil { - return fmt.Errorf("error executing %+v: %v", args, err) - } else if ws.ExitStatus() != exec.want { - return fmt.Errorf("%q: exec %q got exit status: %d, want: %d", exec.desc, exec.cmd, ws.ExitStatus(), exec.want) - } + t.Run(exec.name, func(t *testing.T) { + args := &control.ExecArgs{Argv: exec.cmd} + if ws, err := exec.c.executeSync(args); err != nil { + t.Errorf("error executing %+v: %v", args, err) + } else if ws.ExitStatus() != exec.want { + t.Errorf("%q: exec %q got exit status: %d, want: %d", exec.name, exec.cmd, ws.ExitStatus(), exec.want) + } + }) } - return nil } func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) { @@ -1072,7 +1073,7 @@ func TestMultiContainerContainerDestroyStress(t *testing.T) { // Test that pod shared mounts are properly mounted in 2 containers and that // changes from one container is reflected in the other. func TestMultiContainerSharedMount(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1110,84 +1111,82 @@ func TestMultiContainerSharedMount(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "remove file from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[1], cmd: []string{"/bin/mkdir", file1}, - desc: "create directory in container1", + name: "create directory in container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-d", file0}, - desc: "dir appears in container0", + name: "dir appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", file1}, - desc: "dir appears in container1", + name: "dir appears in container1", }, { c: containers[0], cmd: []string{"/bin/rmdir", file0}, - desc: "create directory in container0", + name: "remove directory from container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-d", file0}, - desc: "dir removed from container0", + name: "dir removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-d", file1}, - desc: "dir removed from container1", + name: "dir removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that pod mounts are mounted as readonly when requested. func TestMultiContainerSharedMountReadonly(t *testing.T) { - for name, conf := range configs(t, all...) { + for name, conf := range configsWithVFS2(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() if err != nil { @@ -1225,35 +1224,34 @@ func TestMultiContainerSharedMountReadonly(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", + name: "directory is mounted in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", + name: "directory is mounted in container1", }, { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, + cmd: []string{"/bin/touch", file0}, want: 1, - desc: "fails to write to container0", + name: "fails to write to container0", }, { c: containers[1], - cmd: []string{"/usr/bin/touch", file1}, + cmd: []string{"/bin/touch", file1}, want: 1, - desc: "fails to write to container1", + name: "fails to write to container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } // Test that shared pod mounts continue to work after container is restarted. func TestMultiContainerSharedMountRestart(t *testing.T) { + //TODO(gvisor.dev/issue/1487): This is failing with VFS2. for name, conf := range configs(t, all...) { t.Run(name, func(t *testing.T) { rootDir, cleanup, err := testutil.SetupRootDir() @@ -1291,23 +1289,21 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { execs := []execDesc{ { c: containers[0], - cmd: []string{"/usr/bin/touch", file0}, - desc: "create file in container0", + cmd: []string{"/bin/touch", file0}, + name: "create file in container0", }, { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file appears in container0", + name: "file appears in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file appears in container1", + name: "file appears in container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) containers[1].Destroy() @@ -1334,32 +1330,30 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { { c: containers[0], cmd: []string{"/usr/bin/test", "-f", file0}, - desc: "file is still in container0", + name: "file is still in container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "-f", file1}, - desc: "file is still in container1", + name: "file is still in container1", }, { c: containers[1], cmd: []string{"/bin/rm", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, { c: containers[0], cmd: []string{"/usr/bin/test", "!", "-f", file0}, - desc: "file removed from container0", + name: "file removed from container0", }, { c: containers[1], cmd: []string{"/usr/bin/test", "!", "-f", file1}, - desc: "file removed from container1", + name: "file removed from container1", }, } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) - } + execMany(t, execs) }) } } @@ -1367,53 +1361,53 @@ func TestMultiContainerSharedMountRestart(t *testing.T) { // Test that unsupported pod mounts options are ignored when matching master and // slave mounts. func TestMultiContainerSharedMountUnsupportedOptions(t *testing.T) { - rootDir, cleanup, err := testutil.SetupRootDir() - if err != nil { - t.Fatalf("error creating root dir: %v", err) - } - defer cleanup() - - conf := testutil.TestConfig(t) - conf.RootDir = rootDir + for name, conf := range configsWithVFS2(t, all...) { + t.Run(name, func(t *testing.T) { + rootDir, cleanup, err := testutil.SetupRootDir() + if err != nil { + t.Fatalf("error creating root dir: %v", err) + } + defer cleanup() + conf.RootDir = rootDir - // Setup the containers. - sleep := []string{"/bin/sleep", "100"} - podSpec, ids := createSpecs(sleep, sleep) - mnt0 := specs.Mount{ - Destination: "/mydir/test", - Source: "/some/dir", - Type: "tmpfs", - Options: []string{"rw", "rbind", "relatime"}, - } - podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) + // Setup the containers. + sleep := []string{"/bin/sleep", "100"} + podSpec, ids := createSpecs(sleep, sleep) + mnt0 := specs.Mount{ + Destination: "/mydir/test", + Source: "/some/dir", + Type: "tmpfs", + Options: []string{"rw", "rbind", "relatime"}, + } + podSpec[0].Mounts = append(podSpec[0].Mounts, mnt0) - mnt1 := mnt0 - mnt1.Destination = "/mydir2/test2" - mnt1.Options = []string{"rw", "nosuid"} - podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) + mnt1 := mnt0 + mnt1.Destination = "/mydir2/test2" + mnt1.Options = []string{"rw", "nosuid"} + podSpec[1].Mounts = append(podSpec[1].Mounts, mnt1) - createSharedMount(mnt0, "test-mount", podSpec...) + createSharedMount(mnt0, "test-mount", podSpec...) - containers, cleanup, err := startContainers(conf, podSpec, ids) - if err != nil { - t.Fatalf("error starting containers: %v", err) - } - defer cleanup() + containers, cleanup, err := startContainers(conf, podSpec, ids) + if err != nil { + t.Fatalf("error starting containers: %v", err) + } + defer cleanup() - execs := []execDesc{ - { - c: containers[0], - cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, - desc: "directory is mounted in container0", - }, - { - c: containers[1], - cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, - desc: "directory is mounted in container1", - }, - } - if err := execMany(execs); err != nil { - t.Fatal(err.Error()) + execs := []execDesc{ + { + c: containers[0], + cmd: []string{"/usr/bin/test", "-d", mnt0.Destination}, + name: "directory is mounted in container0", + }, + { + c: containers[1], + cmd: []string{"/usr/bin/test", "-d", mnt1.Destination}, + name: "directory is mounted in container1", + }, + } + execMany(t, execs) + }) } } -- cgit v1.2.3 From abffebde7be2dcdb4564e45f845d7c150ced0ccb Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 7 Jul 2020 21:48:25 -0400 Subject: Gate FUSE behind a runsc flag This change gates all FUSE commands (by gating /dev/fuse) behind a runsc flag. In order to use FUSE commands, use the --fuse flag with the --vfs2 flag. Check if FUSE is enabled by running dmesg in the sandbox. --- pkg/sentry/fsimpl/fuse/BUILD | 1 + pkg/sentry/fsimpl/fuse/dev.go | 5 +++++ pkg/sentry/kernel/kernel.go | 4 ++++ pkg/sentry/kernel/syslog.go | 9 +++++++++ runsc/boot/config.go | 7 +++++++ runsc/boot/loader.go | 4 ++++ runsc/boot/vfs.go | 14 ++++++++++---- runsc/main.go | 2 ++ test/runner/defs.bzl | 20 +++++++++++++++++++- test/runner/runner.go | 10 ++++++++++ test/syscalls/BUILD | 1 + test/syscalls/linux/dev.cc | 2 +- test/util/test_util.cc | 6 ++++++ test/util/test_util.h | 1 + 14 files changed, 80 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 41567967d..3e00c2abb 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/kernel", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index f6a67d005..dc33268af 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -30,6 +31,10 @@ type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + if !kernel.FUSEEnabled { + return nil, syserror.ENOENT + } + var fd DeviceFD if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 2177b785a..240cd6fe0 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -81,6 +81,10 @@ import ( // easy access everywhere. To be removed once VFS2 becomes the default. var VFS2Enabled = false +// FUSEEnabled is set to true when FUSE is enabled. Added as a global for allow +// easy access everywhere. To be removed once FUSE is completed. +var FUSEEnabled = false + // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // diff --git a/pkg/sentry/kernel/syslog.go b/pkg/sentry/kernel/syslog.go index 4607cde2f..a83ce219c 100644 --- a/pkg/sentry/kernel/syslog.go +++ b/pkg/sentry/kernel/syslog.go @@ -98,6 +98,15 @@ func (s *syslog) Log() []byte { s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, selectMessage()))...) } + if VFS2Enabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up VFS2..."))...) + if FUSEEnabled { + time += rand.Float64() / 2 + s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Setting up FUSE..."))...) + } + } + time += rand.Float64() / 2 s.msg = append(s.msg, []byte(fmt.Sprintf(format, time, "Ready!"))...) diff --git a/runsc/boot/config.go b/runsc/boot/config.go index bb01b8fb5..80da8b3e6 100644 --- a/runsc/boot/config.go +++ b/runsc/boot/config.go @@ -274,6 +274,9 @@ type Config struct { // Enables VFS2 (not plumbled through yet). VFS2 bool + + // Enables FUSE usage (not plumbled through yet). + FUSE bool } // ToFlags returns a slice of flags that correspond to the given Config. @@ -325,5 +328,9 @@ func (c *Config) ToFlags() []string { f = append(f, "--vfs2=true") } + if c.FUSE { + f = append(f, "--fuse=true") + } + return f } diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 0c0423ab2..93ac7ec41 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -205,6 +205,10 @@ func New(args Args) (*Loader, error) { // Is this a VFSv2 kernel? if args.Conf.VFS2 { kernel.VFS2Enabled = true + if args.Conf.FUSE { + kernel.FUSEEnabled = true + } + vfs2.Override() } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 6ee6fae04..56f4ba15d 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -86,9 +86,12 @@ func registerFilesystems(k *kernel.Kernel) error { return fmt.Errorf("registering ttydev: %w", err) } - if err := fuse.Register(vfsObj); err != nil { - return fmt.Errorf("registering fusedev: %w", err) + if kernel.FUSEEnabled { + if err := fuse.Register(vfsObj); err != nil { + return fmt.Errorf("registering fusedev: %w", err) + } } + if err := tundev.Register(vfsObj); err != nil { return fmt.Errorf("registering tundev: %v", err) } @@ -110,8 +113,11 @@ func registerFilesystems(k *kernel.Kernel) error { if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil { return fmt.Errorf("creating tundev devtmpfs files: %v", err) } - if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { - return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + + if kernel.FUSEEnabled { + if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil { + return fmt.Errorf("creating fusedev devtmpfs files: %w", err) + } } return nil } diff --git a/runsc/main.go b/runsc/main.go index c9f47c579..69cb505fa 100644 --- a/runsc/main.go +++ b/runsc/main.go @@ -88,6 +88,7 @@ var ( referenceLeakMode = flag.String("ref-leak-mode", "disabled", "sets reference leak check mode: disabled (default), log-names, log-traces.") cpuNumFromQuota = flag.Bool("cpu-num-from-quota", false, "set cpu number to cpu quota (least integer greater or equal to quota value, but not less than 2)") vfs2Enabled = flag.Bool("vfs2", false, "TEST ONLY; use while VFSv2 is landing. This uses the new experimental VFS layer.") + fuseEnabled = flag.Bool("fuse", false, "TEST ONLY; use while FUSE in VFSv2 is landing. This allows the use of the new experimental FUSE filesystem.") // Test flags, not to be used outside tests, ever. testOnlyAllowRunAsCurrentUserWithoutChroot = flag.Bool("TESTONLY-unsafe-nonroot", false, "TEST ONLY; do not ever use! This skips many security measures that isolate the host from the sandbox.") @@ -242,6 +243,7 @@ func main() { OverlayfsStaleRead: *overlayfsStaleRead, CPUNumFromQuota: *cpuNumFromQuota, VFS2: *vfs2Enabled, + FUSE: *fuseEnabled, QDisc: queueingDiscipline, TestOnlyAllowRunAsCurrentUserWithoutChroot: *testOnlyAllowRunAsCurrentUserWithoutChroot, TestOnlyTestNameEnv: *testOnlyTestNameEnv, diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index 921e499be..600cb5192 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -61,7 +61,8 @@ def _syscall_test( file_access = "exclusive", overlay = False, add_uds_tree = False, - vfs2 = False): + vfs2 = False, + fuse = False): # Prepend "runsc" to non-native platform names. full_platform = platform if platform == "native" else "runsc_" + platform @@ -73,6 +74,8 @@ def _syscall_test( name += "_overlay" if vfs2: name += "_vfs2" + if fuse: + name += "_fuse" if network != "none": name += "_" + network + "net" @@ -107,6 +110,7 @@ def _syscall_test( "--overlay=" + str(overlay), "--add-uds-tree=" + str(add_uds_tree), "--vfs2=" + str(vfs2), + "--fuse=" + str(fuse), ] # Call the rule above. @@ -129,6 +133,7 @@ def syscall_test( add_uds_tree = False, add_hostinet = False, vfs2 = False, + fuse = False, tags = None): """syscall_test is a macro that will create targets for all platforms. @@ -188,6 +193,19 @@ def syscall_test( vfs2 = True, ) + if vfs2 and fuse: + _syscall_test( + test = test, + shard_count = shard_count, + size = size, + platform = default_platform, + use_tmpfs = use_tmpfs, + add_uds_tree = add_uds_tree, + tags = platforms[default_platform] + vfs2_tags, + vfs2 = True, + fuse = True, + ) + # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests. if add_overlay: _syscall_test( diff --git a/test/runner/runner.go b/test/runner/runner.go index 5456e46a6..2296f3a46 100644 --- a/test/runner/runner.go +++ b/test/runner/runner.go @@ -47,6 +47,7 @@ var ( fileAccess = flag.String("file-access", "exclusive", "mounts root in exclusive or shared mode") overlay = flag.Bool("overlay", false, "wrap filesystem mounts with writable tmpfs overlay") vfs2 = flag.Bool("vfs2", false, "enable VFS2") + fuse = flag.Bool("fuse", false, "enable FUSE") parallel = flag.Bool("parallel", false, "run tests in parallel") runscPath = flag.String("runsc", "", "path to runsc binary") @@ -149,6 +150,9 @@ func runRunsc(tc gtest.TestCase, spec *specs.Spec) error { } if *vfs2 { args = append(args, "-vfs2") + if *fuse { + args = append(args, "-fuse") + } } if *debug { args = append(args, "-debug", "-log-packets=true") @@ -358,6 +362,12 @@ func runTestCaseRunsc(testBin string, tc gtest.TestCase, t *testing.T) { vfsVar := "GVISOR_VFS" if *vfs2 { env = append(env, vfsVar+"=VFS2") + fuseVar := "FUSE_ENABLED" + if *fuse { + env = append(env, fuseVar+"=TRUE") + } else { + env = append(env, fuseVar+"=FALSE") + } } else { env = append(env, vfsVar+"=VFS1") } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index 28ef55945..c06a75ada 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -146,6 +146,7 @@ syscall_test( ) syscall_test( + fuse = "True", test = "//test/syscalls/linux:dev_test", vfs2 = "True", ) diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 3c88c4cbd..6fa16208e 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -156,7 +156,7 @@ TEST(DevTest, TTYExists) { TEST(DevTest, OpenDevFuse) { // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new // device registration is complete. - SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor() || !IsFUSEEnabled()); ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); } diff --git a/test/util/test_util.cc b/test/util/test_util.cc index 8a037f45f..d0c1d6426 100644 --- a/test/util/test_util.cc +++ b/test/util/test_util.cc @@ -42,6 +42,7 @@ namespace testing { constexpr char kGvisorNetwork[] = "GVISOR_NETWORK"; constexpr char kGvisorVfs[] = "GVISOR_VFS"; +constexpr char kFuseEnabled[] = "FUSE_ENABLED"; bool IsRunningOnGvisor() { return GvisorPlatform() != Platform::kNative; } @@ -68,6 +69,11 @@ bool IsRunningWithVFS1() { return strcmp(env, "VFS1") == 0; } +bool IsFUSEEnabled() { + const char* env = getenv(kFuseEnabled); + return env && strcmp(env, "TRUE") == 0; +} + // Inline cpuid instruction. Preserve %ebx/%rbx register. In PIC compilations // %ebx contains the address of the global offset table. %rbx is occasionally // used to address stack variables in presence of dynamic allocas. diff --git a/test/util/test_util.h b/test/util/test_util.h index 109078fc7..89ac575bd 100644 --- a/test/util/test_util.h +++ b/test/util/test_util.h @@ -225,6 +225,7 @@ const std::string GvisorPlatform(); bool IsRunningWithHostinet(); // TODO(gvisor.dev/issue/1624): Delete once VFS1 is gone. bool IsRunningWithVFS1(); +bool IsFUSEEnabled(); #ifdef __linux__ void SetupGvisorDeathTest(); -- cgit v1.2.3 From 6994f4d5912ce9dc9233aebe9902892824904b71 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Mon, 13 Jul 2020 13:33:09 -0700 Subject: [vfs2] Make gofer metadata atomics consistent For accessing metadata fields: - If metadataMu is locked, we can access without atomics - If metadataMu is unlocked, we should use atomics For mutating metadata fields: - Always lock metadataMu and use atomics. There were some instances of inconsistencies which have been fixed. PiperOrigin-RevId: 321022895 --- pkg/sentry/fsimpl/gofer/gofer.go | 24 +++++++++++++++--------- pkg/sentry/fsimpl/gofer/regular_file.go | 4 +++- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 2b83094cd..b74d489a0 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -602,8 +602,14 @@ type dentry struct { // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent - // Cached metadata; protected by metadataMu and accessed using atomic - // memory operations unless otherwise specified. + // Cached metadata; protected by metadataMu. + // To access: + // - In situations where consistency is not required (like stat), these + // can be accessed using atomic operations only (without locking). + // - Lock metadataMu and can access without atomic operations. + // To mutate: + // - Lock metadataMu and use atomic operations to update because we might + // have atomic readers that don't hold the lock. metadataMu sync.Mutex ino inodeNumber // immutable mode uint32 // type is immutable, perms are mutable @@ -616,7 +622,7 @@ type dentry struct { ctime int64 btime int64 // File size, protected by both metadataMu and dataMu (i.e. both must be - // locked to mutate it). + // locked to mutate it; locking either is sufficient to access it). size uint64 // nlink counts the number of hard links to this dentry. It's updated and @@ -904,14 +910,14 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin // Prepare for truncate. if stat.Mask&linux.STATX_SIZE != 0 { - switch d.mode & linux.S_IFMT { - case linux.S_IFREG: + switch mode.FileType() { + case linux.ModeRegular: if !setLocalMtime { // Truncate updates mtime. setLocalMtime = true stat.Mtime.Nsec = linux.UTIME_NOW } - case linux.S_IFDIR: + case linux.ModeDirectory: return syserror.EISDIR default: return syserror.EINVAL @@ -994,7 +1000,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin func (d *dentry) updateFileSizeLocked(newSize uint64) { d.dataMu.Lock() oldSize := d.size - d.size = newSize + atomic.StoreUint64(&d.size, newSize) // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings // below. This allows concurrent calls to Read/Translate/etc. These // functions synchronize with truncation by refusing to use cache @@ -1340,8 +1346,8 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name // Extended attributes in the user.* namespace are only supported for regular // files and directories. func (d *dentry) userXattrSupported() bool { - filetype := linux.S_IFMT & atomic.LoadUint32(&d.mode) - return filetype == linux.S_IFREG || filetype == linux.S_IFDIR + filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType() + return filetype == linux.ModeRegular || filetype == linux.ModeDirectory } // Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir(). diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index a2f02d9c7..f10350c97 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -89,7 +89,9 @@ func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint if err != nil { return err } - d.size = size + d.dataMu.Lock() + atomic.StoreUint64(&d.size, size) + d.dataMu.Unlock() if !d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } -- cgit v1.2.3 From 8494a0325d2ca7ba7bc9c17e55546f1665500660 Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Tue, 14 Jul 2020 15:42:47 -0700 Subject: Include context in kernfs.Inode.Stat method To implement stat(2) in FUSE, we have to embed credentials and pid in request header. The information should be extracted from the context passed to VFS layer. Therefore `Stat()` signature in `kernfs.Inode` interface should include context as first argument. Some other fs implementations need to be modified as well, such as devpts, host, pipefs, and proc. Fixes #3235 --- pkg/sentry/fsimpl/devpts/master.go | 6 +++--- pkg/sentry/fsimpl/devpts/slave.go | 6 +++--- pkg/sentry/fsimpl/host/host.go | 6 +++--- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 8 ++++---- pkg/sentry/fsimpl/kernfs/filesystem.go | 2 +- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 2 +- pkg/sentry/fsimpl/pipefs/pipefs.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 4 ++-- pkg/sentry/fsimpl/proc/task.go | 4 ++-- pkg/sentry/fsimpl/proc/task_files.go | 2 +- pkg/sentry/fsimpl/proc/tasks.go | 4 ++-- 13 files changed, 25 insertions(+), 25 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 69879498a..1081fff52 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -67,8 +67,8 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf } // Stat implements kernfs.Inode.Stat. -func (mi *masterInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := mi.InodeAttrs.Stat(vfsfs, opts) +func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } @@ -186,7 +186,7 @@ func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatO // Stat implements vfs.FileDescriptionImpl.Stat. func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() - return mfd.inode.Stat(fs, opts) + return mfd.inode.Stat(ctx, fs, opts) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index cf1a0f0ac..2018b978a 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -73,8 +73,8 @@ func (si *slaveInode) Valid(context.Context) bool { } // Stat implements kernfs.Inode.Stat. -func (si *slaveInode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - statx, err := si.InodeAttrs.Stat(vfsfs, opts) +func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } @@ -183,7 +183,7 @@ func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOp // Stat implements vfs.FileDescriptionImpl.Stat. func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem() - return sfd.inode.Stat(fs, opts) + return sfd.inode.Stat(ctx, fs, opts) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1cd2982cb..1a88cb657 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -259,7 +259,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode. -func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { if opts.Mask&linux.STATX__RESERVED != 0 { return linux.Statx{}, syserror.EINVAL } @@ -534,8 +534,8 @@ func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // Stat implements vfs.FileDescriptionImpl. -func (f *fileDescription) Stat(_ context.Context, opts vfs.StatOptions) (linux.Statx, error) { - return f.inode.Stat(f.vfsfd.Mount().Filesystem(), opts) +func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts) } // Release implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index 6886b0876..c6c4472e7 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -127,7 +127,7 @@ func (fd *DynamicBytesFD) Release() {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(fs, opts) + return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index ca8b8c63b..3f0aea73a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -132,7 +132,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent opts := vfs.StatOptions{Mask: linux.STATX_INO} // Handle ".". if fd.off == 0 { - stat, err := fd.inode().Stat(fd.filesystem(), opts) + stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts) if err != nil { return err } @@ -152,7 +152,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent if fd.off == 1 { vfsd := fd.vfsfd.VirtualDentry().Dentry() parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode - stat, err := parentInode.Stat(fd.filesystem(), opts) + stat, err := parentInode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } @@ -176,7 +176,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { inode := it.Dentry.Impl().(*Dentry).inode - stat, err := inode.Stat(fd.filesystem(), opts) + stat, err := inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } @@ -226,7 +226,7 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() - return inode.Stat(fs, opts) + return inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 8939871c1..61a36cff9 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -684,7 +684,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } - return inode.Stat(fs.VFSFilesystem(), opts) + return inode.Stat(ctx, fs.VFSFilesystem(), opts) } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 4cb885d87..2ab3f1761 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -243,7 +243,7 @@ func (a *InodeAttrs) Mode() linux.FileMode { // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. -func (a *InodeAttrs) Stat(*vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { +func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK stat.DevMajor = a.devMajor diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 596de1edf..46f207664 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -346,7 +346,7 @@ type inodeMetadata interface { // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. - Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) + Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index dd7eaf4a8..811f80a5f 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -115,7 +115,7 @@ func (i *inode) Mode() linux.FileMode { } // Stat implements kernfs.Inode.Stat. -func (i *inode) Stat(vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { +func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index 36a89540c..dad4db1a7 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -165,8 +165,8 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v } // Stat implements kernfs.Inode. -func (i *subtasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.InodeAttrs.Stat(vsfs, opts) +func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go index 8bb2b0ce1..a5c7aa470 100644 --- a/pkg/sentry/fsimpl/proc/task.go +++ b/pkg/sentry/fsimpl/proc/task.go @@ -156,8 +156,8 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux. } // Stat implements kernfs.Inode. -func (i *taskOwnedInode) Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.Inode.Stat(fs, opts) +func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.Inode.Stat(ctx, fs, opts) if err != nil { return linux.Statx{}, err } diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 9af43b859..859b7d727 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -876,7 +876,7 @@ var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil) // Stat implements FileDescriptionImpl. func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem() - return fd.inode.Stat(vfs, opts) + return fd.inode.Stat(ctx, vfs, opts) } // SetStat implements FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go index 2f214d0c2..6d2b90a8b 100644 --- a/pkg/sentry/fsimpl/proc/tasks.go +++ b/pkg/sentry/fsimpl/proc/tasks.go @@ -206,8 +206,8 @@ func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs. return fd.VFSFileDescription(), nil } -func (i *tasksInode) Stat(vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { - stat, err := i.InodeAttrs.Stat(vsfs, opts) +func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } -- cgit v1.2.3 From 1b9965e06a966977a99569484da139d64d1db95e Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 14 Jul 2020 17:29:53 -0700 Subject: Update special file option name in comment. PiperOrigin-RevId: 321269281 --- pkg/sentry/fsimpl/gofer/special_file.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index c1e6b13e5..a7b53b2d2 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -28,9 +28,9 @@ import ( ) // specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device -// special files, and (when filesystemOptions.specialRegularFiles is in effect) -// regular files. specialFileFD differs from regularFileFD by using per-FD -// handles instead of shared per-dentry handles, and never buffering I/O. +// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is +// in effect) regular files. specialFileFD differs from regularFileFD by using +// per-FD handles instead of shared per-dentry handles, and never buffering I/O. type specialFileFD struct { fileDescription -- cgit v1.2.3 From db653bb34baeba24b8b658eb0985e4c5185344cb Mon Sep 17 00:00:00 2001 From: Ting-Yu Wang Date: Wed, 15 Jul 2020 15:04:10 -0700 Subject: fdbased: Vectorized write for packet; relax writev syscall filter. Now it calls pkt.Data.ToView() when writing the packet. This may require copying when the packet is large, which puts the worse case in an even worse situation. This sent out in a separate preparation change as it requires syscall filter changes. This change will be followed by the change for the adoption of the new PacketHeader API. PiperOrigin-RevId: 321447003 --- pkg/iovec/BUILD | 18 +++++ pkg/iovec/iovec.go | 75 +++++++++++++++++++ pkg/iovec/iovec_test.go | 121 +++++++++++++++++++++++++++++++ pkg/sentry/fs/host/BUILD | 1 + pkg/sentry/fs/host/socket_iovec.go | 7 +- pkg/sentry/fsimpl/host/BUILD | 1 + pkg/sentry/fsimpl/host/socket_iovec.go | 7 +- pkg/tcpip/link/fdbased/BUILD | 1 + pkg/tcpip/link/fdbased/endpoint.go | 69 ++++++------------ pkg/tcpip/link/rawfile/rawfile_unsafe.go | 32 +------- runsc/boot/filter/config.go | 11 +-- 11 files changed, 248 insertions(+), 95 deletions(-) create mode 100644 pkg/iovec/BUILD create mode 100644 pkg/iovec/iovec.go create mode 100644 pkg/iovec/iovec_test.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/iovec/BUILD b/pkg/iovec/BUILD new file mode 100644 index 000000000..eda82cfc1 --- /dev/null +++ b/pkg/iovec/BUILD @@ -0,0 +1,18 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "iovec", + srcs = ["iovec.go"], + visibility = ["//:sandbox"], + deps = ["//pkg/abi/linux"], +) + +go_test( + name = "iovec_test", + size = "small", + srcs = ["iovec_test.go"], + library = ":iovec", + deps = ["@org_golang_x_sys//unix:go_default_library"], +) diff --git a/pkg/iovec/iovec.go b/pkg/iovec/iovec.go new file mode 100644 index 000000000..dd70fe80f --- /dev/null +++ b/pkg/iovec/iovec.go @@ -0,0 +1,75 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +// Package iovec provides helpers to interact with vectorized I/O on host +// system. +package iovec + +import ( + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// MaxIovs is the maximum number of iovecs host platform can accept. +var MaxIovs = linux.UIO_MAXIOV + +// Builder is a builder for slice of syscall.Iovec. +type Builder struct { + iovec []syscall.Iovec + storage [8]syscall.Iovec + + // overflow tracks the last buffer when iovec length is at MaxIovs. + overflow []byte +} + +// Add adds buf to b preparing to be written. Zero-length buf won't be added. +func (b *Builder) Add(buf []byte) { + if len(buf) == 0 { + return + } + if b.iovec == nil { + b.iovec = b.storage[:0] + } + if len(b.iovec) >= MaxIovs { + b.addByAppend(buf) + return + } + b.iovec = append(b.iovec, syscall.Iovec{ + Base: &buf[0], + Len: uint64(len(buf)), + }) + // Keep the last buf if iovec is at max capacity. We will need to append to it + // for later bufs. + if len(b.iovec) == MaxIovs { + n := len(buf) + b.overflow = buf[:n:n] + } +} + +func (b *Builder) addByAppend(buf []byte) { + b.overflow = append(b.overflow, buf...) + b.iovec[len(b.iovec)-1] = syscall.Iovec{ + Base: &b.overflow[0], + Len: uint64(len(b.overflow)), + } +} + +// Build returns the final Iovec slice. The length of returned iovec will not +// excceed MaxIovs. +func (b *Builder) Build() []syscall.Iovec { + return b.iovec +} diff --git a/pkg/iovec/iovec_test.go b/pkg/iovec/iovec_test.go new file mode 100644 index 000000000..a3900c299 --- /dev/null +++ b/pkg/iovec/iovec_test.go @@ -0,0 +1,121 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build linux + +package iovec + +import ( + "bytes" + "fmt" + "syscall" + "testing" + "unsafe" + + "golang.org/x/sys/unix" +) + +func TestBuilderEmpty(t *testing.T) { + var builder Builder + iovecs := builder.Build() + if got, want := len(iovecs), 0; got != want { + t.Errorf("len(iovecs) = %d, want %d", got, want) + } +} + +func TestBuilderBuild(t *testing.T) { + a := []byte{1, 2} + b := []byte{3, 4, 5} + + var builder Builder + builder.Add(a) + builder.Add(b) + builder.Add(nil) // Nil slice won't be added. + builder.Add([]byte{}) // Empty slice won't be added. + iovecs := builder.Build() + + if got, want := len(iovecs), 2; got != want { + t.Fatalf("len(iovecs) = %d, want %d", got, want) + } + for i, data := range [][]byte{a, b} { + if got, want := *iovecs[i].Base, data[0]; got != want { + t.Fatalf("*iovecs[%d].Base = %d, want %d", i, got, want) + } + if got, want := iovecs[i].Len, uint64(len(data)); got != want { + t.Fatalf("iovecs[%d].Len = %d, want %d", i, got, want) + } + } +} + +func TestBuilderBuildMaxIov(t *testing.T) { + for _, test := range []struct { + numIov int + }{ + { + numIov: MaxIovs - 1, + }, + { + numIov: MaxIovs, + }, + { + numIov: MaxIovs + 1, + }, + { + numIov: MaxIovs + 10, + }, + } { + name := fmt.Sprintf("numIov=%v", test.numIov) + t.Run(name, func(t *testing.T) { + var data []byte + var builder Builder + for i := 0; i < test.numIov; i++ { + buf := []byte{byte(i)} + builder.Add(buf) + data = append(data, buf...) + } + iovec := builder.Build() + + // Check the expected length of iovec. + wantNum := test.numIov + if wantNum > MaxIovs { + wantNum = MaxIovs + } + if got, want := len(iovec), wantNum; got != want { + t.Errorf("len(iovec) = %d, want %d", got, want) + } + + // Test a real read-write. + var fds [2]int + if err := unix.Pipe(fds[:]); err != nil { + t.Fatalf("Pipe: %v", err) + } + defer syscall.Close(fds[0]) + defer syscall.Close(fds[1]) + + wrote, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fds[1]), uintptr(unsafe.Pointer(&iovec[0])), uintptr(len(iovec))) + if int(wrote) != len(data) || e != 0 { + t.Fatalf("writev: %v, %v; want %v, 0", wrote, e, len(data)) + } + + got := make([]byte, len(data)) + if n, err := syscall.Read(fds[0], got); n != len(got) || err != nil { + t.Fatalf("read: %v, %v; want %v, nil", n, err, len(got)) + } + + if !bytes.Equal(got, data) { + t.Errorf("read: got data %v, want %v", got, data) + } + }) + } +} diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD index aabce6cc9..d41d23a43 100644 --- a/pkg/sentry/fs/host/BUILD +++ b/pkg/sentry/fs/host/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/context", "//pkg/fd", "//pkg/fdnotifier", + "//pkg/iovec", "//pkg/log", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go index 5c18dbd5e..905afb50d 100644 --- a/pkg/sentry/fs/host/socket_iovec.go +++ b/pkg/sentry/fs/host/socket_iovec.go @@ -17,15 +17,12 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/syserror" ) // LINT.IfChange -// maxIovs is the maximum number of iovecs to pass to the host. -var maxIovs = linux.UIO_MAXIOV - // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { @@ -76,7 +73,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec } } - if iovsRequired > maxIovs { + if iovsRequired > iovec.MaxIovs { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index 44a09d87a..e86fbe2d5 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -22,6 +22,7 @@ go_library( "//pkg/context", "//pkg/fdnotifier", "//pkg/fspath", + "//pkg/iovec", "//pkg/log", "//pkg/refs", "//pkg/safemem", diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go index 584c247d2..fc0d5fd38 100644 --- a/pkg/sentry/fsimpl/host/socket_iovec.go +++ b/pkg/sentry/fsimpl/host/socket_iovec.go @@ -17,13 +17,10 @@ package host import ( "syscall" - "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/syserror" ) -// maxIovs is the maximum number of iovecs to pass to the host. -var maxIovs = linux.UIO_MAXIOV - // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { @@ -74,7 +71,7 @@ func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovec } } - if iovsRequired > maxIovs { + if iovsRequired > iovec.MaxIovs { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index aa6db9aea..507b44abc 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -15,6 +15,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/binary", + "//pkg/iovec", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index 32abe2a13..6aa1badc7 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -45,6 +45,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -406,6 +407,8 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne eth.Encode(ethHdr) } + var builder iovec.Builder + fd := e.fds[pkt.Hash%uint32(len(e.fds))] if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { vnetHdr := virtioNetHdr{} @@ -430,29 +433,25 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne } vnetHdrBuf := binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr) - return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView()) + builder.Add(vnetHdrBuf) } - if pkt.Data.Size() == 0 { - return rawfile.NonBlockingWrite(fd, pkt.Header.View()) - } - if pkt.Header.UsedLength() == 0 { - return rawfile.NonBlockingWrite(fd, pkt.Data.ToView()) + builder.Add(pkt.Header.View()) + for _, v := range pkt.Data.Views() { + builder.Add(v) } - return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil) + return rawfile.NonBlockingWriteIovec(fd, builder.Build()) } func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tcpip.Error) { // Send a batch of packets through batchFD. mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch)) for _, pkt := range batch { - var ethHdrBuf []byte - iovLen := 0 + var eth header.Ethernet if e.hdrSize > 0 { // Add ethernet header if needed. - ethHdrBuf = make([]byte, header.EthernetMinimumSize) - eth := header.Ethernet(ethHdrBuf) + eth = make(header.Ethernet, header.EthernetMinimumSize) ethHdr := &header.EthernetFields{ DstAddr: pkt.EgressRoute.RemoteLinkAddress, Type: pkt.NetworkProtocolNumber, @@ -465,12 +464,11 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc ethHdr.SrcAddr = e.addr } eth.Encode(ethHdr) - iovLen++ } - vnetHdr := virtioNetHdr{} var vnetHdrBuf []byte if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { + vnetHdr := virtioNetHdr{} if pkt.GSOOptions != nil { vnetHdr.hdrLen = uint16(pkt.Header.UsedLength()) if pkt.GSOOptions.NeedsCsum { @@ -491,45 +489,20 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc } } vnetHdrBuf = binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr) - iovLen++ } - iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views())) + var builder iovec.Builder + builder.Add(vnetHdrBuf) + builder.Add(eth) + builder.Add(pkt.Header.View()) + for _, v := range pkt.Data.Views() { + builder.Add(v) + } + iovecs := builder.Build() + var mmsgHdr rawfile.MMsgHdr mmsgHdr.Msg.Iov = &iovecs[0] - iovecIdx := 0 - if vnetHdrBuf != nil { - v := &iovecs[iovecIdx] - v.Base = &vnetHdrBuf[0] - v.Len = uint64(len(vnetHdrBuf)) - iovecIdx++ - } - if ethHdrBuf != nil { - v := &iovecs[iovecIdx] - v.Base = ðHdrBuf[0] - v.Len = uint64(len(ethHdrBuf)) - iovecIdx++ - } - pktSize := uint64(0) - // Encode L3 Header - v := &iovecs[iovecIdx] - hdr := &pkt.Header - hdrView := hdr.View() - v.Base = &hdrView[0] - v.Len = uint64(len(hdrView)) - pktSize += v.Len - iovecIdx++ - - // Now encode the Transport Payload. - pktViews := pkt.Data.Views() - for i := range pktViews { - vec := &iovecs[iovecIdx] - iovecIdx++ - vec.Base = &pktViews[i][0] - vec.Len = uint64(len(pktViews[i])) - pktSize += vec.Len - } - mmsgHdr.Msg.Iovlen = uint64(iovecIdx) + mmsgHdr.Msg.Iovlen = uint64(len(iovecs)) mmsgHdrs = append(mmsgHdrs, mmsgHdr) } diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 69de6eb3e..f4c32c2da 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -66,38 +66,14 @@ func NonBlockingWrite(fd int, buf []byte) *tcpip.Error { return nil } -// NonBlockingWrite3 writes up to three byte slices to a file descriptor in a -// single syscall. It fails if partial data is written. -func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error { - // If there is no second and third buffer, issue a regular write. - if len(b2) == 0 && len(b3) == 0 { - return NonBlockingWrite(fd, b1) - } - - // Build the iovec that represents them and issue a writev syscall. - iovec := [3]syscall.Iovec{ - { - Base: &b1[0], - Len: uint64(len(b1)), - }, - { - Base: &b2[0], - Len: uint64(len(b2)), - }, - } - iovecLen := uintptr(2) - - if len(b3) > 0 { - iovecLen++ - iovec[2].Base = &b3[0] - iovec[2].Len = uint64(len(b3)) - } - +// NonBlockingWriteIovec writes iovec to a file descriptor in a single syscall. +// It fails if partial data is written. +func NonBlockingWriteIovec(fd int, iovec []syscall.Iovec) *tcpip.Error { + iovecLen := uintptr(len(iovec)) _, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&iovec[0])), iovecLen) if e != 0 { return TranslateErrno(e) } - return nil } diff --git a/runsc/boot/filter/config.go b/runsc/boot/filter/config.go index 60e33425f..149eb0b1b 100644 --- a/runsc/boot/filter/config.go +++ b/runsc/boot/filter/config.go @@ -310,19 +310,12 @@ var allowedSyscalls = seccomp.SyscallRules{ }, }, syscall.SYS_WRITE: {}, - // The only user in rawfile.NonBlockingWrite3 always passes iovcnt with - // values 2 or 3. Three iovec-s are passed, when the PACKET_VNET_HDR - // option is enabled for a packet socket. + // For rawfile.NonBlockingWriteIovec. syscall.SYS_WRITEV: []seccomp.Rule{ { seccomp.AllowAny{}, seccomp.AllowAny{}, - seccomp.AllowValue(2), - }, - { - seccomp.AllowAny{}, - seccomp.AllowAny{}, - seccomp.AllowValue(3), + seccomp.GreaterThan(0), }, }, } -- cgit v1.2.3 From 39525d64cbd5901ea80651e323c8554af132f4dd Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 22 Jul 2020 15:40:44 -0700 Subject: Add O_APPEND support in vfs2 gofer. Helps in fixing open syscall tests: AppendConcurrentWrite and AppendOnly. We also now update the file size for seekable special files (regular files) which we were not doing earlier. Updates #2923 PiperOrigin-RevId: 322670843 --- pkg/sentry/fsimpl/gofer/regular_file.go | 41 +++++++++++++++++++----- pkg/sentry/fsimpl/gofer/special_file.go | 56 +++++++++++++++++++++++++++------ test/syscalls/linux/open.cc | 4 +-- 3 files changed, 83 insertions(+), 18 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index f10350c97..02317a133 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -155,26 +155,53 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP + } + + d := fd.dentry() + // If the fd was opened with O_APPEND, make sure the file size is updated. + // There is a possible race here if size is modified externally after + // metadata cache is updated. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } } + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + // Set offset to file size if the fd was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) + } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) + n, err := fd.pwriteLocked(ctx, src, offset, opts) + return n, offset + n, err +} +// Preconditions: fd.dentry().metatdataMu must be locked. +func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { d := fd.dentry() - d.metadataMu.Lock() - defer d.metadataMu.Unlock() if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking @@ -237,8 +264,8 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index a7b53b2d2..811528982 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -16,6 +16,7 @@ package gofer import ( "sync" + "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -144,7 +145,7 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't // hold here since specialFileFD doesn't client-cache data. Just buffer the // read instead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d := fd.dentry(); d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } buf := make([]byte, dst.NumBytes()) @@ -176,39 +177,76 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset, error. The final +// offset should be ignored by PWrite. +func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if fd.seekable && offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP + } + + d := fd.dentry() + // If the regular file fd was opened with O_APPEND, make sure the file size + // is updated. There is a possible race here if size is modified externally + // after metadata cache is updated. + if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { + if err := d.updateFromGetattr(ctx); err != nil { + return 0, offset, err + } } if fd.seekable { + // We need to hold the metadataMu *while* writing to a regular file. + d.metadataMu.Lock() + defer d.metadataMu.Unlock() + + // Set offset to file size if the regular file was opened with O_APPEND. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Holding d.metadataMu is sufficient for reading d.size. + offset = int64(d.size) + } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(limit) } // Do a buffered write. See rationale in PRead. - if d := fd.dentry(); d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchCMtime() } buf := make([]byte, src.NumBytes()) // Don't do partial writes if we get a partial read from src. if _, err := src.CopyIn(ctx, buf); err != nil { - return 0, err + return 0, offset, err } n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset)) if err == syserror.EAGAIN { err = syserror.ErrWouldBlock } - return int64(n), err + finalOff = offset + // Update file size for regular files. + if fd.seekable { + finalOff += int64(n) + // d.metadataMu is already locked at this point. + if uint64(finalOff) > d.size { + d.dataMu.Lock() + defer d.dataMu.Unlock() + atomic.StoreUint64(&d.size, uint64(finalOff)) + } + } + return int64(n), finalOff, err } // Write implements vfs.FileDescriptionImpl.Write. @@ -218,8 +256,8 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts } fd.mu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.mu.Unlock() return n, err } diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index bb7d108e8..bf350946b 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -235,7 +235,7 @@ TEST_F(OpenTest, AppendOnly) { ASSERT_NO_ERRNO_AND_VALUE(Open(test_file_name_, O_RDWR | O_APPEND)); EXPECT_THAT(lseek(fd2.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(0)); - // Then try to write to the first file and make sure the bytes are appended. + // Then try to write to the first fd and make sure the bytes are appended. EXPECT_THAT(WriteFd(fd1.get(), buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); @@ -247,7 +247,7 @@ TEST_F(OpenTest, AppendOnly) { EXPECT_THAT(lseek(fd1.get(), 0, SEEK_CUR), SyscallSucceedsWithValue(kBufSize * 2)); - // Then try to write to the second file and make sure the bytes are appended. + // Then try to write to the second fd and make sure the bytes are appended. EXPECT_THAT(WriteFd(fd2.get(), buf.data(), buf.size()), SyscallSucceedsWithValue(buf.size())); -- cgit v1.2.3 From 9654bf04acad7300e0bf964183f7860bd553f563 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 22 Jul 2020 15:44:08 -0700 Subject: [vfs2][tmpfs] Implement O_APPEND Updates #2923 PiperOrigin-RevId: 322671489 --- pkg/sentry/fsimpl/tmpfs/regular_file.go | 35 ++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index 1cdb46e6f..abbaa5d60 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -325,8 +325,15 @@ func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + n, _, err := fd.pwrite(ctx, src, offset, opts) + return n, err +} + +// pwrite returns the number of bytes written, final offset and error. The +// final offset should be ignored by PWrite. +func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since @@ -334,40 +341,44 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { - return 0, syserror.EOPNOTSUPP + return 0, offset, syserror.EOPNOTSUPP } srclen := src.NumBytes() if srclen == 0 { - return 0, nil + return 0, offset, nil } f := fd.inode().impl.(*regularFile) + f.inode.mu.Lock() + defer f.inode.mu.Unlock() + // If the file is opened with O_APPEND, update offset to file size. + if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { + // Locking f.inode.mu is sufficient for reading f.size. + offset = int64(f.size) + } if end := offset + srclen; end < offset { // Overflow. - return 0, syserror.EINVAL + return 0, offset, syserror.EINVAL } - var err error srclen, err = vfs.CheckLimit(ctx, offset, srclen) if err != nil { - return 0, err + return 0, offset, err } src = src.TakeFirst64(srclen) - f.inode.mu.Lock() rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) - fd.inode().touchCMtimeLocked() - f.inode.mu.Unlock() + f.inode.touchCMtimeLocked() putRegularFileReadWriter(rw) - return n, err + return n, n + offset, err } // Write implements vfs.FileDescriptionImpl.Write. func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() - n, err := fd.PWrite(ctx, src, fd.off, opts) - fd.off += n + n, off, err := fd.pwrite(ctx, src, fd.off, opts) + fd.off = off fd.offMu.Unlock() return n, err } -- cgit v1.2.3 From 4fbd0728ac5e0aa33a7f5e2c5189751b4baa94b5 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 23 Jul 2020 09:43:05 -0700 Subject: [vfs2][gofer] Fix update attributes race condition. We were getting the file attributes before locking the metadataMu which was causing stale updates to the file attributes. Fixes OpenTest_AppendConcurrentWrite. Updates #2923 PiperOrigin-RevId: 322804438 --- pkg/sentry/fsimpl/gofer/filesystem.go | 17 +++++++++++++---- pkg/sentry/fsimpl/gofer/gofer.go | 11 +++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index cd5f5049e..491495b16 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -150,11 +150,9 @@ afterSymlink: return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { - _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) - if err != nil { + if err := d.parent.updateFromGetattr(ctx); err != nil { return nil, err } - d.parent.updateFromP9Attrs(attrMask, &attr) } rp.Advance() return d.parent, nil @@ -209,17 +207,28 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil // Preconditions: As for getChildLocked. !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { + if child != nil { + // Need to lock child.metadataMu because we might be updating child + // metadata. We need to hold the lock *before* getting metadata from the + // server and release it after updating local metadata. + child.metadataMu.Lock() + } qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { + if child != nil { + child.metadataMu.Unlock() + } return nil, err } if child != nil { if !file.isNil() && inoFromPath(qid.Path) == child.ino { // The file at this path hasn't changed. Just update cached metadata. file.close(ctx) - child.updateFromP9Attrs(attrMask, &attr) + child.updateFromP9AttrsLocked(attrMask, &attr) + child.metadataMu.Unlock() return child, nil } + child.metadataMu.Unlock() if file.isNil() && child.isSynthetic() { // We have a synthetic file, and no remote file has arisen to // replace it. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index b74d489a0..ba7f650b3 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -785,8 +785,8 @@ func (d *dentry) cachedMetadataAuthoritative() bool { // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. -func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { - d.metadataMu.Lock() +// Precondition: d.metadataMu must be locked. +func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if mask.Mode { if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want { d.metadataMu.Unlock() @@ -822,7 +822,6 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { if mask.Size { d.updateFileSizeLocked(attr.Size) } - d.metadataMu.Unlock() } // Preconditions: !d.isSynthetic() @@ -834,6 +833,10 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { file p9file handleMuRLocked bool ) + // d.metadataMu must be locked *before* we getAttr so that we do not end up + // updating stale attributes in d.updateFromP9AttrsLocked(). + d.metadataMu.Lock() + defer d.metadataMu.Unlock() d.handleMu.RLock() if !d.handle.file.isNil() { file = d.handle.file @@ -849,7 +852,7 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { if err != nil { return err } - d.updateFromP9Attrs(attrMask, &attr) + d.updateFromP9AttrsLocked(attrMask, &attr) return nil } -- cgit v1.2.3 From 3e0e3b9b11fee58835a0a492d66e72b354459e27 Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Tue, 9 Jun 2020 12:35:39 -0400 Subject: Added stub FUSE filesystem Allow FUSE filesystems to be mounted using libfuse. The appropriate flags and mount options are parsed and understood by fusefs. --- pkg/sentry/fsimpl/fuse/BUILD | 5 + pkg/sentry/fsimpl/fuse/dev.go | 28 +++++ pkg/sentry/fsimpl/fuse/fusefs.go | 201 ++++++++++++++++++++++++++++++++ pkg/sentry/syscalls/linux/vfs2/mount.go | 9 +- pkg/sentry/vfs/options.go | 11 ++ runsc/boot/vfs.go | 5 + test/syscalls/linux/dev.cc | 12 ++ test/syscalls/linux/mount.cc | 28 +++++ 8 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 pkg/sentry/fsimpl/fuse/fusefs.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 3e00c2abb..1fb9c438b 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -6,15 +6,20 @@ go_library( name = "fuse", srcs = [ "dev.go", + "fusefs.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", + "//pkg/log", "//pkg/sentry/fsimpl/devtmpfs", + "//pkg/sentry/fsimpl/kernfs", "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", + "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index dc33268af..c9e12a94f 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -51,6 +51,9 @@ type DeviceFD struct { vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD + // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. + mounted bool + // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue // and deque requests, control synchronization and establish communication // between the FUSE kernel module and the /dev/fuse character device. @@ -61,26 +64,51 @@ func (fd *DeviceFD) Release() {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. + if !fd.mounted { + return 0, syserror.EPERM + } + return 0, syserror.ENOSYS } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go new file mode 100644 index 000000000..6d40e2d87 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -0,0 +1,201 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fuse implements fusefs. +package fuse + +import ( + "strconv" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "fuse" + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +type filesystemOptions struct { + // userID specifies the numeric uid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + userID uint32 + + // groupID specifies the numeric gid of the mount owner. + // This option should not be specified by the filesystem owner. + // It is set by libfuse (or, if libfuse is not used, must be set + // by the filesystem itself). For more information, see man page + // for fuse(8) + groupID uint32 + + // rootMode specifies the the file mode of the filesystem's root. + rootMode linux.FileMode +} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + kernfs.Filesystem + devMinor uint32 + + // fuseFD is the FD returned when opening /dev/fuse. It is used for communication + // between the FUSE server daemon and the sentry fusefs. + fuseFD *DeviceFD + + // opts is the options the fusefs is initialized with. + opts filesystemOptions +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + devMinor, err := vfsObj.GetAnonBlockDevMinor() + if err != nil { + return nil, nil, err + } + + var fsopts filesystemOptions + mopts := vfs.GenericParseMountOptions(opts.Data) + deviceDescriptorStr, ok := mopts["fd"] + if !ok { + log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name()) + return nil, nil, syserror.EINVAL + } + delete(mopts, "fd") + + deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) + if err != nil { + return nil, nil, err + } + + kernelTask := kernel.TaskFromContext(ctx) + if kernelTask == nil { + log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) + return nil, nil, syserror.EINVAL + } + fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) + + // Parse and set all the other supported FUSE mount options. + // TODO: Expand the supported mount options. + if userIDStr, ok := mopts["user_id"]; ok { + delete(mopts, "user_id") + userID, err := strconv.ParseUint(userIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.userID = uint32(userID) + } + + if groupIDStr, ok := mopts["group_id"]; ok { + delete(mopts, "group_id") + groupID, err := strconv.ParseUint(groupIDStr, 10, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr) + return nil, nil, syserror.EINVAL + } + fsopts.groupID = uint32(groupID) + } + + rootMode := linux.FileMode(0777) + modeStr, ok := mopts["rootmode"] + if ok { + delete(mopts, "rootmode") + mode, err := strconv.ParseUint(modeStr, 8, 32) + if err != nil { + log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) + return nil, nil, syserror.EINVAL + } + rootMode = linux.FileMode(mode & 07777) + } + fsopts.rootMode = rootMode + + // Check for unparsed options. + if len(mopts) != 0 { + log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) + return nil, nil, syserror.EINVAL + } + + // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to + // mount a FUSE filesystem. + fuseFD := fuseFd.Impl().(*DeviceFD) + fuseFD.mounted = true + + fs := &filesystem{ + devMinor: devMinor, + fuseFD: fuseFD, + opts: fsopts, + } + + fs.VFSFilesystem().Init(vfsObj, &fsType, fs) + + // TODO: dispatch a FUSE_INIT request to the FUSE daemon server before + // returning. Mount will not block on this dispatched request. + + // root is the fusefs root directory. + defaultFusefsDirMode := linux.FileMode(0755) + root := fs.newInode(creds, defaultFusefsDirMode) + + return fs.VFSFilesystem(), root.VFSDentry(), nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release() { + fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) + fs.Filesystem.Release() +} + +// Inode implements kernfs.Inode. +type Inode struct { + kernfs.InodeAttrs + kernfs.InodeNoDynamicLookup + kernfs.InodeNotSymlink + kernfs.InodeDirectoryNoNewChildren + kernfs.OrderedChildren + + locks vfs.FileLocks + + dentry kernfs.Dentry +} + +func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { + i := &Inode{} + i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) + i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) + i.dentry.Init(i) + + return &i.dentry +} + +// Open implements kernfs.Inode.Open. +func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) + if err != nil { + return nil, err + } + return fd.VFSFileDescription(), nil +} diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index adeaa39cc..ea337de7c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -77,8 +77,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // Silently allow MS_NOSUID, since we don't implement set-id bits // anyway. - const unsupportedFlags = linux.MS_NODEV | - linux.MS_NODIRATIME | linux.MS_STRICTATIME + const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME // Linux just allows passing any flags to mount(2) - it won't fail when // unknown or unsupported flags are passed. Since we don't implement @@ -94,6 +93,12 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { opts.Flags.NoExec = true } + if flags&linux.MS_NODEV == linux.MS_NODEV { + opts.Flags.NoDev = true + } + if flags&linux.MS_NOSUID == linux.MS_NOSUID { + opts.Flags.NoSUID = true + } if flags&linux.MS_RDONLY == linux.MS_RDONLY { opts.ReadOnly = true } diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index f223aeda8..d37208a1f 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -79,6 +79,17 @@ type MountFlags struct { // NoATime is equivalent to MS_NOATIME and indicates that the // filesystem should not update access time in-place. NoATime bool + + // NoDev is equivalent to MS_NODEV and indicates that the + // filesystem should not allow access to devices (special files). + // TODO(gVisor.dev/issue/3186): respect this flag in non FUSE + // filesystems. + NoDev bool + + // NoSUID is equivalent to MS_NOSUID and indicates that the + // filesystem should not honor set-user-ID and set-group-ID bits or + // file capabilities when executing programs. + NoSUID bool } // MountOptions contains options to VirtualFilesystem.MountAt(). diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 56f4ba15d..9a1ed8e9e 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -77,6 +77,10 @@ func registerFilesystems(k *kernel.Kernel) error { AllowUserMount: true, AllowUserList: true, }) + vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) // Setup files in devtmpfs. if err := memdev.Register(vfsObj); err != nil { @@ -119,6 +123,7 @@ func registerFilesystems(k *kernel.Kernel) error { return fmt.Errorf("creating fusedev devtmpfs files: %w", err) } } + return nil } diff --git a/test/syscalls/linux/dev.cc b/test/syscalls/linux/dev.cc index 6fa16208e..69e8b76d3 100644 --- a/test/syscalls/linux/dev.cc +++ b/test/syscalls/linux/dev.cc @@ -161,6 +161,18 @@ TEST(DevTest, OpenDevFuse) { ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); } +TEST(DevTest, ReadDevFuseWithoutMount) { + // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new + // device registration is complete. + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_RDONLY)); + + std::vector buf(1); + EXPECT_THAT(ReadFd(fd.get(), buf.data(), sizeof(buf)), SyscallFailsWithErrno(EPERM)); +} + } // namespace } // namespace testing diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index a3e9745cf..7664fa73d 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -321,6 +321,34 @@ TEST(MountTest, RenameRemoveMountPoint) { ASSERT_THAT(rmdir(dir.path().c_str()), SyscallFailsWithErrno(EBUSY)); } +TEST(MountTest, MountFuseFilesystemNoDevice) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new + // device registration is complete. + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("", dir.path().c_str(), "fuse", 0, ""), + SyscallFailsWithErrno(EINVAL)); +} + +TEST(MountTest, MountFuseFilesystem) { + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + // Note(gvisor.dev/issue/3076) This won't work in the sentry until the new + // device registration is complete. + SKIP_IF(IsRunningWithVFS1() || IsRunningOnGvisor()); + + const FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open("/dev/fuse", O_WRONLY)); + std::string mopts = "fd=" + std::to_string(fd.get()); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto const mount = + ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), "fuse", 0, mopts, 0)); +} + } // namespace } // namespace testing -- cgit v1.2.3 From 2f78c487f17e12dfee08214311c500073cb03fde Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Mon, 20 Jul 2020 16:24:27 -0400 Subject: Use mode supplied by the mount options --- pkg/sentry/fsimpl/fuse/BUILD | 1 - pkg/sentry/fsimpl/fuse/fusefs.go | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 1fb9c438b..737007748 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -20,6 +20,5 @@ go_library( "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", - "@org_golang_x_sys//unix:go_default_library", ], ) diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 6d40e2d87..f7775fb9b 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -1,4 +1,4 @@ -// Copyright 2019 The gVisor Authors. +// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -130,7 +130,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) return nil, nil, syserror.EINVAL } - rootMode = linux.FileMode(mode & 07777) + rootMode = linux.FileMode(mode) } fsopts.rootMode = rootMode @@ -157,8 +157,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // returning. Mount will not block on this dispatched request. // root is the fusefs root directory. - defaultFusefsDirMode := linux.FileMode(0755) - root := fs.newInode(creds, defaultFusefsDirMode) + root := fs.newInode(creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } -- cgit v1.2.3 From bac4ebaabfac95f7b467b9c777a890fcf31a42ae Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 23 Jul 2020 18:43:20 -0700 Subject: FileDescription is hard to spell. Fix typos. PiperOrigin-RevId: 322913282 --- pkg/sentry/fsimpl/devpts/slave.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 6 +++--- pkg/sentry/fsimpl/overlay/non_directory.go | 2 +- pkg/sentry/fsimpl/proc/subtasks.go | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index 2018b978a..a91cae3ef 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -132,7 +132,7 @@ func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequen return sfd.inode.t.ld.outputQueueWrite(ctx, src) } -// Ioctl implements vfs.FileDescripionImpl.Ioctl. +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 3f0aea73a..1d37ccb98 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -112,7 +112,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts) } -// Release implements vfs.FileDecriptionImpl.Release. +// Release implements vfs.FileDescriptionImpl.Release. func (fd *GenericDirectoryFD) Release() {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { @@ -123,7 +123,7 @@ func (fd *GenericDirectoryFD) inode() Inode { return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode } -// IterDirents implements vfs.FileDecriptionImpl.IterDirents. IterDirents holds +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds // o.mu when calling cb. func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() @@ -198,7 +198,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent return err } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index a3c1f7a8d..6c8445ea1 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -176,7 +176,7 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) return nil } -// StatFS implements vfs.FileDesciptionImpl.StatFS. +// StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(ctx) } diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go index dad4db1a7..79c2725f3 100644 --- a/pkg/sentry/fsimpl/proc/subtasks.go +++ b/pkg/sentry/fsimpl/proc/subtasks.go @@ -128,7 +128,7 @@ func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallbac return fd.GenericDirectoryFD.IterDirents(ctx, cb) } -// Seek implements vfs.FileDecriptionImpl.Seek. +// Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return 0, syserror.ENOENT -- cgit v1.2.3 From d9a3f5d0c7d675b3cb4519eccca341bac33456af Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Thu, 23 Jul 2020 18:46:10 -0700 Subject: Add permission checks to vfs2 truncate. - Check write permission on truncate(2). Unlike ftruncate(2), truncate(2) fails if the user does not have write permissions on the file. - For gofers under InteropModeShared, check file type before making a truncate request. We should fail early and avoid making an rpc when possible. Furthermore, depending on the remote host's failure may give us unexpected behavior--if the host converts the truncate request to an ftruncate syscall on an open fd, we will get EINVAL instead of EISDIR. Updates #2923. PiperOrigin-RevId: 322913569 --- pkg/sentry/fsimpl/gofer/filesystem.go | 2 +- pkg/sentry/fsimpl/gofer/gofer.go | 18 +++++++++++++++--- pkg/sentry/fsimpl/host/host.go | 7 +++++-- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 2 +- pkg/sentry/fsimpl/overlay/filesystem.go | 2 +- pkg/sentry/fsimpl/overlay/non_directory.go | 2 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 7 ++++--- pkg/sentry/syscalls/linux/vfs2/setstat.go | 5 +++++ pkg/sentry/vfs/options.go | 6 ++++++ pkg/sentry/vfs/permissions.go | 8 +++++++- 11 files changed, 47 insertions(+), 14 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 491495b16..00e3c99cd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1334,7 +1334,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMuRUnlockAndCheckCaching(&ds) return err } - if err := d.setStat(ctx, rp.Credentials(), &opts.Stat, rp.Mount()); err != nil { + if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { fs.renameMuRUnlockAndCheckCaching(&ds) return err } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index ba7f650b3..e20de84b5 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -888,7 +888,8 @@ func (d *dentry) statTo(stat *linux.Statx) { stat.DevMinor = d.fs.devMinor } -func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mnt *vfs.Mount) error { +func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -896,7 +897,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { @@ -937,6 +938,17 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } if !d.isSynthetic() { if stat.Mask != 0 { + if stat.Mask&linux.STATX_SIZE != 0 { + // Check whether to allow a truncate request to be made. + switch d.mode & linux.S_IFMT { + case linux.S_IFREG: + // Allow. + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -1498,7 +1510,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { - if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, fd.vfsfd.Mount()); err != nil { + if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index 1a88cb657..c894f2ca0 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -373,7 +373,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) { // SetStat implements kernfs.Inode. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { - s := opts.Stat + s := &opts.Stat m := s.Mask if m == 0 { @@ -386,7 +386,7 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre if err := syscall.Fstat(i.hostFD, &hostStat); err != nil { return err } - if err := vfs.CheckSetStat(ctx, creds, &s, linux.FileMode(hostStat.Mode&linux.PermissionsMask), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil { return err } @@ -396,6 +396,9 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } } if m&linux.STATX_SIZE != 0 { + if hostStat.Mode&linux.S_IFMT != linux.S_IFREG { + return syserror.EINVAL + } if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 2ab3f1761..579e627f0 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -267,7 +267,7 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 { return syserror.EPERM } - if err := vfs.CheckSetStat(ctx, creds, &opts.Stat, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index ff82e1f20..6b705e955 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -1104,7 +1104,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } mnt := rp.Mount() diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index 6c8445ea1..c0749e711 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -151,7 +151,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() mode := linux.FileMode(atomic.LoadUint32(&d.mode)) - if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts.Stat, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } mnt := fd.vfsfd.Mount() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index a0f20c2d4..ef210a69b 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -649,7 +649,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() return err } - if err := d.inode.setStat(ctx, rp.Credentials(), &opts.Stat); err != nil { + if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil { fs.mu.RUnlock() return err } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index d7f4f0779..2545d88e9 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -452,7 +452,8 @@ func (i *inode) statTo(stat *linux.Statx) { } } -func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { +func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { + stat := &opts.Stat if stat.Mask == 0 { return nil } @@ -460,7 +461,7 @@ func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linu return syserror.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) - if err := vfs.CheckSetStat(ctx, creds, stat, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { + if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() @@ -695,7 +696,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) d := fd.dentry() - if err := d.inode.setStat(ctx, creds, &opts.Stat); err != nil { + if err := d.inode.setStat(ctx, creds, &opts); err != nil { return err } diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 09ecfed26..6daedd173 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -178,6 +178,7 @@ func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc Mask: linux.STATX_SIZE, Size: uint64(length), }, + NeedWritePerm: true, }) return 0, nil, handleSetSizeError(t, err) } @@ -197,6 +198,10 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys } defer file.DecRef() + if !file.IsWritable() { + return 0, nil, syserror.EINVAL + } + err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index d37208a1f..dfc8573fd 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -164,6 +164,12 @@ type SetStatOptions struct { // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask // instead). Stat linux.Statx + + // NeedWritePerm indicates that write permission on the file is needed for + // this operation. This is needed for truncate(2) (note that ftruncate(2) + // does not require the same check--instead, it checks that the fd is + // writable). + NeedWritePerm bool } // BoundEndpointOptions contains options to VirtualFilesystem.BoundEndpointAt() diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index 9cb050597..33389c1df 100644 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go @@ -183,7 +183,8 @@ func MayWriteFileWithOpenFlags(flags uint32) bool { // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). -func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { +func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { + stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { @@ -215,6 +216,11 @@ func CheckSetStat(ctx context.Context, creds *auth.Credentials, stat *linux.Stat return syserror.EPERM } } + if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { + return err + } + } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || -- cgit v1.2.3 From f347a578b79c96c13ed492b2cf9aec1cb3e60f3f Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 27 Jul 2020 11:57:11 -0700 Subject: Move platform.File in memmap The subsequent systrap changes will need to import memmap from the platform package. PiperOrigin-RevId: 323409486 --- pkg/sentry/fs/fsutil/BUILD | 7 +--- pkg/sentry/fs/fsutil/dirty_set.go | 7 ++-- pkg/sentry/fs/fsutil/file_range_set.go | 15 ++++---- pkg/sentry/fs/fsutil/frame_ref_set.go | 10 ++--- pkg/sentry/fs/fsutil/host_file_mapper.go | 5 +-- pkg/sentry/fs/fsutil/host_mappable.go | 19 +++++---- pkg/sentry/fs/fsutil/inode_cached.go | 25 ++++++------ pkg/sentry/fsimpl/gofer/regular_file.go | 27 +++++++------ pkg/sentry/fsimpl/host/BUILD | 1 - pkg/sentry/fsimpl/host/mmap.go | 21 +++++----- pkg/sentry/kernel/shm/BUILD | 1 - pkg/sentry/kernel/shm/shm.go | 3 +- pkg/sentry/kernel/timekeeper.go | 4 +- pkg/sentry/kernel/vdso.go | 6 +-- pkg/sentry/memmap/BUILD | 14 ++++++- pkg/sentry/memmap/memmap.go | 60 +++++++++++++++++++++++++---- pkg/sentry/mm/BUILD | 4 +- pkg/sentry/mm/aio_context.go | 3 +- pkg/sentry/mm/mm.go | 10 ++--- pkg/sentry/mm/pma.go | 25 ++++++------ pkg/sentry/mm/special_mappable.go | 7 ++-- pkg/sentry/pgalloc/BUILD | 10 ++--- pkg/sentry/pgalloc/pgalloc.go | 66 ++++++++++++++++---------------- pkg/sentry/platform/BUILD | 20 +--------- pkg/sentry/platform/kvm/BUILD | 1 + pkg/sentry/platform/kvm/address_space.go | 3 +- pkg/sentry/platform/platform.go | 50 +----------------------- pkg/sentry/platform/ptrace/BUILD | 1 + pkg/sentry/platform/ptrace/subprocess.go | 3 +- 29 files changed, 205 insertions(+), 223 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD index 789369220..5fb419bcd 100644 --- a/pkg/sentry/fs/fsutil/BUILD +++ b/pkg/sentry/fs/fsutil/BUILD @@ -8,7 +8,6 @@ go_template_instance( out = "dirty_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "Dirty", @@ -25,14 +24,14 @@ go_template_instance( name = "frame_ref_set_impl", out = "frame_ref_set_impl.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "fsutil", prefix = "FrameRef", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "uint64", "Functions": "FrameRefSetFunctions", }, @@ -43,7 +42,6 @@ go_template_instance( out = "file_range_set_impl.go", imports = { "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", }, package = "fsutil", prefix = "FileRange", @@ -86,7 +84,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/usage", "//pkg/state", diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go index c6cd45087..2c9446c1d 100644 --- a/pkg/sentry/fs/fsutil/dirty_set.go +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -159,7 +158,7 @@ func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. -func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { @@ -194,7 +193,7 @@ func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. -func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { dseg := dirty.FirstSegment() for dseg.Ok() { if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { @@ -210,7 +209,7 @@ func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max } // Preconditions: mr must be page-aligned. -func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { wbr := cseg.Range().Intersect(mr) if max < wbr.Start { diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go index 5643cdac9..bbafebf03 100644 --- a/pkg/sentry/fs/fsutil/file_range_set.go +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -23,13 +23,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/usermem" ) // FileRangeSet maps offsets into a memmap.Mappable to offsets into a -// platform.File. It is used to implement Mappables that store data in +// memmap.File. It is used to implement Mappables that store data in // sparsely-allocated memory. // // type FileRangeSet @@ -65,20 +64,20 @@ func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, spli } // FileRange returns the FileRange mapped by seg. -func (seg FileRangeIterator) FileRange() platform.FileRange { +func (seg FileRangeIterator) FileRange() memmap.FileRange { return seg.FileRangeOf(seg.Range()) } // FileRangeOf returns the FileRange mapped by mr. // // Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. -func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange { frstart := seg.Value() + (mr.Start - seg.Start()) - return platform.FileRange{frstart, frstart + mr.Length()} + return memmap.FileRange{frstart, frstart + mr.Length()} } // Fill attempts to ensure that all memmap.Mappable offsets in required are -// mapped to a platform.File offset, by allocating from mf with the given +// mapped to a memmap.File offset, by allocating from mf with the given // memory usage kind and invoking readAt to store data into memory. (If readAt // returns a successful partial read, Fill will call it repeatedly until all // bytes have been read.) EOF is handled consistently with the requirements of @@ -141,7 +140,7 @@ func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.Map } // Drop removes segments for memmap.Mappable offsets in mr, freeing the -// corresponding platform.FileRanges. +// corresponding memmap.FileRanges. // // Preconditions: mr must be page-aligned. func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { @@ -154,7 +153,7 @@ func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) { } // DropAll removes all segments in mr, freeing the corresponding -// platform.FileRanges. +// memmap.FileRanges. func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) { for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { mf.DecRef(seg.FileRange()) diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index dd6f5aba6..a808894df 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -17,7 +17,7 @@ package fsutil import ( "math" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" ) @@ -39,7 +39,7 @@ func (FrameRefSetFunctions) ClearValue(val *uint64) { } // Merge implements segment.Functions.Merge. -func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { +func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) { if val1 != val2 { return 0, false } @@ -47,13 +47,13 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. } // Split implements segment.Functions.Split. -func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { +func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } // IncRefAndAccount adds a reference on the range fr. All newly inserted segments // are accounted as host page cache memory mappings. -func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) { seg, gap := refs.Find(fr.Start) for { switch { @@ -74,7 +74,7 @@ func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { // DecRefAndAccount removes a reference on the range fr and untracks segments // that are removed from memory accounting. -func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { +func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) { seg := refs.FindSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go index e82afd112..ef0113b52 100644 --- a/pkg/sentry/fs/fsutil/host_file_mapper.go +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -126,7 +125,7 @@ func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. -func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { +func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() @@ -146,7 +145,7 @@ func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) } // Preconditions: f.mapsMu must be locked. -func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { +func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := syscall.PROT_READ if write { prot |= syscall.PROT_WRITE diff --git a/pkg/sentry/fs/fsutil/host_mappable.go b/pkg/sentry/fs/fsutil/host_mappable.go index 78fec553e..c15d8a946 100644 --- a/pkg/sentry/fs/fsutil/host_mappable.go +++ b/pkg/sentry/fs/fsutil/host_mappable.go @@ -21,18 +21,17 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// HostMappable implements memmap.Mappable and platform.File over a +// HostMappable implements memmap.Mappable and memmap.File over a // CachedFileObject. // // Lock order (compare the lock order model in mm/mm.go): // truncateMu ("fs locks") // mu ("memmap.Mappable locks not taken by Translate") -// ("platform.File locks") +// ("memmap.File locks") // backingFile ("CachedFileObject locks") // // +stateify savable @@ -124,24 +123,24 @@ func (h *HostMappable) NotifyChangeFD() error { return nil } -// MapInternal implements platform.File.MapInternal. -func (h *HostMappable) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (h *HostMappable) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return h.hostFileMapper.MapInternal(fr, h.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (h *HostMappable) FD() int { return h.backingFile.FD() } -// IncRef implements platform.File.IncRef. -func (h *HostMappable) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (h *HostMappable) IncRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.IncRefOn(mr) } -// DecRef implements platform.File.DecRef. -func (h *HostMappable) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (h *HostMappable) DecRef(fr memmap.FileRange) { mr := memmap.MappableRange{Start: fr.Start, End: fr.End} h.hostFileMapper.DecRefOn(mr) } diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go index 800c8b4e1..fe8b0b6ac 100644 --- a/pkg/sentry/fs/fsutil/inode_cached.go +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -26,7 +26,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -934,7 +933,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. c.mapsMu.Lock() @@ -999,10 +998,10 @@ func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.Evictable } } -// IncRef implements platform.File.IncRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// IncRef implements memmap.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { +func (c *CachingInodeOperations) IncRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg, gap := c.refs.Find(fr.Start) @@ -1024,10 +1023,10 @@ func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { } } -// DecRef implements platform.File.DecRef. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// DecRef implements memmap.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. -func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { +func (c *CachingInodeOperations) DecRef(fr memmap.FileRange) { // Hot path. Avoid defers. c.dataMu.Lock() seg := c.refs.FindSegment(fr.Start) @@ -1046,15 +1045,15 @@ func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { c.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. This is used when we +// MapInternal implements memmap.File.MapInternal. This is used when we // directly map an underlying host fd and CachingInodeOperations is used as the -// platform.File during translation. -func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// memmap.File during translation. +func (c *CachingInodeOperations) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) } -// FD implements platform.File.FD. This is used when we directly map an -// underlying host fd and CachingInodeOperations is used as the platform.File +// FD implements memmap.File.FD. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the memmap.File // during translation. func (c *CachingInodeOperations) FD() int { return c.backingFile.FD() diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 02317a133..09f142cfc 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -29,7 +29,6 @@ import ( "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" @@ -221,12 +220,12 @@ func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequenc return 0, syserror.EINVAL } mr := memmap.MappableRange{pgstart, pgend} - var freed []platform.FileRange + var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, platform.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() @@ -821,7 +820,7 @@ func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (d *dentry) InvalidateUnsavable(ctx context.Context) error { - // Whether we have a host fd (and consequently what platform.File is + // Whether we have a host fd (and consequently what memmap.File is // mapped) can change across save/restore, so invalidate all translations // unconditionally. d.mapsMu.Lock() @@ -869,8 +868,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { } } -// dentryPlatformFile implements platform.File. It exists solely because dentry -// cannot implement both vfs.DentryImpl.IncRef and platform.File.IncRef. +// dentryPlatformFile implements memmap.File. It exists solely because dentry +// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file // is available (i.e. dentry.handle.fd >= 0), and that FD is used for @@ -878,7 +877,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { type dentryPlatformFile struct { *dentry - // fdRefs counts references on platform.File offsets. fdRefs is protected + // fdRefs counts references on memmap.File offsets. fdRefs is protected // by dentry.dataMu. fdRefs fsutil.FrameRefSet @@ -890,29 +889,29 @@ type dentryPlatformFile struct { hostFileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. -func (d *dentryPlatformFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } -// DecRef implements platform.File.DecRef. -func (d *dentryPlatformFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } -// MapInternal implements platform.File.MapInternal. -func (d *dentryPlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) d.handleMu.RUnlock() return bs, err } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() fd := d.handle.fd diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD index e86fbe2d5..bd701bbc7 100644 --- a/pkg/sentry/fsimpl/host/BUILD +++ b/pkg/sentry/fsimpl/host/BUILD @@ -34,7 +34,6 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", - "//pkg/sentry/platform", "//pkg/sentry/socket/control", "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go index 8545a82f0..65d3af38c 100644 --- a/pkg/sentry/fsimpl/host/mmap.go +++ b/pkg/sentry/fsimpl/host/mmap.go @@ -19,13 +19,12 @@ import ( "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) -// inodePlatformFile implements platform.File. It exists solely because inode -// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef. +// inodePlatformFile implements memmap.File. It exists solely because inode +// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef. // // inodePlatformFile should only be used if inode.canMap is true. type inodePlatformFile struct { @@ -34,7 +33,7 @@ type inodePlatformFile struct { // fdRefsMu protects fdRefs. fdRefsMu sync.Mutex - // fdRefs counts references on platform.File offsets. It is used solely for + // fdRefs counts references on memmap.File offsets. It is used solely for // memory accounting. fdRefs fsutil.FrameRefSet @@ -45,32 +44,32 @@ type inodePlatformFile struct { fileMapperInitOnce sync.Once } -// IncRef implements platform.File.IncRef. +// IncRef implements memmap.File.IncRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) IncRef(fr platform.FileRange) { +func (i *inodePlatformFile) IncRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.IncRefAndAccount(fr) i.fdRefsMu.Unlock() } -// DecRef implements platform.File.DecRef. +// DecRef implements memmap.File.DecRef. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) DecRef(fr platform.FileRange) { +func (i *inodePlatformFile) DecRef(fr memmap.FileRange) { i.fdRefsMu.Lock() i.fdRefs.DecRefAndAccount(fr) i.fdRefsMu.Unlock() } -// MapInternal implements platform.File.MapInternal. +// MapInternal implements memmap.File.MapInternal. // // Precondition: i.inode.canMap must be true. -func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { return i.fileMapper.MapInternal(fr, i.hostFD, at.Write) } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (i *inodePlatformFile) FD() int { return i.hostFD } diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index bfd779837..c211fc8d0 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -20,7 +20,6 @@ go_library( "//pkg/sentry/kernel/time", "//pkg/sentry/memmap", "//pkg/sentry/pgalloc", - "//pkg/sentry/platform", "//pkg/sentry/usage", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index f66cfcc7f..55b4c2cdb 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -45,7 +45,6 @@ import ( ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -370,7 +369,7 @@ type Shm struct { // fr is the offset into mfp.MemoryFile() that backs this contents of this // segment. Immutable. - fr platform.FileRange + fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` diff --git a/pkg/sentry/kernel/timekeeper.go b/pkg/sentry/kernel/timekeeper.go index 5f3908d8b..7c4fefb16 100644 --- a/pkg/sentry/kernel/timekeeper.go +++ b/pkg/sentry/kernel/timekeeper.go @@ -21,8 +21,8 @@ import ( "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" ) @@ -90,7 +90,7 @@ type Timekeeper struct { // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. -func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage platform.FileRange) (*Timekeeper, error) { +func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) (*Timekeeper, error) { return &Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), }, nil diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go index f1b3c212c..290c32466 100644 --- a/pkg/sentry/kernel/vdso.go +++ b/pkg/sentry/kernel/vdso.go @@ -19,8 +19,8 @@ import ( "gvisor.dev/gvisor/pkg/binary" "gvisor.dev/gvisor/pkg/safemem" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/usermem" ) @@ -58,7 +58,7 @@ type vdsoParams struct { type VDSOParamPage struct { // The parameter page is fr, allocated from mfp.MemoryFile(). mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange // seq is the current sequence count written to the page. // @@ -81,7 +81,7 @@ type VDSOParamPage struct { // * VDSOParamPage must be the only writer to fr. // // * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block. -func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *VDSOParamPage { +func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage { return &VDSOParamPage{mfp: mfp, fr: fr} } diff --git a/pkg/sentry/memmap/BUILD b/pkg/sentry/memmap/BUILD index a98b66de1..2c95669cd 100644 --- a/pkg/sentry/memmap/BUILD +++ b/pkg/sentry/memmap/BUILD @@ -28,9 +28,21 @@ go_template_instance( }, ) +go_template_instance( + name = "file_range", + out = "file_range.go", + package = "memmap", + prefix = "File", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + go_library( name = "memmap", srcs = [ + "file_range.go", "mappable_range.go", "mapping_set.go", "mapping_set_impl.go", @@ -40,7 +52,7 @@ go_library( deps = [ "//pkg/context", "//pkg/log", - "//pkg/sentry/platform", + "//pkg/safemem", "//pkg/syserror", "//pkg/usermem", ], diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c6db9fc8f..c188f6c29 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -19,12 +19,12 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/usermem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 -// offsets to (platform.File, uint64 File offset) pairs. +// offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // @@ -74,7 +74,7 @@ type Mappable interface { // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one - // reference is held on all pages in a platform.File that may be the result + // reference is held on all pages in a File that may be the result // of a valid Translation. // // Preconditions: required.Length() > 0. optional.IsSupersetOf(required). @@ -100,7 +100,7 @@ type Translation struct { Source MappableRange // File is the mapped file. - File platform.File + File File // Offset is the offset into File at which this Translation begins. Offset uint64 @@ -110,9 +110,9 @@ type Translation struct { Perms usermem.AccessType } -// FileRange returns the platform.FileRange represented by t. -func (t Translation) FileRange() platform.FileRange { - return platform.FileRange{t.Offset, t.Offset + t.Source.Length()} +// FileRange returns the FileRange represented by t. +func (t Translation) FileRange() FileRange { + return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all @@ -361,3 +361,49 @@ type MMapOpts struct { // TODO(jamieliu): Replace entirely with MappingIdentity? Hint string } + +// File represents a host file that may be mapped into an platform.AddressSpace. +type File interface { + // All pages in a File are reference-counted. + + // IncRef increments the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. (The File + // interface does not provide a way to acquire an initial reference; + // implementors may define mechanisms for doing so.) + IncRef(fr FileRange) + + // DecRef decrements the reference count on all pages in fr. + // + // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > + // 0. At least one reference must be held on all pages in fr. + DecRef(fr FileRange) + + // MapInternal returns a mapping of the given file offsets in the invoking + // process' address space for reading and writing. + // + // Note that fr.Start and fr.End need not be page-aligned. + // + // Preconditions: fr.Length() > 0. At least one reference must be held on + // all pages in fr. + // + // Postconditions: The returned mapping is valid as long as at least one + // reference is held on the mapped pages. + MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) + + // FD returns the file descriptor represented by the File. + // + // The only permitted operation on the returned file descriptor is to map + // pages from it consistent with the requirements of AddressSpace.MapFile. + FD() int +} + +// FileRange represents a range of uint64 offsets into a File. +// +// type FileRange + +// String implements fmt.Stringer.String. +func (fr FileRange) String() string { + return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) +} diff --git a/pkg/sentry/mm/BUILD b/pkg/sentry/mm/BUILD index a036ce53c..f9d0837a1 100644 --- a/pkg/sentry/mm/BUILD +++ b/pkg/sentry/mm/BUILD @@ -7,14 +7,14 @@ go_template_instance( name = "file_refcount_set", out = "file_refcount_set.go", imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "mm", prefix = "fileRefcount", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "int32", "Functions": "fileRefcountSetFunctions", }, diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 379148903..1999ec706 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -20,7 +20,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -243,7 +242,7 @@ type aioMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange } var aioRingBufferSize = uint64(usermem.Addr(linux.AIORingSize).MustRoundUp()) diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go index 6db7c3d40..3e85964e4 100644 --- a/pkg/sentry/mm/mm.go +++ b/pkg/sentry/mm/mm.go @@ -25,7 +25,7 @@ // Locks taken by memmap.Mappable.Translate // mm.privateRefs.mu // platform.AddressSpace locks -// platform.File locks +// memmap.File locks // mm.aioManager.mu // mm.AIOContext.mu // @@ -396,7 +396,7 @@ type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. - file platform.File `state:"nosave"` + file memmap.File `state:"nosave"` // off is the offset into file at which this pma begins. // @@ -436,7 +436,7 @@ type pma struct { private bool // If internalMappings is not empty, it is the cached return value of - // file.MapInternal for the platform.FileRange mapped by this pma. + // file.MapInternal for the memmap.FileRange mapped by this pma. internalMappings safemem.BlockSeq `state:"nosave"` } @@ -469,10 +469,10 @@ func (fileRefcountSetFunctions) MaxKey() uint64 { func (fileRefcountSetFunctions) ClearValue(_ *int32) { } -func (fileRefcountSetFunctions) Merge(_ platform.FileRange, rc1 int32, _ platform.FileRange, rc2 int32) (int32, bool) { +func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } -func (fileRefcountSetFunctions) Split(_ platform.FileRange, rc int32, _ uint64) (int32, int32) { +func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc } diff --git a/pkg/sentry/mm/pma.go b/pkg/sentry/mm/pma.go index 62e4c20af..930ec895f 100644 --- a/pkg/sentry/mm/pma.go +++ b/pkg/sentry/mm/pma.go @@ -21,7 +21,6 @@ import ( "gvisor.dev/gvisor/pkg/safecopy" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -604,7 +603,7 @@ func (mm *MemoryManager) invalidateLocked(ar usermem.AddrRange, invalidatePrivat } } -// Pin returns the platform.File ranges currently mapped by addresses in ar in +// Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of @@ -674,15 +673,15 @@ type PinnedRange struct { Source usermem.AddrRange // File is the mapped file. - File platform.File + File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } -// FileRange returns the platform.File offsets mapped by pr. -func (pr PinnedRange) FileRange() platform.FileRange { - return platform.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} +// FileRange returns the memmap.File offsets mapped by pr. +func (pr PinnedRange) FileRange() memmap.FileRange { + return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. @@ -857,7 +856,7 @@ func (mm *MemoryManager) vecInternalMappingsLocked(ars usermem.AddrRangeSeq) saf } // incPrivateRef acquires a reference on private pages in fr. -func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { +func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) { mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() refSet := &mm.privateRefs.refs @@ -878,8 +877,8 @@ func (mm *MemoryManager) incPrivateRef(fr platform.FileRange) { } // decPrivateRef releases a reference on private pages in fr. -func (mm *MemoryManager) decPrivateRef(fr platform.FileRange) { - var freed []platform.FileRange +func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) { + var freed []memmap.FileRange mm.privateRefs.mu.Lock() refSet := &mm.privateRefs.refs @@ -951,7 +950,7 @@ func (pmaSetFunctions) Merge(ar1 usermem.AddrRange, pma1 pma, ar2 usermem.AddrRa // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the - // platform.File might not. + // memmap.File might not. pma1.internalMappings = safemem.BlockSeq{} return pma1, true } @@ -1012,12 +1011,12 @@ func (pseg pmaIterator) getInternalMappingsLocked() error { return nil } -func (pseg pmaIterator) fileRange() platform.FileRange { +func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } // Preconditions: pseg.Range().IsSupersetOf(ar). ar.Length != 0. -func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { +func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") @@ -1032,5 +1031,5 @@ func (pseg pmaIterator) fileRangeOf(ar usermem.AddrRange) platform.FileRange { pma := pseg.ValuePtr() pstart := pseg.Start() - return platform.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} + return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 9ad52082d..0e142fb11 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -19,7 +19,6 @@ import ( "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" - "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -35,7 +34,7 @@ type SpecialMappable struct { refs.AtomicRefCount mfp pgalloc.MemoryFileProvider - fr platform.FileRange + fr memmap.FileRange name string } @@ -44,7 +43,7 @@ type SpecialMappable struct { // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. -func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr platform.FileRange) *SpecialMappable { +func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} m.EnableLeakCheck("mm.SpecialMappable") return &m @@ -126,7 +125,7 @@ func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that // store the SpecialMappable's contents. -func (m *SpecialMappable) FileRange() platform.FileRange { +func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index e1fcb175f..7a3311a70 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -36,14 +36,14 @@ go_template_instance( "trackGaps": "1", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "usage", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "usageInfo", "Functions": "usageSetFunctions", }, @@ -56,14 +56,14 @@ go_template_instance( "minDegree": "10", }, imports = { - "platform": "gvisor.dev/gvisor/pkg/sentry/platform", + "memmap": "gvisor.dev/gvisor/pkg/sentry/memmap", }, package = "pgalloc", prefix = "reclaim", template = "//pkg/segment:generic_set", types = { "Key": "uint64", - "Range": "platform.FileRange", + "Range": "memmap.FileRange", "Value": "reclaimSetValue", "Functions": "reclaimSetFunctions", }, @@ -89,7 +89,7 @@ go_library( "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/hostmm", - "//pkg/sentry/platform", + "//pkg/sentry/memmap", "//pkg/sentry/usage", "//pkg/state", "//pkg/state/wire", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index afab97c0a..3243d7214 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -33,14 +33,14 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostmm" - "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) -// MemoryFile is a platform.File whose pages may be allocated to arbitrary +// MemoryFile is a memmap.File whose pages may be allocated to arbitrary // users. type MemoryFile struct { // opts holds options passed to NewMemoryFile. opts is immutable. @@ -372,7 +372,7 @@ func (f *MemoryFile) Destroy() { // to Allocate. // // Preconditions: length must be page-aligned and non-zero. -func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.FileRange, error) { +func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) { if length == 0 || length%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid allocation length: %#x", length)) } @@ -390,7 +390,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Find a range in the underlying file. fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment) if !ok { - return platform.FileRange{}, syserror.ENOMEM + return memmap.FileRange{}, syserror.ENOMEM } // Expand the file if needed. @@ -398,7 +398,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // Round the new file size up to be chunk-aligned. newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask if err := f.file.Truncate(newFileSize); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } f.fileSize = newFileSize f.mappingsMu.Lock() @@ -416,7 +416,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi bs[i] = 0 } }); err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } } if !f.usage.Add(fr, usageInfo{ @@ -439,7 +439,7 @@ func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (platform.Fi // space for mappings to be allocated downwards. // // Precondition: alignment must be a power of 2. -func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (platform.FileRange, bool) { +func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) { alignmentMask := alignment - 1 // Search for space in existing gaps, starting at the current end of the @@ -461,7 +461,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 break } if start := unalignedStart &^ alignmentMask; start >= gap.Start() { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } gap = gap.PrevLargeEnoughGap(length) @@ -475,7 +475,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 min = (min + alignmentMask) &^ alignmentMask if min+length < min { // Overflow: allocation would exceed the range of uint64. - return platform.FileRange{}, false + return memmap.FileRange{}, false } // Determine the minimum file size required to fit this allocation at its end. @@ -484,7 +484,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 if newFileSize <= fileSize { if fileSize != 0 { // Overflow: allocation would exceed the range of int64. - return platform.FileRange{}, false + return memmap.FileRange{}, false } newFileSize = chunkSize } @@ -496,7 +496,7 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 continue } if start := unalignedStart &^ alignmentMask; start >= min { - return platform.FileRange{start, start + length}, true + return memmap.FileRange{start, start + length}, true } } } @@ -508,22 +508,22 @@ func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint6 // by r.ReadToBlocks(), it returns that error. // // Preconditions: length > 0. length must be page-aligned. -func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (platform.FileRange, error) { +func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) { fr, err := f.Allocate(length, kind) if err != nil { - return platform.FileRange{}, err + return memmap.FileRange{}, err } dsts, err := f.MapInternal(fr, usermem.Write) if err != nil { f.DecRef(fr) - return platform.FileRange{}, err + return memmap.FileRange{}, err } n, err := safemem.ReadFullToBlocks(r, dsts) un := uint64(usermem.Addr(n).RoundDown()) if un < length { // Free unused memory and update fr to contain only the memory that is // still allocated. - f.DecRef(platform.FileRange{fr.Start + un, fr.End}) + f.DecRef(memmap.FileRange{fr.Start + un, fr.End}) fr.End = fr.Start + un } return fr, err @@ -540,7 +540,7 @@ const ( // will read zeroes. // // Preconditions: fr.Length() > 0. -func (f *MemoryFile) Decommit(fr platform.FileRange) error { +func (f *MemoryFile) Decommit(fr memmap.FileRange) error { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -560,7 +560,7 @@ func (f *MemoryFile) Decommit(fr platform.FileRange) error { return nil } -func (f *MemoryFile) markDecommitted(fr platform.FileRange) { +func (f *MemoryFile) markDecommitted(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() // Since we're changing the knownCommitted attribute, we need to merge @@ -581,8 +581,8 @@ func (f *MemoryFile) markDecommitted(fr platform.FileRange) { f.usage.MergeRange(fr) } -// IncRef implements platform.File.IncRef. -func (f *MemoryFile) IncRef(fr platform.FileRange) { +// IncRef implements memmap.File.IncRef. +func (f *MemoryFile) IncRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -600,8 +600,8 @@ func (f *MemoryFile) IncRef(fr platform.FileRange) { f.usage.MergeAdjacent(fr) } -// DecRef implements platform.File.DecRef. -func (f *MemoryFile) DecRef(fr platform.FileRange) { +// DecRef implements memmap.File.DecRef. +func (f *MemoryFile) DecRef(fr memmap.FileRange) { if !fr.WellFormed() || fr.Length() == 0 || fr.Start%usermem.PageSize != 0 || fr.End%usermem.PageSize != 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -637,8 +637,8 @@ func (f *MemoryFile) DecRef(fr platform.FileRange) { } } -// MapInternal implements platform.File.MapInternal. -func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { +// MapInternal implements memmap.File.MapInternal. +func (f *MemoryFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { if !fr.WellFormed() || fr.Length() == 0 { panic(fmt.Sprintf("invalid range: %v", fr)) } @@ -664,7 +664,7 @@ func (f *MemoryFile) MapInternal(fr platform.FileRange, at usermem.AccessType) ( // forEachMappingSlice invokes fn on a sequence of byte slices that // collectively map all bytes in fr. -func (f *MemoryFile) forEachMappingSlice(fr platform.FileRange, fn func([]byte)) error { +func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error { mappings := f.mappings.Load().([]uintptr) for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { chunk := int(chunkStart >> chunkShift) @@ -944,7 +944,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( continue case !populated && populatedRun: // Finish the run by changing this segment. - runRange := platform.FileRange{ + runRange := memmap.FileRange{ Start: r.Start + uint64(populatedRunStart*usermem.PageSize), End: r.Start + uint64(i*usermem.PageSize), } @@ -1009,7 +1009,7 @@ func (f *MemoryFile) File() *os.File { return f.file } -// FD implements platform.File.FD. +// FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } @@ -1090,13 +1090,13 @@ func (f *MemoryFile) runReclaim() { // // Note that there returned range will be removed from tracking. It // must be reclaimed (removed from f.usage) at this point. -func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { +func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { f.mu.Lock() defer f.mu.Unlock() for { for { if f.destroyed { - return platform.FileRange{}, false + return memmap.FileRange{}, false } if f.reclaimable { break @@ -1120,7 +1120,7 @@ func (f *MemoryFile) findReclaimable() (platform.FileRange, bool) { } } -func (f *MemoryFile) markReclaimed(fr platform.FileRange) { +func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() seg := f.usage.FindSegment(fr.Start) @@ -1222,11 +1222,11 @@ func (usageSetFunctions) MaxKey() uint64 { func (usageSetFunctions) ClearValue(val *usageInfo) { } -func (usageSetFunctions) Merge(_ platform.FileRange, val1 usageInfo, _ platform.FileRange, val2 usageInfo) (usageInfo, bool) { +func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { return val1, val1 == val2 } -func (usageSetFunctions) Split(_ platform.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { +func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } @@ -1270,10 +1270,10 @@ func (reclaimSetFunctions) MaxKey() uint64 { func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { } -func (reclaimSetFunctions) Merge(_ platform.FileRange, _ reclaimSetValue, _ platform.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { +func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { return reclaimSetValue{}, true } -func (reclaimSetFunctions) Split(_ platform.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { +func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { return reclaimSetValue{}, reclaimSetValue{} } diff --git a/pkg/sentry/platform/BUILD b/pkg/sentry/platform/BUILD index 453241eca..209b28053 100644 --- a/pkg/sentry/platform/BUILD +++ b/pkg/sentry/platform/BUILD @@ -1,39 +1,21 @@ load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template_instance") package(licenses = ["notice"]) -go_template_instance( - name = "file_range", - out = "file_range.go", - package = "platform", - prefix = "File", - template = "//pkg/segment:generic_range", - types = { - "T": "uint64", - }, -) - go_library( name = "platform", srcs = [ "context.go", - "file_range.go", "mmap_min_addr.go", "platform.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/atomicbitops", "//pkg/context", - "//pkg/log", - "//pkg/safecopy", - "//pkg/safemem", "//pkg/seccomp", "//pkg/sentry/arch", - "//pkg/sentry/usage", - "//pkg/syserror", + "//pkg/sentry/memmap", "//pkg/usermem", ], ) diff --git a/pkg/sentry/platform/kvm/BUILD b/pkg/sentry/platform/kvm/BUILD index 10a10bfe2..b5d27a72a 100644 --- a/pkg/sentry/platform/kvm/BUILD +++ b/pkg/sentry/platform/kvm/BUILD @@ -47,6 +47,7 @@ go_library( "//pkg/safecopy", "//pkg/seccomp", "//pkg/sentry/arch", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sentry/platform/ring0", diff --git a/pkg/sentry/platform/kvm/address_space.go b/pkg/sentry/platform/kvm/address_space.go index faf1d5e1c..98a3e539d 100644 --- a/pkg/sentry/platform/kvm/address_space.go +++ b/pkg/sentry/platform/kvm/address_space.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/atomicbitops" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/platform/ring0/pagetables" "gvisor.dev/gvisor/pkg/sync" @@ -150,7 +151,7 @@ func (as *addressSpace) mapLocked(addr usermem.Addr, m hostMapEntry, at usermem. } // MapFile implements platform.AddressSpace.MapFile. -func (as *addressSpace) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (as *addressSpace) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { as.mu.Lock() defer as.mu.Unlock() diff --git a/pkg/sentry/platform/platform.go b/pkg/sentry/platform/platform.go index 171513f3f..4b13eec30 100644 --- a/pkg/sentry/platform/platform.go +++ b/pkg/sentry/platform/platform.go @@ -22,9 +22,9 @@ import ( "os" "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/seccomp" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) @@ -207,7 +207,7 @@ type AddressSpace interface { // Preconditions: addr and fr must be page-aligned. fr.Length() > 0. // at.Any() == true. At least one reference must be held on all pages in // fr, and must continue to be held as long as pages are mapped. - MapFile(addr usermem.Addr, f File, fr FileRange, at usermem.AccessType, precommit bool) error + MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error // Unmap unmaps the given range. // @@ -310,52 +310,6 @@ func (f SegmentationFault) Error() string { return fmt.Sprintf("segmentation fault at %#x", f.Addr) } -// File represents a host file that may be mapped into an AddressSpace. -type File interface { - // All pages in a File are reference-counted. - - // IncRef increments the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. (The File - // interface does not provide a way to acquire an initial reference; - // implementors may define mechanisms for doing so.) - IncRef(fr FileRange) - - // DecRef decrements the reference count on all pages in fr. - // - // Preconditions: fr.Start and fr.End must be page-aligned. fr.Length() > - // 0. At least one reference must be held on all pages in fr. - DecRef(fr FileRange) - - // MapInternal returns a mapping of the given file offsets in the invoking - // process' address space for reading and writing. - // - // Note that fr.Start and fr.End need not be page-aligned. - // - // Preconditions: fr.Length() > 0. At least one reference must be held on - // all pages in fr. - // - // Postconditions: The returned mapping is valid as long as at least one - // reference is held on the mapped pages. - MapInternal(fr FileRange, at usermem.AccessType) (safemem.BlockSeq, error) - - // FD returns the file descriptor represented by the File. - // - // The only permitted operation on the returned file descriptor is to map - // pages from it consistent with the requirements of AddressSpace.MapFile. - FD() int -} - -// FileRange represents a range of uint64 offsets into a File. -// -// type FileRange - -// String implements fmt.Stringer.String. -func (fr FileRange) String() string { - return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) -} - // Requirements is used to specify platform specific requirements. type Requirements struct { // RequiresCurrentPIDNS indicates that the sandbox has to be started in the diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD index 30402c2df..29fd23cc3 100644 --- a/pkg/sentry/platform/ptrace/BUILD +++ b/pkg/sentry/platform/ptrace/BUILD @@ -30,6 +30,7 @@ go_library( "//pkg/seccomp", "//pkg/sentry/arch", "//pkg/sentry/hostcpu", + "//pkg/sentry/memmap", "//pkg/sentry/platform", "//pkg/sentry/platform/interrupt", "//pkg/sync", diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go index 2389423b0..c990f3454 100644 --- a/pkg/sentry/platform/ptrace/subprocess.go +++ b/pkg/sentry/platform/ptrace/subprocess.go @@ -24,6 +24,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/procid" "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" @@ -616,7 +617,7 @@ func (s *subprocess) syscall(sysno uintptr, args ...arch.SyscallArgument) (uintp } // MapFile implements platform.AddressSpace.MapFile. -func (s *subprocess) MapFile(addr usermem.Addr, f platform.File, fr platform.FileRange, at usermem.AccessType, precommit bool) error { +func (s *subprocess) MapFile(addr usermem.Addr, f memmap.File, fr memmap.FileRange, at usermem.AccessType, precommit bool) error { var flags int if precommit { flags |= syscall.MAP_POPULATE -- cgit v1.2.3 From 112eb0c5b9e6d45b58470fb5536a9fd91fb4222b Mon Sep 17 00:00:00 2001 From: Ridwan Sharif Date: Mon, 27 Jul 2020 13:25:21 -0700 Subject: Add device implementation for /dev/fuse This PR adds the following: - [x] Marshall-able structs for fuse headers - [x] Data structures needed in /dev/fuse to communicate with the daemon server - [x] Implementation of the device interface - [x] Go unit tests This change adds the `/dev/fuse` implementation. `Connection` controls the communication between the server and the sentry. The FUSE server uses the `FileDescription` interface to interact with the Sentry. The Sentry implmenetation of fusefs, uses `Connection` and the Connection interface to interact with the Server. All communication messages are in the form of `go_marshal` backed structs defined in the ABI package. This change also adds some go unit tests that test (pretty basically) the interfaces and should be used as an example of an end to end FUSE operation. COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/3083 from ridwanmsharif:ridwanmsharif/fuse-device-impl 69aa2ce970004938fe9f918168dfe57636ab856e PiperOrigin-RevId: 323428180 --- pkg/abi/linux/BUILD | 1 + pkg/abi/linux/fuse.go | 143 ++++++++++++ pkg/sentry/fsimpl/fuse/BUILD | 41 +++- pkg/sentry/fsimpl/fuse/connection.go | 255 +++++++++++++++++++++ pkg/sentry/fsimpl/fuse/dev.go | 289 +++++++++++++++++++++-- pkg/sentry/fsimpl/fuse/dev_test.go | 429 +++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/fuse/fusefs.go | 52 +++-- pkg/sentry/fsimpl/fuse/register.go | 42 ++++ 8 files changed, 1223 insertions(+), 29 deletions(-) create mode 100644 pkg/abi/linux/fuse.go create mode 100644 pkg/sentry/fsimpl/fuse/connection.go create mode 100644 pkg/sentry/fsimpl/fuse/dev_test.go create mode 100644 pkg/sentry/fsimpl/fuse/register.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/BUILD b/pkg/abi/linux/BUILD index a4bb62013..05ca5342f 100644 --- a/pkg/abi/linux/BUILD +++ b/pkg/abi/linux/BUILD @@ -29,6 +29,7 @@ go_library( "file_amd64.go", "file_arm64.go", "fs.go", + "fuse.go", "futex.go", "inotify.go", "ioctl.go", diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go new file mode 100644 index 000000000..d3ebbccc4 --- /dev/null +++ b/pkg/abi/linux/fuse.go @@ -0,0 +1,143 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package linux + +// +marshal +type FUSEOpcode uint32 + +// +marshal +type FUSEOpID uint64 + +// Opcodes for FUSE operations. Analogous to the opcodes in include/linux/fuse.h. +const ( + FUSE_LOOKUP FUSEOpcode = 1 + FUSE_FORGET = 2 /* no reply */ + FUSE_GETATTR = 3 + FUSE_SETATTR = 4 + FUSE_READLINK = 5 + FUSE_SYMLINK = 6 + _ + FUSE_MKNOD = 8 + FUSE_MKDIR = 9 + FUSE_UNLINK = 10 + FUSE_RMDIR = 11 + FUSE_RENAME = 12 + FUSE_LINK = 13 + FUSE_OPEN = 14 + FUSE_READ = 15 + FUSE_WRITE = 16 + FUSE_STATFS = 17 + FUSE_RELEASE = 18 + _ + FUSE_FSYNC = 20 + FUSE_SETXATTR = 21 + FUSE_GETXATTR = 22 + FUSE_LISTXATTR = 23 + FUSE_REMOVEXATTR = 24 + FUSE_FLUSH = 25 + FUSE_INIT = 26 + FUSE_OPENDIR = 27 + FUSE_READDIR = 28 + FUSE_RELEASEDIR = 29 + FUSE_FSYNCDIR = 30 + FUSE_GETLK = 31 + FUSE_SETLK = 32 + FUSE_SETLKW = 33 + FUSE_ACCESS = 34 + FUSE_CREATE = 35 + FUSE_INTERRUPT = 36 + FUSE_BMAP = 37 + FUSE_DESTROY = 38 + FUSE_IOCTL = 39 + FUSE_POLL = 40 + FUSE_NOTIFY_REPLY = 41 + FUSE_BATCH_FORGET = 42 +) + +const ( + // FUSE_MIN_READ_BUFFER is the minimum size the read can be for any FUSE filesystem. + // This is the minimum size Linux supports. See linux.fuse.h. + FUSE_MIN_READ_BUFFER uint32 = 8192 +) + +// FUSEHeaderIn is the header read by the daemon with each request. +// +// +marshal +type FUSEHeaderIn struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Opcode specifies the kind of operation of the request. + Opcode FUSEOpcode + + // Unique specifies the unique identifier for this request. + Unique FUSEOpID + + // NodeID is the ID of the filesystem object being operated on. + NodeID uint64 + + // UID is the UID of the requesting process. + UID uint32 + + // GID is the GID of the requesting process. + GID uint32 + + // PID is the PID of the requesting process. + PID uint32 + + _ uint32 +} + +// FUSEHeaderOut is the header written by the daemon when it processes +// a request and wants to send a reply (almost all operations require a +// reply; if they do not, this will be explicitly documented). +// +// +marshal +type FUSEHeaderOut struct { + // Len specifies the total length of the data, including this header. + Len uint32 + + // Error specifies the error that occurred (0 if none). + Error int32 + + // Unique specifies the unique identifier of the corresponding request. + Unique FUSEOpID +} + +// FUSEWriteIn is the header written by a daemon when it makes a +// write request to the FUSE filesystem. +// +// +marshal +type FUSEWriteIn struct { + // Fh specifies the file handle that is being written to. + Fh uint64 + + // Offset is the offset of the write. + Offset uint64 + + // Size is the size of data being written. + Size uint32 + + // WriteFlags is the flags used during the write. + WriteFlags uint32 + + // LockOwner is the ID of the lock owner. + LockOwner uint64 + + // Flags is the flags for the request. + Flags uint32 + + _ uint32 +} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 737007748..67649e811 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -1,12 +1,28 @@ -load("//tools:defs.bzl", "go_library") +load("//tools:defs.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") licenses(["notice"]) +go_template_instance( + name = "request_list", + out = "request_list.go", + package = "fuse", + prefix = "request", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Request", + "Linker": "*Request", + }, +) + go_library( name = "fuse", srcs = [ + "connection.go", "dev.go", "fusefs.go", + "register.go", + "request_list.go", ], visibility = ["//pkg/sentry:internal"], deps = [ @@ -18,7 +34,30 @@ go_library( "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "dev_test", + size = "small", + srcs = ["dev_test.go"], + library = ":fuse", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/fsimpl/testutil", + "//pkg/sentry/fsimpl/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", + "//tools/go_marshal/marshal", ], ) diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go new file mode 100644 index 000000000..f330da0bd --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -0,0 +1,255 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "errors" + "fmt" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// MaxActiveRequestsDefault is the default setting controlling the upper bound +// on the number of active requests at any given time. +const MaxActiveRequestsDefault = 10000 + +var ( + // Ordinary requests have even IDs, while interrupts IDs are odd. + InitReqBit uint64 = 1 + ReqIDStep uint64 = 2 +) + +// Request represents a FUSE operation request that hasn't been sent to the +// server yet. +// +// +stateify savable +type Request struct { + requestEntry + + id linux.FUSEOpID + hdr *linux.FUSEHeaderIn + data []byte +} + +// Response represents an actual response from the server, including the +// response payload. +// +// +stateify savable +type Response struct { + opcode linux.FUSEOpcode + hdr linux.FUSEHeaderOut + data []byte +} + +// Connection is the struct by which the sentry communicates with the FUSE server daemon. +type Connection struct { + fd *DeviceFD + + // MaxWrite is the daemon's maximum size of a write buffer. + // This is negotiated during FUSE_INIT. + MaxWrite uint32 +} + +// NewFUSEConnection creates a FUSE connection to fd +func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { + // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to + // mount a FUSE filesystem. + fuseFD := fd.Impl().(*DeviceFD) + fuseFD.mounted = true + + // Create the writeBuf for the header to be stored in. + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + fuseFD.writeBuf = make([]byte, hdrLen) + fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse) + fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) + fuseFD.writeCursor = 0 + + return &Connection{ + fd: fuseFD, + }, nil +} + +// NewRequest creates a new request that can be sent to the FUSE server. +func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + + hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() + hdr := linux.FUSEHeaderIn{ + Len: uint32(hdrLen + payload.SizeBytes()), + Opcode: opcode, + Unique: conn.fd.nextOpID, + NodeID: ino, + UID: uint32(creds.EffectiveKUID), + GID: uint32(creds.EffectiveKGID), + PID: pid, + } + + buf := make([]byte, hdr.Len) + hdr.MarshalUnsafe(buf[:hdrLen]) + payload.MarshalUnsafe(buf[hdrLen:]) + + return &Request{ + id: hdr.Unique, + hdr: &hdr, + data: buf, + }, nil +} + +// Call makes a request to the server and blocks the invoking task until a +// server responds with a response. +// NOTE: If no task is provided then the Call will simply enqueue the request +// and return a nil response. No blocking will happen in this case. Instead, +// this is used to signify that the processing of this request will happen by +// the kernel.Task that writes the response. See FUSE_INIT for such an +// invocation. +func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { + fut, err := conn.callFuture(t, r) + if err != nil { + return nil, err + } + + return fut.resolve(t) +} + +// Error returns the error of the FUSE call. +func (r *Response) Error() error { + errno := r.hdr.Error + if errno >= 0 { + return nil + } + + sysErrNo := syscall.Errno(-errno) + return error(sysErrNo) +} + +// UnmarshalPayload unmarshals the response data into m. +func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { + hdrLen := r.hdr.SizeBytes() + haveDataLen := r.hdr.Len - uint32(hdrLen) + wantDataLen := uint32(m.SizeBytes()) + + if haveDataLen < wantDataLen { + return fmt.Errorf("payload too small. Minimum data lenth required: %d, but got data length %d", wantDataLen, haveDataLen) + } + + m.UnmarshalUnsafe(r.data[hdrLen:]) + return nil +} + +// callFuture makes a request to the server and returns a future response. +// Call resolve() when the response needs to be fulfilled. +func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.mu.Lock() + defer conn.fd.mu.Unlock() + + // Is the queue full? + // + // We must busy wait here until the request can be queued. We don't + // block on the fd.fullQueueCh with a lock - so after being signalled, + // before we acquire the lock, it is possible that a barging task enters + // and queues a request. As a result, upon acquiring the lock we must + // again check if the room is available. + // + // This can potentially starve a request forever but this can only happen + // if there are always too many ongoing requests all the time. The + // supported maxActiveRequests setting should be really high to avoid this. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + if t == nil { + // Since there is no task that is waiting. We must error out. + return nil, errors.New("FUSE request queue full") + } + + log.Infof("Blocking request %v from being queued. Too many active requests: %v", + r.id, conn.fd.numActiveRequests) + conn.fd.mu.Unlock() + err := t.Block(conn.fd.fullQueueCh) + conn.fd.mu.Lock() + if err != nil { + return nil, err + } + } + + return conn.callFutureLocked(t, r) +} + +// callFutureLocked makes a request to the server and returns a future response. +func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { + conn.fd.queue.PushBack(r) + conn.fd.numActiveRequests += 1 + fut := newFutureResponse(r.hdr.Opcode) + conn.fd.completions[r.id] = fut + + // Signal the readers that there is something to read. + conn.fd.waitQueue.Notify(waiter.EventIn) + + return fut, nil +} + +// futureResponse represents an in-flight request, that may or may not have +// completed yet. Convert it to a resolved Response by calling Resolve, but note +// that this may block. +// +// +stateify savable +type futureResponse struct { + opcode linux.FUSEOpcode + ch chan struct{} + hdr *linux.FUSEHeaderOut + data []byte +} + +// newFutureResponse creates a future response to a FUSE request. +func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse { + return &futureResponse{ + opcode: opcode, + ch: make(chan struct{}), + } +} + +// resolve blocks the task until the server responds to its corresponding request, +// then returns a resolved response. +func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) { + // If there is no Task associated with this request - then we don't try to resolve + // the response. Instead, the task writing the response (proxy to the server) will + // process the response on our behalf. + if t == nil { + log.Infof("fuse.Response.resolve: Not waiting on a response from server.") + return nil, nil + } + + if err := t.Block(f.ch); err != nil { + return nil, err + } + + return f.getResponse(), nil +} + +// getResponse creates a Response from the data the futureResponse has. +func (f *futureResponse) getResponse() *Response { + return &Response{ + opcode: f.opcode, + hdr: *f.hdr, + data: f.data, + } +} diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index c9e12a94f..f3443ac71 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -15,13 +15,17 @@ package fuse import ( + "syscall" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) const fuseDevMinor = 229 @@ -54,9 +58,43 @@ type DeviceFD struct { // mounted specifies whether a FUSE filesystem was mounted using the DeviceFD. mounted bool - // TODO(gvisor.dev/issue/2987): Add all the data structures needed to enqueue - // and deque requests, control synchronization and establish communication - // between the FUSE kernel module and the /dev/fuse character device. + // nextOpID is used to create new requests. + nextOpID linux.FUSEOpID + + // queue is the list of requests that need to be processed by the FUSE server. + queue requestList + + // numActiveRequests is the number of requests made by the Sentry that has + // yet to be responded to. + numActiveRequests uint64 + + // completions is used to map a request to its response. A Writer will use this + // to notify the caller of a completed response. + completions map[linux.FUSEOpID]*futureResponse + + writeCursor uint32 + + // writeBuf is the memory buffer used to copy in the FUSE out header from + // userspace. + writeBuf []byte + + // writeCursorFR current FR being copied from server. + writeCursorFR *futureResponse + + // mu protects all the queues, maps, buffers and cursors and nextOpID. + mu sync.Mutex + + // waitQueue is used to notify interested parties when the device becomes + // readable or writable. + waitQueue waiter.Queue + + // fullQueueCh is a channel used to synchronize the readers with the writers. + // Writers (inbound requests to the filesystem) block if there are too many + // unprocessed in-flight requests. + fullQueueCh chan struct{} + + // fs is the FUSE filesystem that this FD is being used for. + fs *filesystem } // Release implements vfs.FileDescriptionImpl.Release. @@ -79,7 +117,75 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R return 0, syserror.EPERM } - return 0, syserror.ENOSYS + // We require that any Read done on this filesystem have a sane minimum + // read buffer. It must have the capacity for the fixed parts of any request + // header (Linux uses the request header and the FUSEWriteIn header for this + // calculation) + the negotiated MaxWrite room for the data. + minBuffSize := linux.FUSE_MIN_READ_BUFFER + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + if minBuffSize < negotiatedMinBuffSize { + minBuffSize = negotiatedMinBuffSize + } + + // If the read buffer is too small, error out. + if dst.NumBytes() < int64(minBuffSize) { + return 0, syserror.EINVAL + } + + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.readLocked(ctx, dst, opts) +} + +// readLocked implements the reading of the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + if fd.queue.Empty() { + return 0, syserror.ErrWouldBlock + } + + var readCursor uint32 + var bytesRead int64 + for { + req := fd.queue.Front() + if dst.NumBytes() < int64(req.hdr.Len) { + // The request is too large. Cannot process it. All requests must be smaller than the + // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT + // handshake. + errno := -int32(syscall.EIO) + if req.hdr.Opcode == linux.FUSE_SETXATTR { + errno = -int32(syscall.E2BIG) + } + + // Return the error to the calling task. + if err := fd.sendError(ctx, errno, req); err != nil { + return 0, err + } + + // We're done with this request. + fd.queue.Remove(req) + + // Restart the read as this request was invalid. + log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.") + return fd.readLocked(ctx, dst, opts) + } + + n, err := dst.CopyOut(ctx, req.data[readCursor:]) + if err != nil { + return 0, err + } + readCursor += uint32(n) + bytesRead += int64(n) + + if readCursor >= req.hdr.Len { + // Fully done with this req, remove it from the queue. + fd.queue.Remove(req) + break + } + } + + return bytesRead, nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. @@ -94,12 +200,128 @@ func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset i // Write implements vfs.FileDescriptionImpl.Write. func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.mu.Lock() + defer fd.mu.Unlock() + return fd.writeLocked(ctx, src, opts) +} + +// writeLocked implements writing to the fuse device while locked with DeviceFD.mu. +func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if !fd.mounted { return 0, syserror.EPERM } - return 0, syserror.ENOSYS + var cn, n int64 + hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + + for src.NumBytes() > 0 { + if fd.writeCursorFR != nil { + // Already have common header, and we're now copying the payload. + wantBytes := fd.writeCursorFR.hdr.Len + + // Note that the FR data doesn't have the header. Copy it over if its necessary. + if fd.writeCursorFR.data == nil { + fd.writeCursorFR.data = make([]byte, wantBytes) + } + + bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == wantBytes { + // Done reading this full response. Clean up and unblock the + // initiator. + break + } + + // Check if we have more data in src. + continue + } + + // Assert that the header isn't read into the writeBuf yet. + if fd.writeCursor >= hdrLen { + return 0, syserror.EINVAL + } + + // We don't have the full common response header yet. + wantBytes := hdrLen - fd.writeCursor + bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes]) + if err != nil { + return 0, err + } + src = src.DropFirst(bytesCopied) + + cn = int64(bytesCopied) + n += cn + fd.writeCursor += uint32(cn) + if fd.writeCursor == hdrLen { + // Have full header in the writeBuf. Use it to fetch the actual futureResponse + // from the device's completions map. + var hdr linux.FUSEHeaderOut + hdr.UnmarshalBytes(fd.writeBuf) + + // We have the header now and so the writeBuf has served its purpose. + // We could reset it manually here but instead of doing that, at the + // end of the write, the writeCursor will be set to 0 thereby allowing + // the next request to overwrite whats in the buffer, + + fut, ok := fd.completions[hdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return 0, syserror.EINVAL + } + + delete(fd.completions, hdr.Unique) + + // Copy over the header into the future response. The rest of the payload + // will be copied over to the FR's data in the next iteration. + fut.hdr = &hdr + fd.writeCursorFR = fut + + // Next iteration will now try read the complete request, if src has + // any data remaining. Otherwise we're done. + } + } + + if fd.writeCursorFR != nil { + if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil { + return 0, err + } + + // Ready the device for the next request. + fd.writeCursorFR = nil + fd.writeCursor = 0 + } + + return n, nil +} + +// Readiness implements vfs.FileDescriptionImpl.Readiness. +func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + ready |= waiter.EventOut // FD is always writable + if !fd.queue.Empty() { + // Have reqs available, FD is readable. + ready |= waiter.EventIn + } + + return ready & mask +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.waitQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { + fd.waitQueue.EventUnregister(e) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -112,22 +334,61 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64 return 0, syserror.ENOSYS } -// Register registers the FUSE device with vfsObj. -func Register(vfsObj *vfs.VirtualFilesystem) error { - if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ - GroupName: "misc", - }); err != nil { +// sendResponse sends a response to the waiting task (if any). +func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { + // See if the running task need to perform some action before returning. + // Since we just finished writing the future, we can be sure that + // getResponse generates a populated response. + if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil { return err } + // Signal that the queue is no longer full. + select { + case fd.fullQueueCh <- struct{}{}: + default: + } + fd.numActiveRequests -= 1 + + // Signal the task waiting on a response. + close(fut.ch) return nil } -// CreateDevtmpfsFile creates a device special file in devtmpfs. -func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { - if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { +// sendError sends an error response to the waiting task (if any). +func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error { + // Return the error to the calling task. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + respHdr := linux.FUSEHeaderOut{ + Len: outHdrLen, + Error: errno, + Unique: req.hdr.Unique, + } + + fut, ok := fd.completions[respHdr.Unique] + if !ok { + // Server sent us a response for a request we never sent? + return syserror.EINVAL + } + delete(fd.completions, respHdr.Unique) + + fut.hdr = &respHdr + if err := fd.sendResponse(ctx, fut); err != nil { return err } return nil } + +// noReceiverAction has the calling kernel.Task do some action if its known that no +// receiver is going to be waiting on the future channel. This is to be used by: +// FUSE_INIT. +func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { + if r.opcode == linux.FUSE_INIT { + // TODO: process init response here. + // Maybe get the creds from the context? + // creds := auth.CredentialsFromContext(ctx) + } + + return nil +} diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go new file mode 100644 index 000000000..fcd77832a --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -0,0 +1,429 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "fmt" + "io" + "math/rand" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" + "gvisor.dev/gvisor/tools/go_marshal/marshal" +) + +// echoTestOpcode is the Opcode used during testing. The server used in tests +// will simply echo the payload back with the appropriate headers. +const echoTestOpcode linux.FUSEOpcode = 1000 + +type testPayload struct { + data uint32 +} + +// TestFUSECommunication tests that the communication layer between the Sentry and the +// FUSE server daemon works as expected. +func TestFUSECommunication(t *testing.T) { + s := setup(t) + defer s.Destroy() + + k := kernel.KernelFromContext(s.Ctx) + creds := auth.CredentialsFromContext(s.Ctx) + + // Create test cases with different number of concurrent clients and servers. + testCases := []struct { + Name string + NumClients int + NumServers int + MaxActiveRequests uint64 + }{ + { + Name: "SingleClientSingleServer", + NumClients: 1, + NumServers: 1, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "SingleClientMultipleServers", + NumClients: 1, + NumServers: 10, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "MultipleClientsSingleServer", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "MultipleClientsMultipleServers", + NumClients: 10, + NumServers: 10, + MaxActiveRequests: MaxActiveRequestsDefault, + }, + { + Name: "RequestCapacityFull", + NumClients: 10, + NumServers: 1, + MaxActiveRequests: 1, + }, + { + Name: "RequestCapacityContinuouslyFull", + NumClients: 100, + NumServers: 2, + MaxActiveRequests: 2, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Name, func(t *testing.T) { + conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests) + if err != nil { + t.Fatalf("newTestConnection: %v", err) + } + + clientsDone := make([]chan struct{}, testCase.NumClients) + serversDone := make([]chan struct{}, testCase.NumServers) + serversKill := make([]chan struct{}, testCase.NumServers) + + // FUSE clients. + for i := 0; i < testCase.NumClients; i++ { + clientsDone[i] = make(chan struct{}) + go func(i int) { + fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i]) + }(i) + } + + // FUSE servers. + for j := 0; j < testCase.NumServers; j++ { + serversDone[j] = make(chan struct{}) + serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block. + go func(j int) { + fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j]) + }(j) + } + + // Tear down. + // + // Make sure all the clients are done. + for i := 0; i < testCase.NumClients; i++ { + <-clientsDone[i] + } + + // Kill any server that is potentially waiting. + for j := 0; j < testCase.NumServers; j++ { + serversKill[j] <- struct{}{} + } + + // Make sure all the servers are done. + for j := 0; j < testCase.NumServers; j++ { + <-serversDone[j] + } + }) + } +} + +// CallTest makes a request to the server and blocks the invoking +// goroutine until a server responds with a response. Doesn't block +// a kernel.Task. Analogous to Connection.Call but used for testing. +func CallTest(conn *Connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { + conn.fd.mu.Lock() + + // Wait until we're certain that a new request can be processed. + for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests { + conn.fd.mu.Unlock() + select { + case <-conn.fd.fullQueueCh: + } + conn.fd.mu.Lock() + } + + fut, err := conn.callFutureLocked(t, r) // No task given. + conn.fd.mu.Unlock() + + if err != nil { + return nil, err + } + + // Resolve the response. + // + // Block without a task. + select { + case <-fut.ch: + } + + // A response is ready. Resolve and return it. + return fut.getResponse(), nil +} + +// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE +// device. However, it does so by - not blocking the task that is calling - and +// instead just waits on a channel. The behaviour is essentially the same as +// DeviceFD.Read except it guarantees that the task is not blocked. +func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) { + var err error + var n, total int64 + + dev := fd.Impl().(*DeviceFD) + + // Register for notifications. + w, ch := waiter.NewChannelEntry(nil) + dev.EventRegister(&w, waiter.EventIn) + for { + // Issue the request and break out if it completes with anything other than + // "would block". + n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{}) + total += n + if err != syserror.ErrWouldBlock { + break + } + + // Wait for a notification that we should retry. + // Emulate the blocking for when no requests are available + select { + case <-ch: + case <-killServer: + // Server killed by the main program. + return 0, true, nil + } + } + + dev.EventUnregister(&w) + return total, false, err +} + +// fuseClientRun emulates all the actions of a normal FUSE request. It creates +// a header, a payload, calls the server, waits for the response, and processes +// the response. +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *Connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { + defer func() { clientDone <- struct{}{} }() + + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + testObj := &testPayload{ + data: rand.Uint32(), + } + + req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj) + if err != nil { + t.Fatalf("NewRequest creation failed: %v", err) + } + + // Queue up a request. + // Analogous to Call except it doesn't block on the task. + resp, err := CallTest(conn, clientTask, req, pid) + if err != nil { + t.Fatalf("CallTaskNonBlock failed: %v", err) + } + + if err = resp.Error(); err != nil { + t.Fatalf("Server responded with an error: %v", err) + } + + var respTestPayload testPayload + if err := resp.UnmarshalPayload(&respTestPayload); err != nil { + t.Fatalf("Unmarshalling payload error: %v", err) + } + + if resp.hdr.Unique != req.hdr.Unique { + t.Fatalf("got response for another request. Expected response for req %v but got response for req %v", + req.hdr.Unique, resp.hdr.Unique) + } + + if respTestPayload.data != testObj.data { + t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data) + } + +} + +// fuseServerRun creates a task and emulates all the actions of a simple FUSE server +// that simply reads a request and echos the same struct back as a response using the +// appropriate headers. +func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) { + defer func() { serverDone <- struct{}{} }() + + // Create the tasks that the server will be using. + tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) + var readPayload testPayload + + serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root) + if err != nil { + t.Fatal(err) + } + + // Read the request. + for { + inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) + payloadLen := uint32(readPayload.SizeBytes()) + + // The raed buffer must meet some certain size criteria. + buffSize := inHdrLen + payloadLen + if buffSize < linux.FUSE_MIN_READ_BUFFER { + buffSize = linux.FUSE_MIN_READ_BUFFER + } + inBuf := make([]byte, buffSize) + inIOseq := usermem.BytesIOSequence(inBuf) + + n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer) + if err != nil { + t.Fatalf("Read failed :%v", err) + } + + // Server should shut down. No new requests are going to be made. + if serverKilled { + break + } + + if n <= 0 { + t.Fatalf("Read read no bytes") + } + + var readFUSEHeaderIn linux.FUSEHeaderIn + readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen]) + readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen]) + + if readFUSEHeaderIn.Opcode != echoTestOpcode { + t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload) + } + + // Write the response. + outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + outBuf := make([]byte, outHdrLen+payloadLen) + outHeader := linux.FUSEHeaderOut{ + Len: outHdrLen + payloadLen, + Error: 0, + Unique: readFUSEHeaderIn.Unique, + } + + // Echo the payload back. + outHeader.MarshalUnsafe(outBuf[:outHdrLen]) + readPayload.MarshalUnsafe(outBuf[outHdrLen:]) + outIOseq := usermem.BytesIOSequence(outBuf) + + n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{}) + if err != nil { + t.Fatalf("Write failed :%v", err) + } + } +} + +func setup(t *testing.T) *testutil.System { + k, err := testutil.Boot() + if err != nil { + t.Fatalf("Error creating kernel: %v", err) + } + + ctx := k.SupervisorContext() + creds := auth.CredentialsFromContext(ctx) + + k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + AllowUserMount: true, + }) + + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + if err != nil { + t.Fatalf("NewMountNamespace(): %v", err) + } + + return testutil.NewSystem(ctx, t, k.VFS(), mntns) +} + +// newTestConnection creates a fuse connection that the sentry can communicate with +// and the FD for the server to communicate with. +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*Connection, *vfs.FileDescription, error) { + vfsObj := &vfs.VirtualFilesystem{} + fuseDev := &DeviceFD{} + + if err := vfsObj.Init(); err != nil { + return nil, nil, err + } + + vd := vfsObj.NewAnonVirtualDentry("genCountFD") + defer vd.DecRef() + if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { + return nil, nil, err + } + + fsopts := filesystemOptions{ + maxActiveRequests: maxActiveRequests, + } + fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd) + if err != nil { + return nil, nil, err + } + + return fs.conn, &fuseDev.vfsfd, nil +} + +// SizeBytes implements marshal.Marshallable.SizeBytes. +func (t *testPayload) SizeBytes() int { + return 4 +} + +// MarshalBytes implements marshal.Marshallable.MarshalBytes. +func (t *testPayload) MarshalBytes(dst []byte) { + usermem.ByteOrder.PutUint32(dst[:4], t.data) +} + +// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. +func (t *testPayload) UnmarshalBytes(src []byte) { + *t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])} +} + +// Packed implements marshal.Marshallable.Packed. +func (t *testPayload) Packed() bool { + return true +} + +// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. +func (t *testPayload) MarshalUnsafe(dst []byte) { + t.MarshalBytes(dst) +} + +// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. +func (t *testPayload) UnmarshalUnsafe(src []byte) { + t.UnmarshalBytes(src) +} + +// CopyOutN implements marshal.Marshallable.CopyOutN. +func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) { + panic("not implemented") +} + +// CopyOut implements marshal.Marshallable.CopyOut. +func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// CopyIn implements marshal.Marshallable.CopyIn. +func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) { + panic("not implemented") +} + +// WriteTo implements io.WriterTo.WriteTo. +func (t *testPayload) WriteTo(w io.Writer) (int64, error) { + panic("not implemented") +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index f7775fb9b..911b6f7cb 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -51,6 +51,11 @@ type filesystemOptions struct { // rootMode specifies the the file mode of the filesystem's root. rootMode linux.FileMode + + // maxActiveRequests specifies the maximum number of active requests that can + // exist at any time. Any further requests will block when trying to + // Call the server. + maxActiveRequests uint64 } // filesystem implements vfs.FilesystemImpl. @@ -58,12 +63,12 @@ type filesystem struct { kernfs.Filesystem devMinor uint32 - // fuseFD is the FD returned when opening /dev/fuse. It is used for communication - // between the FUSE server daemon and the sentry fusefs. - fuseFD *DeviceFD + // conn is used for communication between the FUSE server + // daemon and the sentry fusefs. + conn *Connection // opts is the options the fusefs is initialized with. - opts filesystemOptions + opts *filesystemOptions } // Name implements vfs.FilesystemType.Name. @@ -100,7 +105,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor)) // Parse and set all the other supported FUSE mount options. - // TODO: Expand the supported mount options. + // TODO(gVisor.dev/issue/3229): Expand the supported mount options. if userIDStr, ok := mopts["user_id"]; ok { delete(mopts, "user_id") userID, err := strconv.ParseUint(userIDStr, 10, 32) @@ -134,21 +139,20 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } fsopts.rootMode = rootMode + // Set the maxInFlightRequests option. + fsopts.maxActiveRequests = MaxActiveRequestsDefault + // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts) return nil, nil, syserror.EINVAL } - // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to - // mount a FUSE filesystem. - fuseFD := fuseFd.Impl().(*DeviceFD) - fuseFD.mounted = true - - fs := &filesystem{ - devMinor: devMinor, - fuseFD: fuseFD, - opts: fsopts, + // Create a new FUSE filesystem. + fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd) + if err != nil { + log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) + return nil, nil, err } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) @@ -162,6 +166,26 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +// NewFUSEFilesystem creates a new FUSE filesystem. +func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) { + fs := &filesystem{ + devMinor: devMinor, + opts: opts, + } + + conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + if err != nil { + log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) + return nil, syserror.EINVAL + } + + fs.conn = conn + fuseFD := device.Impl().(*DeviceFD) + fuseFD.fs = fs + + return fs, nil +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go new file mode 100644 index 000000000..b5b581152 --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/register.go @@ -0,0 +1,42 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +// Register registers the FUSE device with vfsObj. +func Register(vfsObj *vfs.VirtualFilesystem) error { + if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{ + GroupName: "misc", + }); err != nil { + return err + } + + return nil +} + +// CreateDevtmpfsFile creates a device special file in devtmpfs. +func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error { + if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil { + return err + } + + return nil +} -- cgit v1.2.3 From 2e19a8b951e9402b28b4e601e65c51e69c815db1 Mon Sep 17 00:00:00 2001 From: Jinmou Li Date: Sat, 27 Jun 2020 02:14:56 +0000 Subject: Add FUSE_INIT This change allows the sentry to send FUSE_INIT request and process the reply. It adds the corresponding structs, employs the fuse device to send and read the message, and stores the results of negotiation in corresponding places (inside connection struct). It adds a CallAsync() function to the FUSE connection interface: - like Call(), but it's for requests that do not expect immediate response (init, release, interrupt etc.) - will block if the connection hasn't initialized, which is the same for Call() --- pkg/abi/linux/fuse.go | 105 ++++++++++++++++ pkg/sentry/fsimpl/fuse/BUILD | 4 +- pkg/sentry/fsimpl/fuse/connection.go | 234 +++++++++++++++++++++++++++++++---- pkg/sentry/fsimpl/fuse/dev.go | 13 +- pkg/sentry/fsimpl/fuse/dev_test.go | 19 ++- pkg/sentry/fsimpl/fuse/fusefs.go | 22 ++-- pkg/sentry/fsimpl/fuse/init.go | 166 +++++++++++++++++++++++++ pkg/syserror/syserror.go | 1 + 8 files changed, 512 insertions(+), 52 deletions(-) create mode 100644 pkg/sentry/fsimpl/fuse/init.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index d3ebbccc4..5c6ffe4a3 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -141,3 +141,108 @@ type FUSEWriteIn struct { _ uint32 } + +// FUSE_INIT flags, consistent with the ones in include/uapi/linux/fuse.h. +const ( + FUSE_ASYNC_READ = 1 << 0 + FUSE_POSIX_LOCKS = 1 << 1 + FUSE_FILE_OPS = 1 << 2 + FUSE_ATOMIC_O_TRUNC = 1 << 3 + FUSE_EXPORT_SUPPORT = 1 << 4 + FUSE_BIG_WRITES = 1 << 5 + FUSE_DONT_MASK = 1 << 6 + FUSE_SPLICE_WRITE = 1 << 7 + FUSE_SPLICE_MOVE = 1 << 8 + FUSE_SPLICE_READ = 1 << 9 + FUSE_FLOCK_LOCKS = 1 << 10 + FUSE_HAS_IOCTL_DIR = 1 << 11 + FUSE_AUTO_INVAL_DATA = 1 << 12 + FUSE_DO_READDIRPLUS = 1 << 13 + FUSE_READDIRPLUS_AUTO = 1 << 14 + FUSE_ASYNC_DIO = 1 << 15 + FUSE_WRITEBACK_CACHE = 1 << 16 + FUSE_NO_OPEN_SUPPORT = 1 << 17 + FUSE_PARALLEL_DIROPS = 1 << 18 + FUSE_HANDLE_KILLPRIV = 1 << 19 + FUSE_POSIX_ACL = 1 << 20 + FUSE_ABORT_ERROR = 1 << 21 + FUSE_MAX_PAGES = 1 << 22 + FUSE_CACHE_SYMLINKS = 1 << 23 + FUSE_NO_OPENDIR_SUPPORT = 1 << 24 + FUSE_EXPLICIT_INVAL_DATA = 1 << 25 + FUSE_MAP_ALIGNMENT = 1 << 26 +) + +// currently supported FUSE protocol version numbers. +const ( + FUSE_KERNEL_VERSION = 7 + FUSE_KERNEL_MINOR_VERSION = 31 +) + +// FUSEInitIn is the request sent by the kernel to the daemon, +// to negotiate the version and flags. +// +// +marshal +type FUSEInitIn struct { + // Major version supported by kernel. + Major uint32 + + // Minor version supported by the kernel. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead + // decided by the kernel. + MaxReadahead uint32 + + // Flags of this init request. + Flags uint32 +} + +// FUSEInitOut is the reply sent by the daemon to the kernel +// for FUSEInitIn. +// +// +marshal +type FUSEInitOut struct { + // Major version supported by daemon. + Major uint32 + + // Minor version supported by daemon. + Minor uint32 + + // MaxReadahead is the maximum number of bytes to read-ahead. + // Decided by the daemon, after receiving the value from kernel. + MaxReadahead uint32 + + // Flags of this init reply. + Flags uint32 + + // MaxBackground is the maximum number of pending background requests + // that the daemon wants. + MaxBackground uint16 + + // CongestionThreshold is the daemon-decided threshold for + // the number of the pending background requests. + CongestionThreshold uint16 + + // MaxWrite is the daemon's maximum size of a write buffer. + // Kernel adjusts it to the minimum (fuse/init.go:fuseMinMaxWrite). + // if the value from daemon is too small. + MaxWrite uint32 + + // TimeGran is the daemon's time granularity for mtime and ctime metadata. + // The unit is nanosecond. + // Value should be power of 10. + // 1 indicates full nanosecond granularity support. + TimeGran uint32 + + // MaxPages is the daemon's maximum number of pages for one write operation. + // Kernel adjusts it to the maximum (fuse/init.go:FUSE_MAX_MAX_PAGES). + // if the value from daemon is too large. + MaxPages uint16 + + // MapAlignment is an unknown field and not used by this package at this moment. + // Use as a placeholder to be consistent with the FUSE protocol. + MapAlignment uint16 + + _ [8]uint32 +} diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD index 67649e811..999111deb 100644 --- a/pkg/sentry/fsimpl/fuse/BUILD +++ b/pkg/sentry/fsimpl/fuse/BUILD @@ -21,6 +21,7 @@ go_library( "connection.go", "dev.go", "fusefs.go", + "init.go", "register.go", "request_list.go", ], @@ -44,14 +45,13 @@ go_library( ) go_test( - name = "dev_test", + name = "fuse_test", size = "small", srcs = ["dev_test.go"], library = ":fuse", deps = [ "//pkg/abi/linux", "//pkg/sentry/fsimpl/testutil", - "//pkg/sentry/fsimpl/tmpfs", "//pkg/sentry/kernel", "//pkg/sentry/kernel/auth", "//pkg/sentry/vfs", diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go index f330da0bd..6df2728ab 100644 --- a/pkg/sentry/fsimpl/fuse/connection.go +++ b/pkg/sentry/fsimpl/fuse/connection.go @@ -17,6 +17,8 @@ package fuse import ( "errors" "fmt" + "sync" + "sync/atomic" "syscall" "gvisor.dev/gvisor/pkg/abi/linux" @@ -25,18 +27,29 @@ import ( "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" "gvisor.dev/gvisor/tools/go_marshal/marshal" ) -// MaxActiveRequestsDefault is the default setting controlling the upper bound +// maxActiveRequestsDefault is the default setting controlling the upper bound // on the number of active requests at any given time. -const MaxActiveRequestsDefault = 10000 +const maxActiveRequestsDefault = 10000 -var ( - // Ordinary requests have even IDs, while interrupts IDs are odd. - InitReqBit uint64 = 1 - ReqIDStep uint64 = 2 +// Ordinary requests have even IDs, while interrupts IDs are odd. +// Used to increment the unique ID for each FUSE request. +var reqIDStep uint64 = 2 + +const ( + // fuseDefaultMaxBackground is the default value for MaxBackground. + fuseDefaultMaxBackground = 12 + + // fuseDefaultCongestionThreshold is the default value for CongestionThreshold, + // and is 75% of the default maximum of MaxGround. + fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4) + + // fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq. + fuseDefaultMaxPagesPerReq = 32 ) // Request represents a FUSE operation request that hasn't been sent to the @@ -61,17 +74,125 @@ type Response struct { data []byte } -// Connection is the struct by which the sentry communicates with the FUSE server daemon. -type Connection struct { +// connection is the struct by which the sentry communicates with the FUSE server daemon. +type connection struct { fd *DeviceFD - // MaxWrite is the daemon's maximum size of a write buffer. - // This is negotiated during FUSE_INIT. - MaxWrite uint32 + // The following FUSE_INIT flags are currently unsupported by this implementation: + // - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC) + // - FUSE_EXPORT_SUPPORT + // - FUSE_HANDLE_KILLPRIV + // - FUSE_POSIX_LOCKS: requires POSIX locks + // - FUSE_FLOCK_LOCKS: requires POSIX locks + // - FUSE_AUTO_INVAL_DATA: requires page caching eviction + // - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction + // - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation + // - FUSE_ASYNC_DIO + // - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler + + // initialized after receiving FUSE_INIT reply. + // Until it's set, suspend sending FUSE requests. + // Use SetInitialized() and IsInitialized() for atomic access. + initialized int32 + + // initializedChan is used to block requests before initialization. + initializedChan chan struct{} + + // blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground). + // TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block. + blocked bool + + // connected (connection established) when a new FUSE file system is created. + // Set to false when: + // umount, + // connection abort, + // device release. + connected bool + + // aborted via sysfs. + // TODO(gvisor.dev/issue/3185): abort all queued requests. + aborted bool + + // connInitError if FUSE_INIT encountered error (major version mismatch). + // Only set in INIT. + connInitError bool + + // connInitSuccess if FUSE_INIT is successful. + // Only set in INIT. + // Used for destory. + connInitSuccess bool + + // TODO(gvisor.dev/issue/3185): All the queue logic are working in progress. + + // NumberBackground is the number of requests in the background. + numBackground uint16 + + // congestionThreshold for NumBackground. + // Negotiated in FUSE_INIT. + congestionThreshold uint16 + + // maxBackground is the maximum number of NumBackground. + // Block connection when it is reached. + // Negotiated in FUSE_INIT. + maxBackground uint16 + + // numActiveBackground is the number of requests in background and has being marked as active. + numActiveBackground uint16 + + // numWating is the number of requests waiting for completion. + numWaiting uint32 + + // TODO(gvisor.dev/issue/3185): BgQueue + // some queue for background queued requests. + + // bgLock protects: + // MaxBackground, CongestionThreshold, NumBackground, + // NumActiveBackground, BgQueue, Blocked. + bgLock sync.Mutex + + // maxRead is the maximum size of a read buffer in in bytes. + maxRead uint32 + + // maxWrite is the maximum size of a write buffer in bytes. + // Negotiated in FUSE_INIT. + maxWrite uint32 + + // maxPages is the maximum number of pages for a single request to use. + // Negotiated in FUSE_INIT. + maxPages uint16 + + // minor version of the FUSE protocol. + // Negotiated and only set in INIT. + minor uint32 + + // asyncRead if read pages asynchronously. + // Negotiated and only set in INIT. + asyncRead bool + + // abortErr is true if kernel need to return an unique read error after abort. + // Negotiated and only set in INIT. + abortErr bool + + // writebackCache is true for write-back cache policy, + // false for write-through policy. + // Negotiated and only set in INIT. + writebackCache bool + + // cacheSymlinks if filesystem needs to cache READLINK responses in page cache. + // Negotiated and only set in INIT. + cacheSymlinks bool + + // bigWrites if doing multi-page cached writes. + // Negotiated and only set in INIT. + bigWrites bool + + // dontMask if filestestem does not apply umask to creation modes. + // Negotiated in INIT. + dontMask bool } -// NewFUSEConnection creates a FUSE connection to fd -func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*Connection, error) { +// newFUSEConnection creates a FUSE connection to fd. +func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) { // Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to // mount a FUSE filesystem. fuseFD := fd.Impl().(*DeviceFD) @@ -84,16 +205,41 @@ func NewFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRe fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests) fuseFD.writeCursor = 0 - return &Connection{ - fd: fuseFD, + return &connection{ + fd: fuseFD, + maxBackground: fuseDefaultMaxBackground, + congestionThreshold: fuseDefaultCongestionThreshold, + maxPages: fuseDefaultMaxPagesPerReq, + initializedChan: make(chan struct{}), + connected: true, }, nil } +// SetInitialized atomically sets the connection as initialized. +func (conn *connection) SetInitialized() { + // Unblock the requests sent before INIT. + close(conn.initializedChan) + + // Close the channel first to avoid the non-atomic situation + // where conn.initialized is true but there are + // tasks being blocked on the channel. + // And it prevents the newer tasks from gaining + // unnecessary higher chance to be issued before the blocked one. + + atomic.StoreInt32(&(conn.initialized), int32(1)) +} + +// IsInitialized atomically check if the connection is initialized. +// pairs with SetInitialized(). +func (conn *connection) Initialized() bool { + return atomic.LoadInt32(&(conn.initialized)) != 0 +} + // NewRequest creates a new request that can be sent to the FUSE server. -func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { +func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() - conn.fd.nextOpID += linux.FUSEOpID(ReqIDStep) + conn.fd.nextOpID += linux.FUSEOpID(reqIDStep) hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes() hdr := linux.FUSEHeaderIn{ @@ -118,13 +264,49 @@ func (conn *Connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint } // Call makes a request to the server and blocks the invoking task until a -// server responds with a response. -// NOTE: If no task is provided then the Call will simply enqueue the request -// and return a nil response. No blocking will happen in this case. Instead, -// this is used to signify that the processing of this request will happen by -// the kernel.Task that writes the response. See FUSE_INIT for such an -// invocation. -func (conn *Connection) Call(t *kernel.Task, r *Request) (*Response, error) { +// server responds with a response. Task should never be nil. +// Requests will not be sent before the connection is initialized. +// For async tasks, use CallAsync(). +func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) { + // Block requests sent before connection is initalized. + if !conn.Initialized() { + if err := t.Block(conn.initializedChan); err != nil { + return nil, err + } + } + + return conn.call(t, r) +} + +// CallAsync makes an async (aka background) request. +// Those requests either do not expect a response (e.g. release) or +// the response should be handled by others (e.g. init). +// Return immediately unless the connection is blocked (before initialization). +// Async call example: init, release, forget, aio, interrupt. +// When the Request is FUSE_INIT, it will not be blocked before initialization. +func (conn *connection) CallAsync(t *kernel.Task, r *Request) error { + // Block requests sent before connection is initalized. + if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT { + if err := t.Block(conn.initializedChan); err != nil { + return err + } + } + + // This should be the only place that invokes call() with a nil task. + _, err := conn.call(nil, r) + return err +} + +// call makes a call without blocking checks. +func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) { + if !conn.connected { + return nil, syserror.ENOTCONN + } + + if conn.connInitError { + return nil, syserror.ECONNREFUSED + } + fut, err := conn.callFuture(t, r) if err != nil { return nil, err @@ -160,7 +342,7 @@ func (r *Response) UnmarshalPayload(m marshal.Marshallable) error { // callFuture makes a request to the server and returns a future response. // Call resolve() when the response needs to be fulfilled. -func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.mu.Lock() defer conn.fd.mu.Unlock() @@ -195,7 +377,7 @@ func (conn *Connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, } // callFutureLocked makes a request to the server and returns a future response. -func (conn *Connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { +func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) { conn.fd.queue.PushBack(r) conn.fd.numActiveRequests += 1 fut := newFutureResponse(r.hdr.Opcode) diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index f3443ac71..2225076bc 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -98,7 +99,9 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release() {} +func (fd *DeviceFD) Release() { + fd.fs.conn.connected = false +} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { @@ -124,7 +127,7 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R minBuffSize := linux.FUSE_MIN_READ_BUFFER inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) - negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.MaxWrite + negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite if minBuffSize < negotiatedMinBuffSize { minBuffSize = negotiatedMinBuffSize } @@ -385,9 +388,9 @@ func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) er // FUSE_INIT. func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error { if r.opcode == linux.FUSE_INIT { - // TODO: process init response here. - // Maybe get the creds from the context? - // creds := auth.CredentialsFromContext(ctx) + creds := auth.CredentialsFromContext(ctx) + rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() + return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) } return nil diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index fcd77832a..122415275 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -22,7 +22,6 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil" - "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -60,25 +59,25 @@ func TestFUSECommunication(t *testing.T) { Name: "SingleClientSingleServer", NumClients: 1, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "SingleClientMultipleServers", NumClients: 1, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsSingleServer", NumClients: 10, NumServers: 1, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "MultipleClientsMultipleServers", NumClients: 10, NumServers: 10, - MaxActiveRequests: MaxActiveRequestsDefault, + MaxActiveRequests: maxActiveRequestsDefault, }, { Name: "RequestCapacityFull", @@ -145,7 +144,7 @@ func TestFUSECommunication(t *testing.T) { // CallTest makes a request to the server and blocks the invoking // goroutine until a server responds with a response. Doesn't block // a kernel.Task. Analogous to Connection.Call but used for testing. -func CallTest(conn *Connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { +func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) { conn.fd.mu.Lock() // Wait until we're certain that a new request can be processed. @@ -214,7 +213,7 @@ func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem. // fuseClientRun emulates all the actions of a normal FUSE request. It creates // a header, a payload, calls the server, waits for the response, and processes // the response. -func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *Connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { +func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) { defer func() { clientDone <- struct{}{} }() tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits()) @@ -343,7 +342,7 @@ func setup(t *testing.T) *testutil.System { AllowUserMount: true, }) - mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{}) + mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{}) if err != nil { t.Fatalf("NewMountNamespace(): %v", err) } @@ -353,7 +352,7 @@ func setup(t *testing.T) *testutil.System { // newTestConnection creates a fuse connection that the sentry can communicate with // and the FD for the server to communicate with. -func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*Connection, *vfs.FileDescription, error) { +func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) { vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} @@ -426,4 +425,4 @@ func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) // WriteTo implements io.WriterTo.WriteTo. func (t *testPayload) WriteTo(w io.Writer) (int64, error) { panic("not implemented") -} +} \ No newline at end of file diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 911b6f7cb..200a93bbf 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -65,7 +65,7 @@ type filesystem struct { // conn is used for communication between the FUSE server // daemon and the sentry fusefs. - conn *Connection + conn *connection // opts is the options the fusefs is initialized with. opts *filesystemOptions @@ -140,7 +140,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fsopts.rootMode = rootMode // Set the maxInFlightRequests option. - fsopts.maxActiveRequests = MaxActiveRequestsDefault + fsopts.maxActiveRequests = maxActiveRequestsDefault // Check for unparsed options. if len(mopts) != 0 { @@ -157,8 +157,12 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - // TODO: dispatch a FUSE_INIT request to the FUSE daemon server before - // returning. Mount will not block on this dispatched request. + // Send a FUSE_INIT request to the FUSE daemon server before returning. + // This call is not blocking. + if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { + log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) + return nil, nil, err + } // root is the fusefs root directory. root := fs.newInode(creds, fsopts.rootMode) @@ -173,7 +177,7 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt opts: opts, } - conn, err := NewFUSEConnection(ctx, device, opts.maxActiveRequests) + conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, syserror.EINVAL @@ -192,8 +196,8 @@ func (fs *filesystem) Release() { fs.Filesystem.Release() } -// Inode implements kernfs.Inode. -type Inode struct { +// inode implements kernfs.Inode. +type inode struct { kernfs.InodeAttrs kernfs.InodeNoDynamicLookup kernfs.InodeNotSymlink @@ -206,7 +210,7 @@ type Inode struct { } func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { - i := &Inode{} + i := &inode{} i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.dentry.Init(i) @@ -215,7 +219,7 @@ func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *ke } // Open implements kernfs.Inode.Open. -func (i *Inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { +func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts) if err != nil { return nil, err diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go new file mode 100644 index 000000000..779c2bd3f --- /dev/null +++ b/pkg/sentry/fsimpl/fuse/init.go @@ -0,0 +1,166 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fuse + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// consts used by FUSE_INIT negotiation. +const ( + // fuseMaxMaxPages is the maximum value for MaxPages received in InitOut. + // Follow the same behavior as unix fuse implementation. + fuseMaxMaxPages = 256 + + // Maximum value for the time granularity for file time stamps, 1s. + // Follow the same behavior as unix fuse implementation. + fuseMaxTimeGranNs = 1000000000 + + // Minimum value for MaxWrite. + // Follow the same behavior as unix fuse implementation. + fuseMinMaxWrite = 4096 + + // Temporary default value for max readahead, 128kb. + fuseDefaultMaxReadahead = 131072 + + // The FUSE_INIT_IN flags sent to the daemon. + // TODO(gvisor.dev/issue/3199): complete the flags. + fuseDefaultInitFlags = linux.FUSE_MAX_PAGES +) + +// Adjustable maximums for Connection's cogestion control parameters. +// Used as the upperbound of the config values. +// Currently we do not support adjustment to them. +var ( + MaxUserBackgroundRequest uint16 = fuseDefaultMaxBackground + MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold +) + +// InitSend sends a FUSE_INIT request. +func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error { + in := linux.FUSEInitIn{ + Major: linux.FUSE_KERNEL_VERSION, + Minor: linux.FUSE_KERNEL_MINOR_VERSION, + // TODO(gvisor.dev/issue/3196): find appropriate way to calculate this + MaxReadahead: fuseDefaultMaxReadahead, + Flags: fuseDefaultInitFlags, + } + + req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in) + if err != nil { + return err + } + + // Since there is no task to block on and FUSE_INIT is the request + // to unblock other requests, use nil. + return conn.CallAsync(nil, req) +} + +// InitRecv receives a FUSE_INIT reply and process it. +func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error { + if err := res.Error(); err != nil { + return err + } + + var out linux.FUSEInitOut + if err := res.UnmarshalPayload(&out); err != nil { + return err + } + + return conn.initProcessReply(&out, hasSysAdminCap) +} + +// Process the FUSE_INIT reply from the FUSE server. +func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error { + // No support for old major fuse versions. + if out.Major != linux.FUSE_KERNEL_VERSION { + conn.connInitError = true + + // Set the connection as initialized and unblock the blocked requests + // (i.e. return error for them). + conn.SetInitialized() + + return nil + } + + // Start processing the reply. + conn.connInitSuccess = true + conn.minor = out.Minor + + // No support for limits before minor version 13. + if out.Minor >= 13 { + conn.bgLock.Lock() + + if out.MaxBackground > 0 { + conn.maxBackground = out.MaxBackground + + if !hasSysAdminCap && + conn.maxBackground > MaxUserBackgroundRequest { + conn.maxBackground = MaxUserBackgroundRequest + } + } + + if out.CongestionThreshold > 0 { + conn.congestionThreshold = out.CongestionThreshold + + if !hasSysAdminCap && + conn.congestionThreshold > MaxUserCongestionThreshold { + conn.congestionThreshold = MaxUserCongestionThreshold + } + } + + conn.bgLock.Unlock() + } + + // No support for the following flags before minor version 6. + if out.Minor >= 6 { + conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0 + conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0 + conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0 + conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0 + conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0 + conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0 + + // TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs). + + if out.Flags&linux.FUSE_MAX_PAGES != 0 { + maxPages := out.MaxPages + if maxPages < 1 { + maxPages = 1 + } + if maxPages > fuseMaxMaxPages { + maxPages = fuseMaxMaxPages + } + conn.maxPages = maxPages + } + } + + // No support for negotiating MaxWrite before minor version 5. + if out.Minor >= 5 { + conn.maxWrite = out.MaxWrite + } else { + conn.maxWrite = fuseMinMaxWrite + } + if conn.maxWrite < fuseMinMaxWrite { + conn.maxWrite = fuseMinMaxWrite + } + + // Set connection as initialized and unblock the requests + // issued before init. + conn.SetInitialized() + + return nil +} diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go index c73072c42..798e07b01 100644 --- a/pkg/syserror/syserror.go +++ b/pkg/syserror/syserror.go @@ -61,6 +61,7 @@ var ( ENOMEM = error(syscall.ENOMEM) ENOSPC = error(syscall.ENOSPC) ENOSYS = error(syscall.ENOSYS) + ENOTCONN = error(syscall.ENOTCONN) ENOTDIR = error(syscall.ENOTDIR) ENOTEMPTY = error(syscall.ENOTEMPTY) ENOTSOCK = error(syscall.ENOTSOCK) -- cgit v1.2.3 From b2ae7ea1bb207eddadd7962080e7bd0b8634db96 Mon Sep 17 00:00:00 2001 From: Nayana Bidari Date: Mon, 3 Aug 2020 13:33:47 -0700 Subject: Plumbing context.Context to DecRef() and Release(). context is passed to DecRef() and Release() which is needed for SO_LINGER implementation. PiperOrigin-RevId: 324672584 --- pkg/refs/BUILD | 6 +- pkg/refs/refcounter.go | 19 +-- pkg/refs/refcounter_test.go | 38 ++--- pkg/sentry/control/proc.go | 2 +- pkg/sentry/devices/memdev/full.go | 2 +- pkg/sentry/devices/memdev/null.go | 2 +- pkg/sentry/devices/memdev/random.go | 2 +- pkg/sentry/devices/memdev/zero.go | 2 +- pkg/sentry/devices/ttydev/ttydev.go | 2 +- pkg/sentry/devices/tundev/tundev.go | 4 +- pkg/sentry/fdimport/fdimport.go | 8 +- pkg/sentry/fs/copy_up.go | 12 +- pkg/sentry/fs/copy_up_test.go | 4 +- pkg/sentry/fs/dev/net_tun.go | 4 +- pkg/sentry/fs/dirent.go | 110 +++++++------- pkg/sentry/fs/dirent_cache.go | 3 +- pkg/sentry/fs/dirent_refs_test.go | 16 +-- pkg/sentry/fs/dirent_state.go | 3 +- pkg/sentry/fs/fdpipe/pipe.go | 2 +- pkg/sentry/fs/fdpipe/pipe_opener_test.go | 16 +-- pkg/sentry/fs/fdpipe/pipe_test.go | 18 +-- pkg/sentry/fs/file.go | 10 +- pkg/sentry/fs/file_operations.go | 2 +- pkg/sentry/fs/file_overlay.go | 22 +-- pkg/sentry/fs/fsutil/file.go | 4 +- pkg/sentry/fs/gofer/file.go | 4 +- pkg/sentry/fs/gofer/gofer_test.go | 8 +- pkg/sentry/fs/gofer/handles.go | 5 +- pkg/sentry/fs/gofer/inode.go | 5 +- pkg/sentry/fs/gofer/path.go | 6 +- pkg/sentry/fs/gofer/session.go | 16 +-- pkg/sentry/fs/gofer/session_state.go | 3 +- pkg/sentry/fs/gofer/socket.go | 6 +- pkg/sentry/fs/host/control.go | 2 +- pkg/sentry/fs/host/file.go | 4 +- pkg/sentry/fs/host/inode_test.go | 2 +- pkg/sentry/fs/host/socket.go | 10 +- pkg/sentry/fs/host/socket_test.go | 38 ++--- pkg/sentry/fs/host/tty.go | 4 +- pkg/sentry/fs/host/wait_test.go | 2 +- pkg/sentry/fs/inode.go | 11 +- pkg/sentry/fs/inode_inotify.go | 5 +- pkg/sentry/fs/inode_overlay.go | 30 ++-- pkg/sentry/fs/inode_overlay_test.go | 8 +- pkg/sentry/fs/inotify.go | 8 +- pkg/sentry/fs/inotify_watch.go | 9 +- pkg/sentry/fs/mount.go | 12 +- pkg/sentry/fs/mount_overlay.go | 6 +- pkg/sentry/fs/mount_test.go | 29 ++-- pkg/sentry/fs/mounts.go | 30 ++-- pkg/sentry/fs/mounts_test.go | 2 +- pkg/sentry/fs/overlay.go | 10 +- pkg/sentry/fs/proc/fds.go | 18 +-- pkg/sentry/fs/proc/mounts.go | 8 +- pkg/sentry/fs/proc/net.go | 12 +- pkg/sentry/fs/proc/proc.go | 2 +- pkg/sentry/fs/proc/task.go | 4 +- pkg/sentry/fs/ramfs/dir.go | 18 +-- pkg/sentry/fs/ramfs/tree_test.go | 2 +- pkg/sentry/fs/timerfd/timerfd.go | 4 +- pkg/sentry/fs/tmpfs/file_test.go | 2 +- pkg/sentry/fs/tty/dir.go | 8 +- pkg/sentry/fs/tty/fs.go | 2 +- pkg/sentry/fs/tty/master.go | 8 +- pkg/sentry/fs/tty/slave.go | 4 +- pkg/sentry/fs/user/path.go | 8 +- pkg/sentry/fs/user/user.go | 8 +- pkg/sentry/fs/user/user_test.go | 8 +- pkg/sentry/fsbridge/bridge.go | 2 +- pkg/sentry/fsbridge/fs.go | 8 +- pkg/sentry/fsbridge/vfs.go | 6 +- pkg/sentry/fsimpl/devpts/devpts.go | 4 +- pkg/sentry/fsimpl/devpts/master.go | 6 +- pkg/sentry/fsimpl/devpts/slave.go | 6 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs.go | 6 +- pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go | 8 +- pkg/sentry/fsimpl/eventfd/eventfd.go | 6 +- pkg/sentry/fsimpl/eventfd/eventfd_test.go | 12 +- pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 6 +- pkg/sentry/fsimpl/ext/dentry.go | 7 +- pkg/sentry/fsimpl/ext/directory.go | 2 +- pkg/sentry/fsimpl/ext/ext.go | 10 +- pkg/sentry/fsimpl/ext/ext_test.go | 4 +- pkg/sentry/fsimpl/ext/filesystem.go | 68 ++++----- pkg/sentry/fsimpl/ext/regular_file.go | 2 +- pkg/sentry/fsimpl/ext/symlink.go | 2 +- pkg/sentry/fsimpl/fuse/dev.go | 2 +- pkg/sentry/fsimpl/fuse/dev_test.go | 4 +- pkg/sentry/fsimpl/fuse/fusefs.go | 4 +- pkg/sentry/fsimpl/gofer/directory.go | 4 +- pkg/sentry/fsimpl/gofer/filesystem.go | 90 ++++++------ pkg/sentry/fsimpl/gofer/gofer.go | 40 +++--- pkg/sentry/fsimpl/gofer/gofer_test.go | 6 +- pkg/sentry/fsimpl/gofer/regular_file.go | 2 +- pkg/sentry/fsimpl/gofer/socket.go | 6 +- pkg/sentry/fsimpl/gofer/special_file.go | 4 +- pkg/sentry/fsimpl/host/control.go | 2 +- pkg/sentry/fsimpl/host/host.go | 14 +- pkg/sentry/fsimpl/host/socket.go | 12 +- pkg/sentry/fsimpl/host/tty.go | 4 +- pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go | 2 +- pkg/sentry/fsimpl/kernfs/fd_impl_util.go | 2 +- pkg/sentry/fsimpl/kernfs/filesystem.go | 68 ++++----- pkg/sentry/fsimpl/kernfs/inode_impl_util.go | 10 +- pkg/sentry/fsimpl/kernfs/kernfs.go | 26 ++-- pkg/sentry/fsimpl/kernfs/kernfs_test.go | 18 +-- pkg/sentry/fsimpl/overlay/copy_up.go | 8 +- pkg/sentry/fsimpl/overlay/directory.go | 8 +- pkg/sentry/fsimpl/overlay/filesystem.go | 64 ++++----- pkg/sentry/fsimpl/overlay/non_directory.go | 18 +-- pkg/sentry/fsimpl/overlay/overlay.go | 48 +++---- pkg/sentry/fsimpl/pipefs/pipefs.go | 6 +- pkg/sentry/fsimpl/proc/filesystem.go | 4 +- pkg/sentry/fsimpl/proc/task_fds.go | 18 +-- pkg/sentry/fsimpl/proc/task_files.go | 14 +- pkg/sentry/fsimpl/proc/task_net.go | 12 +- pkg/sentry/fsimpl/proc/tasks_test.go | 10 +- pkg/sentry/fsimpl/signalfd/signalfd.go | 4 +- pkg/sentry/fsimpl/sockfs/sockfs.go | 4 +- pkg/sentry/fsimpl/sys/sys.go | 4 +- pkg/sentry/fsimpl/sys/sys_test.go | 2 +- pkg/sentry/fsimpl/testutil/kernel.go | 2 +- pkg/sentry/fsimpl/testutil/testutil.go | 6 +- pkg/sentry/fsimpl/timerfd/timerfd.go | 6 +- pkg/sentry/fsimpl/tmpfs/benchmark_test.go | 50 +++---- pkg/sentry/fsimpl/tmpfs/directory.go | 4 +- pkg/sentry/fsimpl/tmpfs/filesystem.go | 106 +++++++------- pkg/sentry/fsimpl/tmpfs/pipe_test.go | 20 +-- pkg/sentry/fsimpl/tmpfs/regular_file.go | 2 +- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 34 ++--- pkg/sentry/fsimpl/tmpfs/tmpfs_test.go | 6 +- pkg/sentry/kernel/abstract_socket_namespace.go | 13 +- pkg/sentry/kernel/epoll/epoll.go | 14 +- pkg/sentry/kernel/epoll/epoll_test.go | 5 +- pkg/sentry/kernel/eventfd/eventfd.go | 4 +- pkg/sentry/kernel/fd_table.go | 55 +++---- pkg/sentry/kernel/fd_table_test.go | 6 +- pkg/sentry/kernel/fs_context.go | 31 ++-- pkg/sentry/kernel/futex/BUILD | 1 + pkg/sentry/kernel/futex/futex.go | 35 ++--- pkg/sentry/kernel/futex/futex_test.go | 66 +++++---- pkg/sentry/kernel/kernel.go | 57 ++++---- pkg/sentry/kernel/pipe/node.go | 6 +- pkg/sentry/kernel/pipe/node_test.go | 2 +- pkg/sentry/kernel/pipe/pipe.go | 2 +- pkg/sentry/kernel/pipe/pipe_test.go | 16 +-- pkg/sentry/kernel/pipe/pipe_util.go | 2 +- pkg/sentry/kernel/pipe/reader.go | 3 +- pkg/sentry/kernel/pipe/vfs.go | 8 +- pkg/sentry/kernel/pipe/writer.go | 3 +- pkg/sentry/kernel/sessions.go | 5 +- pkg/sentry/kernel/shm/shm.go | 10 +- pkg/sentry/kernel/signalfd/signalfd.go | 2 +- pkg/sentry/kernel/task.go | 8 +- pkg/sentry/kernel/task_clone.go | 10 +- pkg/sentry/kernel/task_exec.go | 6 +- pkg/sentry/kernel/task_exit.go | 8 +- pkg/sentry/kernel/task_log.go | 2 +- pkg/sentry/kernel/task_start.go | 6 +- pkg/sentry/kernel/thread_group.go | 4 +- pkg/sentry/loader/elf.go | 4 +- pkg/sentry/loader/loader.go | 6 +- pkg/sentry/memmap/memmap.go | 2 +- pkg/sentry/mm/aio_context.go | 6 +- pkg/sentry/mm/lifecycle.go | 2 +- pkg/sentry/mm/metadata.go | 5 +- pkg/sentry/mm/special_mappable.go | 4 +- pkg/sentry/mm/syscalls.go | 4 +- pkg/sentry/mm/vma.go | 4 +- pkg/sentry/socket/control/control.go | 8 +- pkg/sentry/socket/control/control_vfs2.go | 8 +- pkg/sentry/socket/hostinet/socket.go | 8 +- pkg/sentry/socket/netlink/provider.go | 2 +- pkg/sentry/socket/netlink/socket.go | 14 +- pkg/sentry/socket/netlink/socket_vfs2.go | 4 +- pkg/sentry/socket/netstack/netstack.go | 6 +- pkg/sentry/socket/netstack/netstack_vfs2.go | 2 +- pkg/sentry/socket/socket.go | 4 +- pkg/sentry/socket/unix/transport/connectioned.go | 10 +- pkg/sentry/socket/unix/transport/connectionless.go | 12 +- pkg/sentry/socket/unix/transport/queue.go | 13 +- pkg/sentry/socket/unix/transport/unix.go | 48 +++---- pkg/sentry/socket/unix/unix.go | 38 ++--- pkg/sentry/socket/unix/unix_vfs2.go | 18 +-- pkg/sentry/strace/strace.go | 12 +- pkg/sentry/syscalls/epoll.go | 18 +-- pkg/sentry/syscalls/linux/sys_aio.go | 8 +- pkg/sentry/syscalls/linux/sys_eventfd.go | 2 +- pkg/sentry/syscalls/linux/sys_file.go | 78 +++++----- pkg/sentry/syscalls/linux/sys_futex.go | 6 +- pkg/sentry/syscalls/linux/sys_getdents.go | 2 +- pkg/sentry/syscalls/linux/sys_inotify.go | 10 +- pkg/sentry/syscalls/linux/sys_lseek.go | 2 +- pkg/sentry/syscalls/linux/sys_mmap.go | 4 +- pkg/sentry/syscalls/linux/sys_mount.go | 2 +- pkg/sentry/syscalls/linux/sys_pipe.go | 6 +- pkg/sentry/syscalls/linux/sys_poll.go | 10 +- pkg/sentry/syscalls/linux/sys_prctl.go | 4 +- pkg/sentry/syscalls/linux/sys_read.go | 12 +- pkg/sentry/syscalls/linux/sys_shm.go | 10 +- pkg/sentry/syscalls/linux/sys_signal.go | 4 +- pkg/sentry/syscalls/linux/sys_socket.go | 46 +++--- pkg/sentry/syscalls/linux/sys_splice.go | 12 +- pkg/sentry/syscalls/linux/sys_stat.go | 8 +- pkg/sentry/syscalls/linux/sys_sync.go | 8 +- pkg/sentry/syscalls/linux/sys_thread.go | 6 +- pkg/sentry/syscalls/linux/sys_timerfd.go | 6 +- pkg/sentry/syscalls/linux/sys_write.go | 10 +- pkg/sentry/syscalls/linux/sys_xattr.go | 8 +- pkg/sentry/syscalls/linux/vfs2/aio.go | 8 +- pkg/sentry/syscalls/linux/vfs2/epoll.go | 14 +- pkg/sentry/syscalls/linux/vfs2/eventfd.go | 4 +- pkg/sentry/syscalls/linux/vfs2/execve.go | 12 +- pkg/sentry/syscalls/linux/vfs2/fd.go | 12 +- pkg/sentry/syscalls/linux/vfs2/filesystem.go | 22 +-- pkg/sentry/syscalls/linux/vfs2/fscontext.go | 22 +-- pkg/sentry/syscalls/linux/vfs2/getdents.go | 2 +- pkg/sentry/syscalls/linux/vfs2/inotify.go | 14 +- pkg/sentry/syscalls/linux/vfs2/ioctl.go | 2 +- pkg/sentry/syscalls/linux/vfs2/lock.go | 2 +- pkg/sentry/syscalls/linux/vfs2/memfd.go | 2 +- pkg/sentry/syscalls/linux/vfs2/mmap.go | 4 +- pkg/sentry/syscalls/linux/vfs2/mount.go | 4 +- pkg/sentry/syscalls/linux/vfs2/path.go | 12 +- pkg/sentry/syscalls/linux/vfs2/pipe.go | 6 +- pkg/sentry/syscalls/linux/vfs2/poll.go | 10 +- pkg/sentry/syscalls/linux/vfs2/read_write.go | 48 +++---- pkg/sentry/syscalls/linux/vfs2/setstat.go | 20 +-- pkg/sentry/syscalls/linux/vfs2/signal.go | 4 +- pkg/sentry/syscalls/linux/vfs2/socket.go | 46 +++--- pkg/sentry/syscalls/linux/vfs2/splice.go | 20 +-- pkg/sentry/syscalls/linux/vfs2/stat.go | 30 ++-- pkg/sentry/syscalls/linux/vfs2/sync.go | 6 +- pkg/sentry/syscalls/linux/vfs2/timerfd.go | 8 +- pkg/sentry/syscalls/linux/vfs2/xattr.go | 16 +-- pkg/sentry/vfs/anonfs.go | 8 +- pkg/sentry/vfs/dentry.go | 37 ++--- pkg/sentry/vfs/epoll.go | 6 +- pkg/sentry/vfs/file_description.go | 24 ++-- pkg/sentry/vfs/file_description_impl_util_test.go | 18 +-- pkg/sentry/vfs/filesystem.go | 6 +- pkg/sentry/vfs/inotify.go | 34 ++--- pkg/sentry/vfs/mount.go | 54 +++---- pkg/sentry/vfs/pathname.go | 18 +-- pkg/sentry/vfs/resolving_path.go | 43 +++--- pkg/sentry/vfs/vfs.go | 160 ++++++++++----------- pkg/tcpip/link/tun/BUILD | 1 + pkg/tcpip/link/tun/device.go | 9 +- runsc/boot/fs.go | 12 +- runsc/boot/loader.go | 16 +-- runsc/boot/loader_test.go | 8 +- runsc/boot/vfs.go | 10 +- 252 files changed, 1711 insertions(+), 1668 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/refs/BUILD b/pkg/refs/BUILD index 74affc887..9888cce9c 100644 --- a/pkg/refs/BUILD +++ b/pkg/refs/BUILD @@ -24,6 +24,7 @@ go_library( ], visibility = ["//:sandbox"], deps = [ + "//pkg/context", "//pkg/log", "//pkg/sync", ], @@ -34,5 +35,8 @@ go_test( size = "small", srcs = ["refcounter_test.go"], library = ":refs", - deps = ["//pkg/sync"], + deps = [ + "//pkg/context", + "//pkg/sync", + ], ) diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index c45ba8200..61790221b 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -23,6 +23,7 @@ import ( "runtime" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" ) @@ -38,7 +39,7 @@ type RefCounter interface { // Note that AtomicRefCounter.DecRef() does not support destructors. // If a type has a destructor, it must implement its own DecRef() // method and call AtomicRefCounter.DecRefWithDestructor(destructor). - DecRef() + DecRef(ctx context.Context) // TryIncRef attempts to increase the reference counter on the object, // but may fail if all references have already been dropped. This @@ -57,7 +58,7 @@ type RefCounter interface { // A WeakRefUser is notified when the last non-weak reference is dropped. type WeakRefUser interface { // WeakRefGone is called when the last non-weak reference is dropped. - WeakRefGone() + WeakRefGone(ctx context.Context) } // WeakRef is a weak reference. @@ -123,7 +124,7 @@ func (w *WeakRef) Get() RefCounter { // Drop drops this weak reference. You should always call drop when you are // finished with the weak reference. You may not use this object after calling // drop. -func (w *WeakRef) Drop() { +func (w *WeakRef) Drop(ctx context.Context) { rc, ok := w.get() if !ok { // We've been zapped already. When the refcounter has called @@ -145,7 +146,7 @@ func (w *WeakRef) Drop() { // And now aren't on the object's list of weak references. So it won't // zap us if this causes the reference count to drop to zero. - rc.DecRef() + rc.DecRef(ctx) // Return to the pool. weakRefPool.Put(w) @@ -427,7 +428,7 @@ func (r *AtomicRefCount) dropWeakRef(w *WeakRef) { // A: TryIncRef [transform speculative to real] // //go:nosplit -func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { +func (r *AtomicRefCount) DecRefWithDestructor(ctx context.Context, destroy func(context.Context)) { switch v := atomic.AddInt64(&r.refCount, -1); { case v < -1: panic("Decrementing non-positive ref count") @@ -448,7 +449,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { if user != nil { r.mu.Unlock() - user.WeakRefGone() + user.WeakRefGone(ctx) r.mu.Lock() } } @@ -456,7 +457,7 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // Call the destructor. if destroy != nil { - destroy() + destroy(ctx) } } } @@ -464,6 +465,6 @@ func (r *AtomicRefCount) DecRefWithDestructor(destroy func()) { // DecRef decrements this object's reference count. // //go:nosplit -func (r *AtomicRefCount) DecRef() { - r.DecRefWithDestructor(nil) +func (r *AtomicRefCount) DecRef(ctx context.Context) { + r.DecRefWithDestructor(ctx, nil) } diff --git a/pkg/refs/refcounter_test.go b/pkg/refs/refcounter_test.go index 1ab4a4440..6d0dd1018 100644 --- a/pkg/refs/refcounter_test.go +++ b/pkg/refs/refcounter_test.go @@ -18,6 +18,7 @@ import ( "reflect" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -31,11 +32,11 @@ type testCounter struct { destroyed bool } -func (t *testCounter) DecRef() { - t.AtomicRefCount.DecRefWithDestructor(t.destroy) +func (t *testCounter) DecRef(ctx context.Context) { + t.AtomicRefCount.DecRefWithDestructor(ctx, t.destroy) } -func (t *testCounter) destroy() { +func (t *testCounter) destroy(context.Context) { t.mu.Lock() defer t.mu.Unlock() t.destroyed = true @@ -53,7 +54,7 @@ func newTestCounter() *testCounter { func TestOneRef(t *testing.T) { tc := newTestCounter() - tc.DecRef() + tc.DecRef(context.Background()) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -63,8 +64,9 @@ func TestOneRef(t *testing.T) { func TestTwoRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -74,12 +76,13 @@ func TestTwoRefs(t *testing.T) { func TestMultiRefs(t *testing.T) { tc := newTestCounter() tc.IncRef() - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) tc.IncRef() - tc.DecRef() + tc.DecRef(ctx) - tc.DecRef() + tc.DecRef(ctx) if !tc.IsDestroyed() { t.Errorf("object should have been destroyed") @@ -89,19 +92,20 @@ func TestMultiRefs(t *testing.T) { func TestWeakRef(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) + ctx := context.Background() // Try resolving. if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Try resolving again. if x := w.Get(); x == nil { t.Errorf("weak reference didn't resolve: expected %v, got nil", tc) } else { - x.DecRef() + x.DecRef(ctx) } // Shouldn't be destroyed yet. (Can't continue if this fails.) @@ -110,7 +114,7 @@ func TestWeakRef(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) // Assert destroyed. if !tc.IsDestroyed() { @@ -126,7 +130,8 @@ func TestWeakRef(t *testing.T) { func TestWeakRefDrop(t *testing.T) { tc := newTestCounter() w := NewWeakRef(tc, nil) - w.Drop() + ctx := context.Background() + w.Drop(ctx) // Just assert the list is empty. if !tc.weakRefs.Empty() { @@ -134,14 +139,14 @@ func TestWeakRefDrop(t *testing.T) { } // Drop the original reference. - tc.DecRef() + tc.DecRef(ctx) } type testWeakRefUser struct { weakRefGone func() } -func (u *testWeakRefUser) WeakRefGone() { +func (u *testWeakRefUser) WeakRefGone(ctx context.Context) { u.weakRefGone() } @@ -165,7 +170,8 @@ func TestCallback(t *testing.T) { }}) // Drop the original reference, this must trigger the callback. - tc.DecRef() + ctx := context.Background() + tc.DecRef(ctx) if !called { t.Fatalf("Callback not called") diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go index 1bae7cfaf..dfa936563 100644 --- a/pkg/sentry/control/proc.go +++ b/pkg/sentry/control/proc.go @@ -139,7 +139,6 @@ func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) { // Import file descriptors. fdTable := proc.Kernel.NewFDTable() - defer fdTable.DecRef() creds := auth.NewUserCredentials( args.KUID, @@ -177,6 +176,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI initArgs.MountNamespaceVFS2.IncRef() } ctx := initArgs.NewContext(proc.Kernel) + defer fdTable.DecRef(ctx) if kernel.VFS2Enabled { // Get the full path to the filename from the PATH env variable. diff --git a/pkg/sentry/devices/memdev/full.go b/pkg/sentry/devices/memdev/full.go index af66fe4dc..511179e31 100644 --- a/pkg/sentry/devices/memdev/full.go +++ b/pkg/sentry/devices/memdev/full.go @@ -46,7 +46,7 @@ type fullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *fullFD) Release() { +func (fd *fullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/null.go b/pkg/sentry/devices/memdev/null.go index 92d3d71be..4918dbeeb 100644 --- a/pkg/sentry/devices/memdev/null.go +++ b/pkg/sentry/devices/memdev/null.go @@ -47,7 +47,7 @@ type nullFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nullFD) Release() { +func (fd *nullFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/random.go b/pkg/sentry/devices/memdev/random.go index 6b81da5ef..5e7fe0280 100644 --- a/pkg/sentry/devices/memdev/random.go +++ b/pkg/sentry/devices/memdev/random.go @@ -56,7 +56,7 @@ type randomFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *randomFD) Release() { +func (fd *randomFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go index c6f15054d..2e631a252 100644 --- a/pkg/sentry/devices/memdev/zero.go +++ b/pkg/sentry/devices/memdev/zero.go @@ -48,7 +48,7 @@ type zeroFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *zeroFD) Release() { +func (fd *zeroFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/devices/ttydev/ttydev.go b/pkg/sentry/devices/ttydev/ttydev.go index fbb7fd92c..fd4b79c46 100644 --- a/pkg/sentry/devices/ttydev/ttydev.go +++ b/pkg/sentry/devices/ttydev/ttydev.go @@ -55,7 +55,7 @@ type ttyFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *ttyFD) Release() {} +func (fd *ttyFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *ttyFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go index dfbd069af..852ec3c5c 100644 --- a/pkg/sentry/devices/tundev/tundev.go +++ b/pkg/sentry/devices/tundev/tundev.go @@ -108,8 +108,8 @@ func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArg } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *tunFD) Release() { - fd.device.Release() +func (fd *tunFD) Release(ctx context.Context) { + fd.device.Release(ctx) } // PRead implements vfs.FileDescriptionImpl.PRead. diff --git a/pkg/sentry/fdimport/fdimport.go b/pkg/sentry/fdimport/fdimport.go index b8686adb4..1b7cb94c0 100644 --- a/pkg/sentry/fdimport/fdimport.go +++ b/pkg/sentry/fdimport/fdimport.go @@ -50,7 +50,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will // use it for the other stdio FDs. @@ -69,7 +69,7 @@ func importFS(ctx context.Context, fdTable *kernel.FDTable, console bool, fds [] if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } // Add the file to the FD map. @@ -102,7 +102,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) // Remember this in the TTY file, as we will use it for the other stdio // FDs. @@ -119,7 +119,7 @@ func importVFS2(ctx context.Context, fdTable *kernel.FDTable, console bool, stdi if err != nil { return nil, err } - defer appFile.DecRef() + defer appFile.DecRef(ctx) } if err := fdTable.NewFDAtVFS2(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil { diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index ab1424c95..735452b07 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -201,7 +201,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { parentUpper := parent.Inode.overlay.upper root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } // Create the file in the upper filesystem and get an Inode for it. @@ -212,7 +212,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { log.Warningf("copy up failed to create file: %v", err) return syserror.EIO } - defer childFile.DecRef() + defer childFile.DecRef(ctx) childUpperInode = childFile.Dirent.Inode case Directory: @@ -226,7 +226,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode case Symlink: @@ -246,7 +246,7 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } - defer childUpper.DecRef() + defer childUpper.DecRef(ctx) childUpperInode = childUpper.Inode default: @@ -352,14 +352,14 @@ func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size in if err != nil { return err } - defer upperFile.DecRef() + defer upperFile.DecRef(ctx) // Get a handle to the lower filesystem, which we will read from. lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) if err != nil { return err } - defer lowerFile.DecRef() + defer lowerFile.DecRef(ctx) // Use a buffer pool to minimize allocations. buf := copyUpBuffers.Get().([]byte) diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go index 91792d9fe..c7a11eec1 100644 --- a/pkg/sentry/fs/copy_up_test.go +++ b/pkg/sentry/fs/copy_up_test.go @@ -126,7 +126,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to create file %q: %v", name, err) } - defer f.DecRef() + defer f.DecRef(ctx) relname, _ := f.Dirent.FullName(lowerRoot) @@ -171,7 +171,7 @@ func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { if err != nil { t.Fatalf("failed to find %q: %v", f.name, err) } - defer d.DecRef() + defer d.DecRef(ctx) f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { diff --git a/pkg/sentry/fs/dev/net_tun.go b/pkg/sentry/fs/dev/net_tun.go index dc7ad075a..ec474e554 100644 --- a/pkg/sentry/fs/dev/net_tun.go +++ b/pkg/sentry/fs/dev/net_tun.go @@ -80,8 +80,8 @@ type netTunFileOperations struct { var _ fs.FileOperations = (*netTunFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (fops *netTunFileOperations) Release() { - fops.device.Release() +func (fops *netTunFileOperations) Release(ctx context.Context) { + fops.device.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go index 65be12175..a2f751068 100644 --- a/pkg/sentry/fs/dirent.go +++ b/pkg/sentry/fs/dirent.go @@ -325,7 +325,7 @@ func (d *Dirent) SyncAll(ctx context.Context) { for _, w := range d.children { if child := w.Get(); child != nil { child.(*Dirent).SyncAll(ctx) - child.DecRef() + child.DecRef(ctx) } } } @@ -451,7 +451,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // which don't hold a hard reference on their parent (their parent holds a // hard reference on them, and they contain virtually no state). But this is // good house-keeping. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -468,20 +468,20 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // their pins on the child. Inotify doesn't properly support filesystems that // revalidate dirents (since watches are lost on revalidation), but if we fail // to unpin the watches child will never be GCed. - cd.Inode.Watches.Unpin(cd) + cd.Inode.Watches.Unpin(ctx, cd) // This child needs to be revalidated, fallthrough to unhash it. Make sure // to not leak a reference from Get(). // // Note that previous lookups may still have a reference to this stale child; // this can't be helped, but we can ensure that *new* lookups are up-to-date. - child.DecRef() + child.DecRef(ctx) } // Either our weak reference expired or we need to revalidate it. Unhash child first, we're // about to replace it. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be @@ -512,12 +512,12 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // There are active references to the existing child, prefer it to the one we // retrieved from Lookup. Likely the Lookup happened very close to the insertion // of child, so considering one stale over the other is fairly arbitrary. - c.DecRef() + c.DecRef(ctx) // The child that was installed could be negative. if cd.IsNegative() { // If so, don't leak a reference and short circuit. - child.DecRef() + child.DecRef(ctx) return nil, syscall.ENOENT } @@ -531,7 +531,7 @@ func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnl // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child // we looked up. delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Give the looked up child a parent. We cannot kick out entries, since we just checked above @@ -587,7 +587,7 @@ func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { return false } // Child exists. - child.DecRef() + child.DecRef(ctx) return true } @@ -622,7 +622,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi } child := file.Dirent - d.finishCreate(child, name) + d.finishCreate(ctx, child, name) // Return the reference and the new file. When the last reference to // the file is dropped, file.Dirent may no longer be cached. @@ -631,7 +631,7 @@ func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags Fi // finishCreate validates the created file, adds it as a child of this dirent, // and notifies any watchers. -func (d *Dirent) finishCreate(child *Dirent, name string) { +func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { // Sanity check c, its name must be consistent. if child.name != name { panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) @@ -650,14 +650,14 @@ func (d *Dirent) finishCreate(child *Dirent, name string) { panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) @@ -686,17 +686,17 @@ func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, c panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) } // Don't leak a reference. - old.DecRef() + old.DecRef(ctx) // Drop d's reference. - old.DecRef() + old.DecRef(ctx) } // Unhash the negative Dirent, name needs to exist now. delete(d.children, name) // Finally drop the useless weak reference on the floor. - w.Drop() + w.Drop(ctx) } // Execute the create operation. @@ -756,7 +756,7 @@ func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data trans if e != nil { return e } - d.finishCreate(childDir, name) + d.finishCreate(ctx, childDir, name) return nil }) if err == syscall.EEXIST { @@ -901,7 +901,7 @@ func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, // references to children. // // Preconditions: d.mu must be held. -func (d *Dirent) flush() { +func (d *Dirent) flush(ctx context.Context) { expired := make(map[string]*refs.WeakRef) for n, w := range d.children { // Call flush recursively on each child before removing our @@ -912,7 +912,7 @@ func (d *Dirent) flush() { if !cd.IsNegative() { // Flush the child. cd.mu.Lock() - cd.flush() + cd.flush(ctx) cd.mu.Unlock() // Allow the file system to drop extra references on child. @@ -920,13 +920,13 @@ func (d *Dirent) flush() { } // Don't leak a reference. - child.DecRef() + child.DecRef(ctx) } // Check if the child dirent is closed, and mark it as expired if it is. // We must call w.Get() again here, since the child could have been closed // by the calls to flush() and cache.Remove() in the above if-block. if child := w.Get(); child != nil { - child.DecRef() + child.DecRef(ctx) } else { expired[n] = w } @@ -935,7 +935,7 @@ func (d *Dirent) flush() { // Remove expired entries. for n, w := range expired { delete(d.children, n) - w.Drop() + w.Drop(ctx) } } @@ -977,7 +977,7 @@ func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // Note that even though `d` is now hidden, it still holds a reference // to its parent. @@ -1002,13 +1002,13 @@ func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { if !ok { panic("mount must mount over an existing dirent") } - weakRef.Drop() + weakRef.Drop(ctx) // d is not reachable anymore, and hence not mounted anymore. d.mounted = false // Drop mount reference. - d.DecRef() + d.DecRef(ctx) return nil } @@ -1029,7 +1029,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // Remove cannot remove directories. if IsDir(child.Inode.StableAttr) { @@ -1055,7 +1055,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1067,7 +1067,7 @@ func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath // inode may have other links. If this was the last link, the events for the // watch removal will be queued by the inode destructor. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) return nil @@ -1100,7 +1100,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Child does not exist. return err } - defer child.DecRef() + defer child.DecRef(ctx) // RemoveDirectory can only remove directories. if !IsDir(child.Inode.StableAttr) { @@ -1121,7 +1121,7 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) atomic.StoreInt32(&child.deleted, 1) if w, ok := d.children[name]; ok { delete(d.children, name) - w.Drop() + w.Drop(ctx) } // Allow the file system to drop extra references on child. @@ -1130,14 +1130,14 @@ func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) // Finally, let inotify know the child is being unlinked. Drop any extra // refs from inotify to this child dirent. child.Inode.Watches.MarkUnlinked() - child.Inode.Watches.Unpin(child) + child.Inode.Watches.Unpin(ctx, child) d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) return nil } // destroy closes this node and all children. -func (d *Dirent) destroy() { +func (d *Dirent) destroy(ctx context.Context) { if d.IsNegative() { // Nothing to tear-down and no parent references to drop, since a negative // Dirent does not take a references on its parent, has no Inode and no children. @@ -1153,19 +1153,19 @@ func (d *Dirent) destroy() { if c.(*Dirent).IsNegative() { // The parent holds both weak and strong refs in the case of // negative dirents. - c.DecRef() + c.DecRef(ctx) } // Drop the reference we just acquired in WeakRef.Get. - c.DecRef() + c.DecRef(ctx) } - w.Drop() + w.Drop(ctx) } d.children = nil allDirents.remove(d) // Drop our reference to the Inode. - d.Inode.DecRef() + d.Inode.DecRef(ctx) // Allow the Dirent to be GC'ed after this point, since the Inode may still // be referenced after the Dirent is destroyed (for instance by filesystem @@ -1175,7 +1175,7 @@ func (d *Dirent) destroy() { // Drop the reference we have on our parent if we took one. renameMu doesn't need to be // held because d can't be reparented without any references to it left. if d.parent != nil { - d.parent.DecRef() + d.parent.DecRef(ctx) } } @@ -1201,14 +1201,14 @@ func (d *Dirent) TryIncRef() bool { // DecRef decreases the Dirent's refcount and drops its reference on its mount. // // DecRef implements RefCounter.DecRef with destructor d.destroy. -func (d *Dirent) DecRef() { +func (d *Dirent) DecRef(ctx context.Context) { if d.Inode != nil { // Keep mount around, since DecRef may destroy d.Inode. msrc := d.Inode.MountSource - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) msrc.DecDirentRefs() } else { - d.DecRefWithDestructor(d.destroy) + d.DecRefWithDestructor(ctx, d.destroy) } } @@ -1359,7 +1359,7 @@ func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error if err != nil { return err } - defer victim.DecRef() + defer victim.DecRef(ctx) return d.mayDelete(ctx, victim) } @@ -1411,7 +1411,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string if err != nil { return err } - defer renamed.DecRef() + defer renamed.DecRef(ctx) // Check that the renamed dirent is deletable. if err := oldParent.mayDelete(ctx, renamed); err != nil { @@ -1453,13 +1453,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that we can delete replaced. if err := newParent.mayDelete(ctx, replaced); err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } // Target should not be an ancestor of source. if oldParent.descendantOf(replaced) { - replaced.DecRef() + replaced.DecRef(ctx) // Note that Linux returns EINVAL if the source is an // ancestor of target, but ENOTEMPTY if the target is @@ -1470,7 +1470,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Check that replaced is not a mount point. if replaced.isMountPointLocked() { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EBUSY } @@ -1478,11 +1478,11 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string oldIsDir := IsDir(renamed.Inode.StableAttr) newIsDir := IsDir(replaced.Inode.StableAttr) if !newIsDir && oldIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.ENOTDIR } if !oldIsDir && newIsDir { - replaced.DecRef() + replaced.DecRef(ctx) return syscall.EISDIR } @@ -1493,13 +1493,13 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // open across renames is currently broken for multiple // reasons, so we flush all references on the replaced node and // its children. - replaced.Inode.Watches.Unpin(replaced) + replaced.Inode.Watches.Unpin(ctx, replaced) replaced.mu.Lock() - replaced.flush() + replaced.flush(ctx) replaced.mu.Unlock() // Done with replaced. - replaced.DecRef() + replaced.DecRef(ctx) } if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil { @@ -1513,14 +1513,14 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // can't destroy oldParent (and try to retake its lock) because // Rename's caller must be holding a reference. newParent.IncRef() - oldParent.DecRef() + oldParent.DecRef(ctx) } if w, ok := newParent.children[newName]; ok { - w.Drop() + w.Drop(ctx) delete(newParent.children, newName) } if w, ok := oldParent.children[oldName]; ok { - w.Drop() + w.Drop(ctx) delete(oldParent.children, oldName) } @@ -1551,7 +1551,7 @@ func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string // Same as replaced.flush above. renamed.mu.Lock() - renamed.flush() + renamed.flush(ctx) renamed.mu.Unlock() return nil diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 33de32c69..7d9dd717e 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -101,7 +102,7 @@ func (c *DirentCache) remove(d *Dirent) { panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) } c.list.Remove(d) - d.DecRef() + d.DecRef(context.Background()) c.currentSize-- if c.limit != nil { c.limit.dec() diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go index 98d69c6f2..176b894ba 100644 --- a/pkg/sentry/fs/dirent_refs_test.go +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -51,7 +51,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 1) } - d.DecRef() + d.DecRef(ctx) if got := root.ReadRefs(); got != 1 { t.Fatalf("root has a ref count of %d, want %d", got, 1) @@ -61,7 +61,7 @@ func TestWalkPositive(t *testing.T) { t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) } - root.flush() + root.flush(ctx) if got := len(root.children); got != 0 { t.Fatalf("root has %d children, want %d", got, 0) @@ -114,7 +114,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("child has a ref count of %d, want %d", got, 2) } - child.DecRef() + child.DecRef(ctx) if got := child.(*Dirent).ReadRefs(); got != 1 { t.Fatalf("child has a ref count of %d, want %d", got, 1) @@ -124,7 +124,7 @@ func TestWalkNegative(t *testing.T) { t.Fatalf("root has %d children, want %d", got, 1) } - root.DecRef() + root.DecRef(ctx) if got := root.ReadRefs(); got != 0 { t.Fatalf("root has a ref count of %d, want %d", got, 0) @@ -351,9 +351,9 @@ func TestRemoveExtraRefs(t *testing.T) { t.Fatalf("dirent has a ref count of %d, want %d", got, 1) } - d.DecRef() + d.DecRef(ctx) - test.root.flush() + test.root.flush(ctx) if got := len(test.root.children); got != 0 { t.Errorf("root has %d children, want %d", got, 0) @@ -403,8 +403,8 @@ func TestRenameExtraRefs(t *testing.T) { t.Fatalf("Rename got error %v, want nil", err) } - oldParent.flush() - newParent.flush() + oldParent.flush(ctx) + newParent.flush(ctx) // Expect to have only active references. if got := renamed.ReadRefs(); got != 1 { diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go index f623d6c0e..67a35f0b2 100644 --- a/pkg/sentry/fs/dirent_state.go +++ b/pkg/sentry/fs/dirent_state.go @@ -18,6 +18,7 @@ import ( "fmt" "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" ) @@ -48,7 +49,7 @@ func (d *Dirent) saveChildren() map[string]*Dirent { for name, w := range d.children { if rc := w.Get(); rc != nil { // Drop the reference count obtain in w.Get() - rc.DecRef() + rc.DecRef(context.Background()) cd := rc.(*Dirent) if cd.IsNegative() { diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go index 9fce177ad..b99199798 100644 --- a/pkg/sentry/fs/fdpipe/pipe.go +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -115,7 +115,7 @@ func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.Even } // Release implements fs.FileOperations.Release. -func (p *pipeOperations) Release() { +func (p *pipeOperations) Release(context.Context) { fdnotifier.RemoveFD(int32(p.file.FD())) p.file.Close() p.file = nil diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go index e556da48a..b9cec4b13 100644 --- a/pkg/sentry/fs/fdpipe/pipe_opener_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -182,7 +182,7 @@ func TestTryOpen(t *testing.T) { // Cleanup the state of the pipe, and remove the fd from the // fdnotifier. Sadly this needed to maintain the correctness // of other tests because the fdnotifier is global. - pipeOps.Release() + pipeOps.Release(ctx) } continue } @@ -191,7 +191,7 @@ func TestTryOpen(t *testing.T) { } if pipeOps != nil { // Same as above. - pipeOps.Release() + pipeOps.Release(ctx) } } } @@ -279,7 +279,7 @@ func TestPipeOpenUnblocksEventually(t *testing.T) { pipeOps, err := Open(ctx, opener, flags) if pipeOps != nil { // Same as TestTryOpen. - pipeOps.Release() + pipeOps.Release(ctx) } // Check that the partner opened the file successfully. @@ -325,7 +325,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { ctx := contexttest.Context(t) pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) if pipeOps != nil { - pipeOps.Release() + pipeOps.Release(ctx) t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) } if err != syserror.ErrWouldBlock { @@ -351,7 +351,7 @@ func TestCopiedReadAheadBuffer(t *testing.T) { if pipeOps == nil { t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) } - defer pipeOps.Release() + defer pipeOps.Release(ctx) if err != nil { t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) @@ -471,14 +471,14 @@ func TestPipeHangup(t *testing.T) { f := <-fdchan if f < 0 { t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) - pipeOps.Release() + pipeOps.Release(ctx) continue } if test.hangupSelf { // Hangup self and assert that our partner got the expected hangup // error. - pipeOps.Release() + pipeOps.Release(ctx) if test.flags.Read { // Partner is writer. @@ -490,7 +490,7 @@ func TestPipeHangup(t *testing.T) { } else { // Hangup our partner and expect us to get the hangup error. syscall.Close(f) - defer pipeOps.Release() + defer pipeOps.Release(ctx) if test.flags.Read { assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go index a0082ecca..1c9e82562 100644 --- a/pkg/sentry/fs/fdpipe/pipe_test.go +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -98,10 +98,11 @@ func TestNewPipe(t *testing.T) { } f := fd.New(gfd) - p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, f, test.readAheadBuffer) if p != nil { // This is necessary to remove the fd from the global fd notifier. - defer p.Release() + defer p.Release(ctx) } else { // If there is no p to DecRef on, because newPipeOperations failed, then the // file still needs to be closed. @@ -153,13 +154,14 @@ func TestPipeDestruction(t *testing.T) { syscall.Close(fds[1]) // Test the read end, but it doesn't really matter which. - p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, f, nil) if err != nil { f.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } // Drop our only reference, which should trigger the destructor. - p.Release() + p.Release(ctx) if fdnotifier.HasFD(int32(fds[0])) { t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) @@ -282,7 +284,7 @@ func TestPipeRequest(t *testing.T) { if err != nil { t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) file := fs.NewFile(ctx, fs.NewDirent(ctx, inode, "pipe"), fs.FileFlags{Read: true}, p) @@ -334,7 +336,7 @@ func TestPipeReadAheadBuffer(t *testing.T) { rfile.Close() t.Fatalf("newPipeOperations got error %v, want nil", err) } - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -380,7 +382,7 @@ func TestPipeReadsAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests will // likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, @@ -448,7 +450,7 @@ func TestPipeWritesAccumulate(t *testing.T) { } // Don't forget to remove the fd from the fd notifier. Otherwise other tests // will likely be borked, because it's global :( - defer p.Release() + defer p.Release(ctx) inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ Type: fs.Pipe, diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go index ca41520b4..72ea70fcf 100644 --- a/pkg/sentry/fs/file.go +++ b/pkg/sentry/fs/file.go @@ -142,17 +142,17 @@ func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOper } // DecRef destroys the File when it is no longer referenced. -func (f *File) DecRef() { - f.DecRefWithDestructor(func() { +func (f *File) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, func(context.Context) { // Drop BSD style locks. lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} f.Dirent.Inode.LockCtx.BSD.UnlockRegion(f, lockRng) // Release resources held by the FileOperations. - f.FileOperations.Release() + f.FileOperations.Release(ctx) // Release a reference on the Dirent. - f.Dirent.DecRef() + f.Dirent.DecRef(ctx) // Only unregister if we are currently registered. There is nothing // to register if f.async is nil (this happens when async mode is @@ -460,7 +460,7 @@ func (f *File) UnstableAttr(ctx context.Context) (UnstableAttr, error) { func (f *File) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := f.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go index f5537411e..305c0f840 100644 --- a/pkg/sentry/fs/file_operations.go +++ b/pkg/sentry/fs/file_operations.go @@ -67,7 +67,7 @@ type SpliceOpts struct { // - File.Flags(): This value may change during the operation. type FileOperations interface { // Release release resources held by FileOperations. - Release() + Release(ctx context.Context) // Waitable defines how this File can be waited on for read and // write readiness. diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go index dcc1df38f..9dc58d5ff 100644 --- a/pkg/sentry/fs/file_overlay.go +++ b/pkg/sentry/fs/file_overlay.go @@ -54,7 +54,7 @@ func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, err // Drop the extra reference on the Dirent. Now there's only one reference // on the dirent, either owned by f (if non-nil), or the Dirent is about // to be destroyed (if GetFile failed). - dirent.DecRef() + dirent.DecRef(ctx) return f, err } @@ -89,12 +89,12 @@ type overlayFileOperations struct { } // Release implements FileOperations.Release. -func (f *overlayFileOperations) Release() { +func (f *overlayFileOperations) Release(ctx context.Context) { if f.upper != nil { - f.upper.DecRef() + f.upper.DecRef(ctx) } if f.lower != nil { - f.lower.DecRef() + f.lower.DecRef(ctx) } } @@ -164,7 +164,7 @@ func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence See func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &DirCtx{ @@ -497,7 +497,7 @@ func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { if err != nil { return nil, err } - defer dir.DecRef() + defer dir.DecRef(ctx) // Use a stub serializer to read the entries into memory. stubSerializer := &CollectEntriesSerializer{} @@ -521,10 +521,10 @@ type overlayMappingIdentity struct { } // DecRef implements AtomicRefCount.DecRef. -func (omi *overlayMappingIdentity) DecRef() { - omi.AtomicRefCount.DecRefWithDestructor(func() { - omi.overlayFile.DecRef() - omi.id.DecRef() +func (omi *overlayMappingIdentity) DecRef(ctx context.Context) { + omi.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { + omi.overlayFile.DecRef(ctx) + omi.id.DecRef(ctx) }) } @@ -544,7 +544,7 @@ func (omi *overlayMappingIdentity) InodeID() uint64 { func (omi *overlayMappingIdentity) MappedName(ctx context.Context) string { root := RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } name, _ := omi.overlayFile.Dirent.FullName(root) return name diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go index 08695391c..dc9efa5df 100644 --- a/pkg/sentry/fs/fsutil/file.go +++ b/pkg/sentry/fs/fsutil/file.go @@ -31,7 +31,7 @@ import ( type FileNoopRelease struct{} // Release is a no-op. -func (FileNoopRelease) Release() {} +func (FileNoopRelease) Release(context.Context) {} // SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor // is not nil and the seek was on a directory, the cursor will be updated. @@ -296,7 +296,7 @@ func (sdfo *StaticDirFileOperations) IterateDir(ctx context.Context, d *fs.Diren func (sdfo *StaticDirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go index b2fcab127..c0bc63a32 100644 --- a/pkg/sentry/fs/gofer/file.go +++ b/pkg/sentry/fs/gofer/file.go @@ -114,7 +114,7 @@ func NewFile(ctx context.Context, dirent *fs.Dirent, name string, flags fs.FileF } // Release implements fs.FileOpeations.Release. -func (f *fileOperations) Release() { +func (f *fileOperations) Release(context.Context) { f.handles.DecRef() } @@ -122,7 +122,7 @@ func (f *fileOperations) Release() { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go index 2df2fe889..326fed954 100644 --- a/pkg/sentry/fs/gofer/gofer_test.go +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -232,7 +232,7 @@ func TestRevalidation(t *testing.T) { // We must release the dirent, of the test will fail // with a reference leak. This is tracked by p9test. - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Walk again. Depending on the cache policy, we may // get a new dirent. @@ -246,7 +246,7 @@ func TestRevalidation(t *testing.T) { if !test.preModificationWantReload && dirent != newDirent { t.Errorf("Lookup with cachePolicy=%s got new dirent %+v, wanted old dirent %+v", test.cachePolicy, newDirent, dirent) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Modify the underlying mocked file's modification // time for the next walk that occurs. @@ -287,7 +287,7 @@ func TestRevalidation(t *testing.T) { if test.postModificationWantUpdatedAttrs && gotModTimeSeconds != nowSeconds { t.Fatalf("Lookup with cachePolicy=%s got new modification time %v, wanted %v", test.cachePolicy, gotModTimeSeconds, nowSeconds) } - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. // Remove the file from the remote fs, subsequent walks // should now fail to find anything. @@ -303,7 +303,7 @@ func TestRevalidation(t *testing.T) { t.Errorf("Lookup with cachePolicy=%s got new dirent and error %v, wanted old dirent and nil error", test.cachePolicy, err) } if err == nil { - newDirent.DecRef() // See above. + newDirent.DecRef(ctx) // See above. } }) } diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go index fc14249be..f324dbf26 100644 --- a/pkg/sentry/fs/gofer/handles.go +++ b/pkg/sentry/fs/gofer/handles.go @@ -47,7 +47,8 @@ type handles struct { // DecRef drops a reference on handles. func (h *handles) DecRef() { - h.DecRefWithDestructor(func() { + ctx := context.Background() + h.DecRefWithDestructor(ctx, func(context.Context) { if h.Host != nil { if h.isHostBorrowed { h.Host.Release() @@ -57,7 +58,7 @@ func (h *handles) DecRef() { } } } - if err := h.File.close(context.Background()); err != nil { + if err := h.File.close(ctx); err != nil { log.Warningf("error closing p9 file: %v", err) } }) diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go index 51d7368a1..3a225fd39 100644 --- a/pkg/sentry/fs/gofer/inode.go +++ b/pkg/sentry/fs/gofer/inode.go @@ -441,8 +441,9 @@ func (i *inodeOperations) Release(ctx context.Context) { // asynchronously. // // We use AsyncWithContext to avoid needing to allocate an extra - // anonymous function on the heap. - fs.AsyncWithContext(ctx, i.fileState.Release) + // anonymous function on the heap. We must use background context + // because the async work cannot happen on the task context. + fs.AsyncWithContext(context.Background(), i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go index cf9800100..3c66dc3c2 100644 --- a/pkg/sentry/fs/gofer/path.go +++ b/pkg/sentry/fs/gofer/path.go @@ -168,7 +168,7 @@ func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string // Construct the positive Dirent. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, dir.MountSource, sattr), name) - defer d.DecRef() + defer d.DecRef(ctx) // Construct the new file, caching the handles if allowed. h := handles{ @@ -371,7 +371,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string // Find out if file being deleted is a socket or pipe that needs to be // removed from endpoint map. if d, err := i.Lookup(ctx, dir, name); err == nil { - defer d.DecRef() + defer d.DecRef(ctx) if fs.IsSocket(d.Inode.StableAttr) || fs.IsPipe(d.Inode.StableAttr) { switch iops := d.Inode.InodeOperations.(type) { @@ -392,7 +392,7 @@ func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string return err } if key != nil { - i.session().overrides.remove(*key) + i.session().overrides.remove(ctx, *key) } i.touchModificationAndStatusChangeTime(ctx, dir) diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index b5efc86f2..7cf3522ff 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -89,10 +89,10 @@ func (e *overrideMaps) addPipe(key device.MultiDeviceKey, d *fs.Dirent, inode *f // remove deletes the key from the maps. // // Precondition: maps must have been locked with 'lock'. -func (e *overrideMaps) remove(key device.MultiDeviceKey) { +func (e *overrideMaps) remove(ctx context.Context, key device.MultiDeviceKey) { endpoint := e.keyMap[key] delete(e.keyMap, key) - endpoint.dirent.DecRef() + endpoint.dirent.DecRef(ctx) } // lock blocks other addition and removal operations from happening while @@ -197,7 +197,7 @@ type session struct { } // Destroy tears down the session. -func (s *session) Destroy() { +func (s *session) Destroy(ctx context.Context) { s.client.Close() } @@ -329,7 +329,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { // Drop our reference on the session, it needs to be torn down. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -340,7 +340,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF ctx.UninterruptibleSleepFinish(false) if err != nil { // Same as above. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -348,7 +348,7 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF if err != nil { s.attach.close(ctx) // Same as above, but after we execute the Close request. - s.DecRef() + s.DecRef(ctx) return nil, err } @@ -393,13 +393,13 @@ func (s *session) fillKeyMap(ctx context.Context) error { // fillPathMap populates paths for overrides from dirents in direntMap // before save. -func (s *session) fillPathMap() error { +func (s *session) fillPathMap(ctx context.Context) error { unlock := s.overrides.lock() defer unlock() for _, endpoint := range s.overrides.keyMap { mountRoot := endpoint.dirent.MountRoot() - defer mountRoot.DecRef() + defer mountRoot.DecRef(ctx) dirPath, _ := endpoint.dirent.FullName(mountRoot) if dirPath == "" { return fmt.Errorf("error getting path from dirent") diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go index 2d398b753..48b423dd8 100644 --- a/pkg/sentry/fs/gofer/session_state.go +++ b/pkg/sentry/fs/gofer/session_state.go @@ -26,7 +26,8 @@ import ( // beforeSave is invoked by stateify. func (s *session) beforeSave() { if s.overrides != nil { - if err := s.fillPathMap(); err != nil { + ctx := &dummyClockContext{context.Background()} + if err := s.fillPathMap(ctx); err != nil { panic("failed to save paths to override map before saving" + err.Error()) } } diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go index 40f2c1cad..8a1c69ac2 100644 --- a/pkg/sentry/fs/gofer/socket.go +++ b/pkg/sentry/fs/gofer/socket.go @@ -134,14 +134,14 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.inode.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.inode.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go index 39299b7e4..0d8d36afa 100644 --- a/pkg/sentry/fs/host/control.go +++ b/pkg/sentry/fs/host/control.go @@ -57,7 +57,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go index 3e48b8b2c..86d1a87f0 100644 --- a/pkg/sentry/fs/host/file.go +++ b/pkg/sentry/fs/host/file.go @@ -110,7 +110,7 @@ func newFileFromDonatedFD(ctx context.Context, donated int, saveable, isTTY bool name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) dirent := fs.NewDirent(ctx, inode, name) - defer dirent.DecRef() + defer dirent.DecRef(ctx) if isTTY { return newTTYFile(ctx, dirent, flags, iops), nil @@ -169,7 +169,7 @@ func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go index c507f57eb..41a23b5da 100644 --- a/pkg/sentry/fs/host/inode_test.go +++ b/pkg/sentry/fs/host/inode_test.go @@ -36,7 +36,7 @@ func TestCloseFD(t *testing.T) { if err != nil { t.Fatalf("Failed to create File: %v", err) } - file.DecRef() + file.DecRef(ctx) s := make([]byte, 10) if c, err := syscall.Read(p[0], s); c != 0 || err != nil { diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go index cfb089e43..a2f3d5918 100644 --- a/pkg/sentry/fs/host/socket.go +++ b/pkg/sentry/fs/host/socket.go @@ -194,7 +194,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error) } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -271,7 +271,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -318,7 +318,7 @@ func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek } // close releases all resources related to the endpoint. -func (c *ConnectedEndpoint) close() { +func (c *ConnectedEndpoint) close(context.Context) { fdnotifier.RemoveFD(int32(c.file.FD())) c.file.Close() c.file = nil @@ -374,8 +374,8 @@ func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 { } // Release implements transport.ConnectedEndpoint.Release and transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(c.close) +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, c.close) } // CloseUnread implements transport.ConnectedEndpoint.CloseUnread. diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go index affdbcacb..9d58ea448 100644 --- a/pkg/sentry/fs/host/socket_test.go +++ b/pkg/sentry/fs/host/socket_test.go @@ -67,11 +67,12 @@ func TestSocketIsBlocking(t *testing.T) { if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { t.Fatalf("Expected socket %v to be blocking", pair[1]) } - sock, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sock, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) failed => %v", pair[0], err) } - defer sock.DecRef() + defer sock.DecRef(ctx) // Test that the socket now is non-blocking. if fl, err = getFl(pair[0]); err != nil { t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) @@ -93,11 +94,12 @@ func TestSocketWritev(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) buf := []byte("hello world\n") n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) if err != nil { @@ -115,11 +117,12 @@ func TestSocketWritevLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - socket, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + socket, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer socket.DecRef() + defer socket.DecRef(ctx) n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) if err != nil { t.Fatalf("socket writev failed: %v", err) @@ -136,11 +139,12 @@ func TestSocketSendMsgLen0(t *testing.T) { if err != nil { t.Fatalf("host socket creation failed: %v", err) } - sfile, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile.DecRef() + defer sfile.DecRef(ctx) s := sfile.FileOperations.(socket.Socket) n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, false, ktime.Time{}, socket.ControlMessages{}) @@ -158,18 +162,19 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + ctx := contexttest.Context(t) + sfile1, err := newSocket(ctx, pair[0], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[0], err) } - defer sfile1.DecRef() + defer sfile1.DecRef(ctx) socket1 := sfile1.FileOperations.(socket.Socket) - sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + sfile2, err := newSocket(ctx, pair[1], false) if err != nil { t.Fatalf("newSocket(%v) => %v", pair[1], err) } - defer sfile2.DecRef() + defer sfile2.DecRef(ctx) socket2 := sfile2.FileOperations.(socket.Socket) // Socketpairs can not be listened to. @@ -185,11 +190,11 @@ func TestListen(t *testing.T) { if err != nil { t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) } - sfile3, err := newSocket(contexttest.Context(t), sock, false) + sfile3, err := newSocket(ctx, sock, false) if err != nil { t.Fatalf("newSocket(%v) => %v", sock, err) } - defer sfile3.DecRef() + defer sfile3.DecRef(ctx) socket3 := sfile3.FileOperations.(socket.Socket) // This socket is not bound so we can't listen on it. @@ -237,9 +242,10 @@ func TestRelease(t *testing.T) { } c := &ConnectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} want := &ConnectedEndpoint{queue: c.queue} - want.ref.DecRef() + ctx := contexttest.Context(t) + want.ref.DecRef(ctx) fdnotifier.AddFD(int32(c.file.FD()), nil) - c.Release() + c.Release(ctx) if !reflect.DeepEqual(c, want) { t.Errorf("got = %#v, want = %#v", c, want) } diff --git a/pkg/sentry/fs/host/tty.go b/pkg/sentry/fs/host/tty.go index 82a02fcb2..b5229098c 100644 --- a/pkg/sentry/fs/host/tty.go +++ b/pkg/sentry/fs/host/tty.go @@ -113,12 +113,12 @@ func (t *TTYFileOperations) Write(ctx context.Context, file *fs.File, src userme } // Release implements fs.FileOperations.Release. -func (t *TTYFileOperations) Release() { +func (t *TTYFileOperations) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileOperations.Release() + t.fileOperations.Release(ctx) } // Ioctl implements fs.FileOperations.Ioctl. diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go index ce397a5e3..c143f4ce2 100644 --- a/pkg/sentry/fs/host/wait_test.go +++ b/pkg/sentry/fs/host/wait_test.go @@ -39,7 +39,7 @@ func TestWait(t *testing.T) { t.Fatalf("NewFile failed: %v", err) } - defer file.DecRef() + defer file.DecRef(ctx) r := file.Readiness(waiter.EventIn) if r != 0 { diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go index a34fbc946..b79cd9877 100644 --- a/pkg/sentry/fs/inode.go +++ b/pkg/sentry/fs/inode.go @@ -96,13 +96,12 @@ func NewInode(ctx context.Context, iops InodeOperations, msrc *MountSource, satt } // DecRef drops a reference on the Inode. -func (i *Inode) DecRef() { - i.DecRefWithDestructor(i.destroy) +func (i *Inode) DecRef(ctx context.Context) { + i.DecRefWithDestructor(ctx, i.destroy) } // destroy releases the Inode and releases the msrc reference taken. -func (i *Inode) destroy() { - ctx := context.Background() +func (i *Inode) destroy(ctx context.Context) { if err := i.WriteOut(ctx); err != nil { // FIXME(b/65209558): Mark as warning again once noatime is // properly supported. @@ -122,12 +121,12 @@ func (i *Inode) destroy() { i.Watches.targetDestroyed() if i.overlay != nil { - i.overlay.release() + i.overlay.release(ctx) } else { i.InodeOperations.Release(ctx) } - i.MountSource.DecRef() + i.MountSource.DecRef(ctx) } // Mappable calls i.InodeOperations.Mappable. diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go index efd3c962b..9911a00c2 100644 --- a/pkg/sentry/fs/inode_inotify.go +++ b/pkg/sentry/fs/inode_inotify.go @@ -17,6 +17,7 @@ package fs import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -136,11 +137,11 @@ func (w *Watches) Notify(name string, events, cookie uint32) { } // Unpin unpins dirent from all watches in this set. -func (w *Watches) Unpin(d *Dirent) { +func (w *Watches) Unpin(ctx context.Context, d *Dirent) { w.mu.RLock() defer w.mu.RUnlock() for _, watch := range w.ws { - watch.Unpin(d) + watch.Unpin(ctx, d) } } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 537c8d257..dc2e353d9 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -85,7 +85,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name upperInode = child.Inode upperInode.IncRef() } - child.DecRef() + child.DecRef(ctx) } // Are we done? @@ -108,7 +108,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name entry, err := newOverlayEntry(ctx, upperInode, nil, false) if err != nil { // Don't leak resources. - upperInode.DecRef() + upperInode.DecRef(ctx) parent.copyMu.RUnlock() return nil, false, err } @@ -129,7 +129,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name if err != nil && err != syserror.ENOENT { // Don't leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -152,7 +152,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name } } } - child.DecRef() + child.DecRef(ctx) } } @@ -183,7 +183,7 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // unnecessary because we don't need to copy-up and we will always // operate (e.g. read/write) on the upper Inode. if !IsDir(upperInode.StableAttr) { - lowerInode.DecRef() + lowerInode.DecRef(ctx) lowerInode = nil } } @@ -194,10 +194,10 @@ func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name // Well, not quite, we failed at the last moment, how depressing. // Be sure not to leak resources. if upperInode != nil { - upperInode.DecRef() + upperInode.DecRef(ctx) } if lowerInode != nil { - lowerInode.DecRef() + lowerInode.DecRef(ctx) } parent.copyMu.RUnlock() return nil, false, err @@ -248,7 +248,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // user) will clobber the real path for the underlying Inode. upperFile.Dirent.Inode.IncRef() upperDirent := NewTransientDirent(upperFile.Dirent.Inode) - upperFile.Dirent.DecRef() + upperFile.Dirent.DecRef(ctx) upperFile.Dirent = upperDirent // Create the overlay inode and dirent. We need this to construct the @@ -259,7 +259,7 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st // The overlay file created below with NewFile will take a reference on // the overlayDirent, and it should be the only thing holding a // reference at the time of creation, so we must drop this reference. - defer overlayDirent.DecRef() + defer overlayDirent.DecRef(ctx) // Create a new overlay file that wraps the upper file. flags.Pread = upperFile.Flags().Pread @@ -399,7 +399,7 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena if !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { children, err := readdirOne(ctx, replaced) if err != nil { - replaced.DecRef() + replaced.DecRef(ctx) return err } @@ -407,12 +407,12 @@ func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, rena // included among the returned children, so we don't // need to bother checking for them. if len(children) > 0 { - replaced.DecRef() + replaced.DecRef(ctx) return syserror.ENOTEMPTY } } - replaced.DecRef() + replaced.DecRef(ctx) } } @@ -455,12 +455,12 @@ func overlayBind(ctx context.Context, o *overlayEntry, parent *Dirent, name stri // Grab the inode and drop the dirent, we don't need it. inode := d.Inode inode.IncRef() - d.DecRef() + d.DecRef(ctx) // Create a new overlay entry and dirent for the socket. entry, err := newOverlayEntry(ctx, inode, nil, false) if err != nil { - inode.DecRef() + inode.DecRef(ctx) return nil, err } // Use the parent's MountSource, since that corresponds to the overlay, @@ -672,7 +672,7 @@ func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { // ground and claim that jumping around the filesystem like this // is not supported. name, _ := dirent.FullName(nil) - dirent.DecRef() + dirent.DecRef(ctx) // Claim that the path is not accessible. err = syserror.EACCES diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go index 389c219d6..aa9851b26 100644 --- a/pkg/sentry/fs/inode_overlay_test.go +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -316,7 +316,7 @@ func TestCacheFlush(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = &rootContext{ Context: ctx, @@ -345,7 +345,7 @@ func TestCacheFlush(t *testing.T) { } // Drop the file reference. - file.DecRef() + file.DecRef(ctx) // Dirent should have 2 refs left. if got, want := dirent.ReadRefs(), 2; int(got) != want { @@ -361,7 +361,7 @@ func TestCacheFlush(t *testing.T) { } // Drop our ref. - dirent.DecRef() + dirent.DecRef(ctx) // We should be back to zero refs. if got, want := dirent.ReadRefs(), 0; int(got) != want { @@ -398,7 +398,7 @@ func (d *dir) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags if err != nil { return nil, err } - defer file.DecRef() + defer file.DecRef(ctx) // Wrap the file's FileOperations in a dirFile. fops := &dirFile{ FileOperations: file.FileOperations, diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go index e3a715c1f..c5c07d564 100644 --- a/pkg/sentry/fs/inotify.go +++ b/pkg/sentry/fs/inotify.go @@ -80,7 +80,7 @@ func NewInotify(ctx context.Context) *Inotify { // Release implements FileOperations.Release. Release removes all watches and // frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.targetDestroyed from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all @@ -93,7 +93,7 @@ func (i *Inotify) Release() { // the owner's destructor. w.target.Watches.Remove(w.ID()) // Don't leak any references to the target, held by pins in the watch. - w.destroy() + w.destroy(ctx) } } @@ -321,7 +321,7 @@ func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // // RmWatch looks up an inotify watch for the given 'wd' and configures the // target dirent to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -346,7 +346,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) // Remove all pins. - watch.destroy() + watch.destroy(ctx) return nil } diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go index 900cba3ca..605423d22 100644 --- a/pkg/sentry/fs/inotify_watch.go +++ b/pkg/sentry/fs/inotify_watch.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" ) @@ -105,12 +106,12 @@ func (w *Watch) Pin(d *Dirent) { // Unpin drops any extra refs held on dirent due to a previous Pin // call. Calling Unpin multiple times for the same dirent, or on a dirent // without a corresponding Pin call is a no-op. -func (w *Watch) Unpin(d *Dirent) { +func (w *Watch) Unpin(ctx context.Context, d *Dirent) { w.mu.Lock() defer w.mu.Unlock() if w.pins[d] { delete(w.pins, d) - d.DecRef() + d.DecRef(ctx) } } @@ -125,11 +126,11 @@ func (w *Watch) TargetDestroyed() { // this watch. Destroy does not cause any new events to be generated. The caller // is responsible for ensuring there are no outstanding references to this // watch. -func (w *Watch) destroy() { +func (w *Watch) destroy(ctx context.Context) { w.mu.Lock() defer w.mu.Unlock() for d := range w.pins { - d.DecRef() + d.DecRef(ctx) } w.pins = nil } diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 37bae6810..ee69b10e8 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -51,7 +51,7 @@ type MountSourceOperations interface { DirentOperations // Destroy destroys the MountSource. - Destroy() + Destroy(ctx context.Context) // Below are MountSourceOperations that do not conform to Linux. @@ -165,16 +165,16 @@ func (msrc *MountSource) DecDirentRefs() { } } -func (msrc *MountSource) destroy() { +func (msrc *MountSource) destroy(ctx context.Context) { if c := msrc.DirentRefs(); c != 0 { panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) } - msrc.MountSourceOperations.Destroy() + msrc.MountSourceOperations.Destroy(ctx) } // DecRef drops a reference on the MountSource. -func (msrc *MountSource) DecRef() { - msrc.DecRefWithDestructor(msrc.destroy) +func (msrc *MountSource) DecRef(ctx context.Context) { + msrc.DecRefWithDestructor(ctx, msrc.destroy) } // FlushDirentRefs drops all references held by the MountSource on Dirents. @@ -264,7 +264,7 @@ func (*SimpleMountSourceOperations) ResetInodeMappings() {} func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (*SimpleMountSourceOperations) Destroy() {} +func (*SimpleMountSourceOperations) Destroy(context.Context) {} // Info defines attributes of a filesystem. type Info struct { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 78e35b1e6..7badc75d6 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -115,9 +115,9 @@ func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path strin } // Destroy drops references on the upper and lower MountSource. -func (o *overlayMountSourceOperations) Destroy() { - o.upper.DecRef() - o.lower.DecRef() +func (o *overlayMountSourceOperations) Destroy(ctx context.Context) { + o.upper.DecRef(ctx) + o.lower.DecRef(ctx) } // type overlayFilesystem is the filesystem for overlay mounts. diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go index a3d10770b..6c296f5d0 100644 --- a/pkg/sentry/fs/mount_test.go +++ b/pkg/sentry/fs/mount_test.go @@ -18,6 +18,7 @@ import ( "fmt" "testing" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/contexttest" ) @@ -32,13 +33,13 @@ func cacheReallyContains(cache *DirentCache, d *Dirent) bool { return false } -func mountPathsAre(root *Dirent, got []*Mount, want ...string) error { +func mountPathsAre(ctx context.Context, root *Dirent, got []*Mount, want ...string) error { gotPaths := make(map[string]struct{}, len(got)) gotStr := make([]string, len(got)) for i, g := range got { if groot := g.Root(); groot != nil { name, _ := groot.FullName(root) - groot.DecRef() + groot.DecRef(ctx) gotStr[i] = name gotPaths[name] = struct{}{} } @@ -69,7 +70,7 @@ func TestMountSourceOnlyCachedOnce(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Get a child of the root which we will mount over. Note that the // MockInodeOperations causes Walk to always succeed. @@ -125,7 +126,7 @@ func TestAllMountsUnder(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -150,14 +151,14 @@ func TestAllMountsUnder(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } // mm root should contain all submounts (and does not include the root mount). rootMnt := mm.FindMount(rootDirent) submounts := mm.AllMountsUnder(rootMnt) allPaths := append(paths, "/") - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } @@ -181,9 +182,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/foo", err) } - defer d.DecRef() + defer d.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(d)) - if err := mountPathsAre(rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/foo", "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil { t.Error(err) } @@ -193,9 +194,9 @@ func TestAllMountsUnder(t *testing.T) { if err != nil { t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err) } - defer waldo.DecRef() + defer waldo.DecRef(ctx) submounts = mm.AllMountsUnder(mm.FindMount(waldo)) - if err := mountPathsAre(rootDirent, submounts, "/waldo"); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, "/waldo"); err != nil { t.Error(err) } } @@ -212,7 +213,7 @@ func TestUnmount(t *testing.T) { t.Fatalf("NewMountNamespace failed: %v", err) } rootDirent := mm.Root() - defer rootDirent.DecRef() + defer rootDirent.DecRef(ctx) // Add mounts at the following paths: paths := []string{ @@ -240,7 +241,7 @@ func TestUnmount(t *testing.T) { if err := mm.Mount(ctx, d, submountInode); err != nil { t.Fatalf("could not mount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) } allPaths := make([]string, len(paths)+1) @@ -259,13 +260,13 @@ func TestUnmount(t *testing.T) { if err := mm.Unmount(ctx, d, false); err != nil { t.Fatalf("could not unmount at %q: %v", p, err) } - d.DecRef() + d.DecRef(ctx) // Remove the path that has been unmounted and the check that the remaining // mounts are still there. allPaths = allPaths[:len(allPaths)-1] submounts := mm.AllMountsUnder(rootMnt) - if err := mountPathsAre(rootDirent, submounts, allPaths...); err != nil { + if err := mountPathsAre(ctx, rootDirent, submounts, allPaths...); err != nil { t.Error(err) } } diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index 3f2bd0e87..d741c4339 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -234,7 +234,7 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() { // After destroy is called, the MountNamespace may continue to be referenced (for // example via /proc/mounts), but should free all resources and shouldn't have // Find* methods called. -func (mns *MountNamespace) destroy() { +func (mns *MountNamespace) destroy(ctx context.Context) { mns.mu.Lock() defer mns.mu.Unlock() @@ -247,13 +247,13 @@ func (mns *MountNamespace) destroy() { for _, mp := range mns.mounts { // Drop the mount reference on all mounted dirents. for ; mp != nil; mp = mp.previous { - mp.root.DecRef() + mp.root.DecRef(ctx) } } mns.mounts = nil // Drop reference on the root. - mns.root.DecRef() + mns.root.DecRef(ctx) // Ensure that root cannot be accessed via this MountNamespace any // more. @@ -265,8 +265,8 @@ func (mns *MountNamespace) destroy() { } // DecRef implements RefCounter.DecRef with destructor mns.destroy. -func (mns *MountNamespace) DecRef() { - mns.DecRefWithDestructor(mns.destroy) +func (mns *MountNamespace) DecRef(ctx context.Context) { + mns.DecRefWithDestructor(ctx, mns.destroy) } // withMountLocked prevents further walks to `node`, because `node` is about to @@ -312,7 +312,7 @@ func (mns *MountNamespace) Mount(ctx context.Context, mountPoint *Dirent, inode if err != nil { return err } - defer replacement.DecRef() + defer replacement.DecRef(ctx) // Set the mount's root dirent and id. parentMnt := mns.findMountLocked(mountPoint) @@ -394,7 +394,7 @@ func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly panic(fmt.Sprintf("Last mount in the chain must be a undo mount: %+v", prev)) } // Drop mount reference taken at the end of MountNamespace.Mount. - prev.root.DecRef() + prev.root.DecRef(ctx) } else { mns.mounts[prev.root] = prev } @@ -496,11 +496,11 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // non-directory root is hopeless. if current != root { if !IsDir(current.Inode.StableAttr) { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, syserror.ENOTDIR } if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { - current.DecRef() // Drop reference from above. + current.DecRef(ctx) // Drop reference from above. return nil, err } } @@ -511,12 +511,12 @@ func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path // Allow failed walks to cache the dirent, because no // children will acquire a reference at the end. current.maybeExtendReference() - current.DecRef() + current.DecRef(ctx) return nil, err } // Drop old reference. - current.DecRef() + current.DecRef(ctx) if remainder != "" { // Ensure it's resolved, unless it's the last level. @@ -570,11 +570,11 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema case nil: // Make sure we didn't exhaust the traversal budget. if *remainingTraversals == 0 { - target.DecRef() + target.DecRef(ctx) return nil, syscall.ELOOP } - node.DecRef() // Drop the original reference. + node.DecRef(ctx) // Drop the original reference. return target, nil case syscall.ENOLINK: @@ -582,7 +582,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return node, nil case ErrResolveViaReadlink: - defer node.DecRef() // See above. + defer node.DecRef(ctx) // See above. // First, check if we should traverse. if *remainingTraversals == 0 { @@ -608,7 +608,7 @@ func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, rema return d, err default: - node.DecRef() // Drop for err; see above. + node.DecRef(ctx) // Drop for err; see above. // Propagate the error. return nil, err diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go index a69b41468..975d6cbc9 100644 --- a/pkg/sentry/fs/mounts_test.go +++ b/pkg/sentry/fs/mounts_test.go @@ -51,7 +51,7 @@ func TestFindLink(t *testing.T) { } root := mm.Root() - defer root.DecRef() + defer root.DecRef(ctx) foo, err := root.Walk(ctx, root, "foo") if err != nil { t.Fatalf("Error walking to foo: %v", err) diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go index a8ae7d81d..35013a21b 100644 --- a/pkg/sentry/fs/overlay.go +++ b/pkg/sentry/fs/overlay.go @@ -107,7 +107,7 @@ func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags Mount msrc := newOverlayMountSource(ctx, upper.MountSource, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, upper, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } @@ -130,7 +130,7 @@ func NewOverlayRootFile(ctx context.Context, upperMS *MountSource, lower *Inode, msrc := newOverlayMountSource(ctx, upperMS, lower.MountSource, flags) overlay, err := newOverlayEntry(ctx, nil, lower, true) if err != nil { - msrc.DecRef() + msrc.DecRef(ctx) return nil, err } return newOverlayInode(ctx, overlay, msrc), nil @@ -230,16 +230,16 @@ func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExist }, nil } -func (o *overlayEntry) release() { +func (o *overlayEntry) release(ctx context.Context) { // We drop a reference on upper and lower file system Inodes // rather than releasing them, because in-memory filesystems // may hold an extra reference to these Inodes so that they // stay in memory. if o.upper != nil { - o.upper.DecRef() + o.upper.DecRef(ctx) } if o.lower != nil { - o.lower.DecRef() + o.lower.DecRef(ctx) } } diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go index 35972e23c..45523adf8 100644 --- a/pkg/sentry/fs/proc/fds.go +++ b/pkg/sentry/fs/proc/fds.go @@ -56,11 +56,11 @@ func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDF // readDescriptors reads fds in the task starting at offset, and calls the // toDentAttr callback for each to get a DentAttr, which it then emits. This is // a helper for implementing fs.InodeOperations.Readdir. -func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { +func readDescriptors(ctx context.Context, t *kernel.Task, c *fs.DirCtx, offset int64, toDentAttr func(int) fs.DentAttr) (int64, error) { var fds []int32 t.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -116,7 +116,7 @@ func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } n, _ := f.file.Dirent.FullName(root) return n, nil @@ -135,13 +135,7 @@ func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { func (f *fd) Release(ctx context.Context) { f.Symlink.Release(ctx) - f.file.DecRef() -} - -// Close releases the reference on the file. -func (f *fd) Close() error { - f.file.DecRef() - return nil + f.file.DecRef(ctx) } // fdDir is an InodeOperations for /proc/TID/fd. @@ -227,7 +221,7 @@ func (f *fdDirFile) Readdir(ctx context.Context, file *fs.File, ser fs.DentrySer if f.isInfoFile { typ = fs.Symlink } - return readDescriptors(f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { + return readDescriptors(ctx, f.t, dirCtx, file.Offset(), func(fd int) fs.DentAttr { return fs.GenericDentAttr(typ, device.ProcDevice) }) } @@ -261,7 +255,7 @@ func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs // locks, and other data. For now we only have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := file.Flags().ToLinux() | fdFlags.ToLinuxFileFlags() - file.DecRef() + file.DecRef(ctx) contents := []byte(fmt.Sprintf("flags:\t0%o\n", flags)) return newStaticProcInode(ctx, dir.MountSource, contents) }) diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go index 1fc9c703c..6a63c47b3 100644 --- a/pkg/sentry/fs/proc/mounts.go +++ b/pkg/sentry/fs/proc/mounts.go @@ -47,7 +47,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { // The task has been destroyed. Nothing to show here. return } - defer rootDir.DecRef() + defer rootDir.DecRef(t) mnt := t.MountNamespace().FindMount(rootDir) if mnt == nil { @@ -64,7 +64,7 @@ func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { continue // No longer valid. } mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() + mroot.DecRef(t) if !desc { // MountSources that are not descendants of the chroot jail are ignored. continue @@ -97,7 +97,7 @@ func (mif *mountInfoFile) ReadSeqFileData(ctx context.Context, handle seqfile.Se if mroot == nil { return // No longer valid. } - defer mroot.DecRef() + defer mroot.DecRef(ctx) // Format: // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue @@ -216,7 +216,7 @@ func (mf *mountsFile) ReadSeqFileData(ctx context.Context, handle seqfile.SeqHan if root == nil { return // No longer valid. } - defer root.DecRef() + defer root.DecRef(ctx) flags := root.Inode.MountSource.Flags opts := "rw" diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go index bd18177d4..83a43aa26 100644 --- a/pkg/sentry/fs/proc/net.go +++ b/pkg/sentry/fs/proc/net.go @@ -419,7 +419,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } sfile := s.(*fs.File) if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -479,7 +479,7 @@ func (n *netUnix) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]s } fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -574,7 +574,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -664,7 +664,7 @@ func commonReadSeqFileDataTCP(ctx context.Context, n seqfile.SeqHandle, k *kerne fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ @@ -752,7 +752,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -822,7 +822,7 @@ func (n *netUDP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se fmt.Fprintf(&buf, "\n") - s.DecRef() + s.DecRef(ctx) } data := []seqfile.SeqData{ diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go index c659224a7..77e0e1d26 100644 --- a/pkg/sentry/fs/proc/proc.go +++ b/pkg/sentry/fs/proc/proc.go @@ -213,7 +213,7 @@ func (rpf *rootProcFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dent // Add dot and dotdot. root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) names = append(names, ".", "..") diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go index 4bbe90198..9cf7f2a62 100644 --- a/pkg/sentry/fs/proc/task.go +++ b/pkg/sentry/fs/proc/task.go @@ -185,7 +185,7 @@ func (f *subtasksFile) Readdir(ctx context.Context, file *fs.File, ser fs.Dentry // Serialize "." and "..". root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dot, dotdot := file.Dirent.GetDotAttrs(root) if err := dirCtx.DirEmit(".", dot); err != nil { @@ -295,7 +295,7 @@ func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go index bfa304552..f4fcddecb 100644 --- a/pkg/sentry/fs/ramfs/dir.go +++ b/pkg/sentry/fs/ramfs/dir.go @@ -219,7 +219,7 @@ func (d *Dir) Remove(ctx context.Context, _ *fs.Inode, name string) error { } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -250,7 +250,7 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err } // Remove our reference on the inode. - inode.DecRef() + inode.DecRef(ctx) return nil } @@ -326,7 +326,7 @@ func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.F // Create the Dirent and corresponding file. created := fs.NewDirent(ctx, inode, name) - defer created.DecRef() + defer created.DecRef(ctx) return created.Inode.GetFile(ctx, created, flags) } @@ -412,11 +412,11 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol } // Release implements fs.InodeOperation.Release. -func (d *Dir) Release(_ context.Context) { +func (d *Dir) Release(ctx context.Context) { // Drop references on all children. d.mu.Lock() for _, i := range d.children { - i.DecRef() + i.DecRef(ctx) } d.mu.Unlock() } @@ -456,7 +456,7 @@ func (dfo *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirC func (dfo *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, @@ -473,13 +473,13 @@ func hasChildren(ctx context.Context, inode *fs.Inode) (bool, error) { // dropped when that dirent is destroyed. inode.IncRef() d := fs.NewTransientDirent(inode) - defer d.DecRef() + defer d.DecRef(ctx) file, err := inode.GetFile(ctx, d, fs.FileFlags{Read: true}) if err != nil { return false, err } - defer file.DecRef() + defer file.DecRef(ctx) ser := &fs.CollectEntriesSerializer{} if err := file.Readdir(ctx, ser); err != nil { @@ -530,7 +530,7 @@ func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, n if err != nil { return err } - inode.DecRef() + inode.DecRef(ctx) } // Be careful, we may have already grabbed this mutex above. diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go index a6ed8b2c5..3e0d1e07e 100644 --- a/pkg/sentry/fs/ramfs/tree_test.go +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -67,7 +67,7 @@ func TestMakeDirectoryTree(t *testing.T) { continue } root := mm.Root() - defer mm.DecRef() + defer mm.DecRef(ctx) for _, p := range test.subdirs { maxTraversals := uint(0) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go index 88c344089..f362ca9b6 100644 --- a/pkg/sentry/fs/timerfd/timerfd.go +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -55,7 +55,7 @@ type TimerOperations struct { func NewFile(ctx context.Context, c ktime.Clock) *fs.File { dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[timerfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) tops := &TimerOperations{} tops.timer = ktime.NewTimer(c, tops) // Timerfds reject writes, but the Write flag must be set in order to @@ -65,7 +65,7 @@ func NewFile(ctx context.Context, c ktime.Clock) *fs.File { } // Release implements fs.FileOperations.Release. -func (t *TimerOperations) Release() { +func (t *TimerOperations) Release(context.Context) { t.timer.Destroy() } diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go index aaba35502..d4d613ea9 100644 --- a/pkg/sentry/fs/tmpfs/file_test.go +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -46,7 +46,7 @@ func newFile(ctx context.Context) *fs.File { func TestGrow(t *testing.T) { ctx := contexttest.Context(t) f := newFile(ctx) - defer f.DecRef() + defer f.DecRef(ctx) abuf := bytes.Repeat([]byte{'a'}, 68) n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go index 108654827..463f6189e 100644 --- a/pkg/sentry/fs/tty/dir.go +++ b/pkg/sentry/fs/tty/dir.go @@ -132,7 +132,7 @@ func (d *dirInodeOperations) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() - d.master.DecRef() + d.master.DecRef(ctx) if len(d.slaves) != 0 { panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) } @@ -263,7 +263,7 @@ func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, e } // masterClose is called when the master end of t is closed. -func (d *dirInodeOperations) masterClose(t *Terminal) { +func (d *dirInodeOperations) masterClose(ctx context.Context, t *Terminal) { d.mu.Lock() defer d.mu.Unlock() @@ -277,7 +277,7 @@ func (d *dirInodeOperations) masterClose(t *Terminal) { panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) } - s.DecRef() + s.DecRef(ctx) delete(d.slaves, t.n) d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) } @@ -322,7 +322,7 @@ func (df *dirFileOperations) IterateDir(ctx context.Context, d *fs.Dirent, dirCt func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { root := fs.RootFromContext(ctx) if root != nil { - defer root.DecRef() + defer root.DecRef(ctx) } dirCtx := &fs.DirCtx{ Serializer: serializer, diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go index 8fe05ebe5..2d4d44bf3 100644 --- a/pkg/sentry/fs/tty/fs.go +++ b/pkg/sentry/fs/tty/fs.go @@ -108,4 +108,4 @@ func (superOperations) ResetInodeMappings() {} func (superOperations) SaveInodeMapping(*fs.Inode, string) {} // Destroy implements MountSourceOperations.Destroy. -func (superOperations) Destroy() {} +func (superOperations) Destroy(context.Context) {} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go index fe07fa929..e00746017 100644 --- a/pkg/sentry/fs/tty/master.go +++ b/pkg/sentry/fs/tty/master.go @@ -75,7 +75,7 @@ func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwn } // Release implements fs.InodeOperations.Release. -func (mi *masterInodeOperations) Release(ctx context.Context) { +func (mi *masterInodeOperations) Release(context.Context) { } // Truncate implements fs.InodeOperations.Truncate. @@ -120,9 +120,9 @@ type masterFileOperations struct { var _ fs.FileOperations = (*masterFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (mf *masterFileOperations) Release() { - mf.d.masterClose(mf.t) - mf.t.DecRef() +func (mf *masterFileOperations) Release(ctx context.Context) { + mf.d.masterClose(ctx, mf.t) + mf.t.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go index 9871f6fc6..7c7292687 100644 --- a/pkg/sentry/fs/tty/slave.go +++ b/pkg/sentry/fs/tty/slave.go @@ -71,7 +71,7 @@ func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owne // Release implements fs.InodeOperations.Release. func (si *slaveInodeOperations) Release(ctx context.Context) { - si.t.DecRef() + si.t.DecRef(ctx) } // Truncate implements fs.InodeOperations.Truncate. @@ -106,7 +106,7 @@ type slaveFileOperations struct { var _ fs.FileOperations = (*slaveFileOperations)(nil) // Release implements fs.FileOperations.Release. -func (sf *slaveFileOperations) Release() { +func (sf *slaveFileOperations) Release(context.Context) { } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fs/user/path.go b/pkg/sentry/fs/user/path.go index 397e96045..2f5a43b84 100644 --- a/pkg/sentry/fs/user/path.go +++ b/pkg/sentry/fs/user/path.go @@ -82,7 +82,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s // Caller has no root. Don't bother traversing anything. return "", syserror.ENOENT } - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -100,7 +100,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s if err != nil { return "", err } - defer d.DecRef() + defer d.DecRef(ctx) // Check that it is a regular file. if !fs.IsRegular(d.Inode.StableAttr) { @@ -121,7 +121,7 @@ func resolve(ctx context.Context, mns *fs.MountNamespace, paths []string, name s func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, paths []string, name string) (string, error) { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range paths { if !path.IsAbs(p) { // Relative paths aren't safe, no one should be using them. @@ -148,7 +148,7 @@ func resolveVFS2(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNam if err != nil { return "", err } - dentry.DecRef() + dentry.DecRef(ctx) return binPath, nil } diff --git a/pkg/sentry/fs/user/user.go b/pkg/sentry/fs/user/user.go index f4d525523..936fd3932 100644 --- a/pkg/sentry/fs/user/user.go +++ b/pkg/sentry/fs/user/user.go @@ -62,7 +62,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K // doesn't exist we will return the default home directory. return defaultHome, nil } - defer dirent.DecRef() + defer dirent.DecRef(ctx) // Check read permissions on the file. if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Read: true}); err != nil { @@ -81,7 +81,7 @@ func getExecUserHome(ctx context.Context, rootMns *fs.MountNamespace, uid auth.K if err != nil { return "", err } - defer f.DecRef() + defer f.DecRef(ctx) r := &fileReader{ Ctx: ctx, @@ -105,7 +105,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. const defaultHome = "/" root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) creds := auth.CredentialsFromContext(ctx) @@ -123,7 +123,7 @@ func getExecUserHomeVFS2(ctx context.Context, mns *vfs.MountNamespace, uid auth. if err != nil { return defaultHome, nil } - defer fd.DecRef() + defer fd.DecRef(ctx) r := &fileReaderVFS2{ ctx: ctx, diff --git a/pkg/sentry/fs/user/user_test.go b/pkg/sentry/fs/user/user_test.go index 7d8e9ac7c..12b786224 100644 --- a/pkg/sentry/fs/user/user_test.go +++ b/pkg/sentry/fs/user/user_test.go @@ -39,7 +39,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer etc.DecRef() + defer etc.DecRef(ctx) switch mode.FileType() { case 0: // Don't create anything. @@ -49,7 +49,7 @@ func createEtcPasswd(ctx context.Context, root *fs.Dirent, contents string, mode if err != nil { return err } - defer passwd.DecRef() + defer passwd.DecRef(ctx) if _, err := passwd.Writev(ctx, usermem.BytesIOSequence([]byte(contents))); err != nil { return err } @@ -110,9 +110,9 @@ func TestGetExecUserHome(t *testing.T) { if err != nil { t.Fatalf("NewMountNamespace failed: %v", err) } - defer mns.DecRef() + defer mns.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) ctx = fs.WithRoot(ctx, root) if err := createEtcPasswd(ctx, root, tc.passwdContents, tc.passwdMode); err != nil { diff --git a/pkg/sentry/fsbridge/bridge.go b/pkg/sentry/fsbridge/bridge.go index 8e7590721..7e61209ee 100644 --- a/pkg/sentry/fsbridge/bridge.go +++ b/pkg/sentry/fsbridge/bridge.go @@ -44,7 +44,7 @@ type File interface { IncRef() // DecRef decrements reference. - DecRef() + DecRef(ctx context.Context) } // Lookup provides a common interface to open files. diff --git a/pkg/sentry/fsbridge/fs.go b/pkg/sentry/fsbridge/fs.go index 093ce1fb3..9785fd62a 100644 --- a/pkg/sentry/fsbridge/fs.go +++ b/pkg/sentry/fsbridge/fs.go @@ -49,7 +49,7 @@ func (f *fsFile) PathnameWithDeleted(ctx context.Context) string { // global there. return "" } - defer root.DecRef() + defer root.DecRef(ctx) name, _ := f.file.Dirent.FullName(root) return name @@ -87,8 +87,8 @@ func (f *fsFile) IncRef() { } // DecRef implements File. -func (f *fsFile) DecRef() { - f.file.DecRef() +func (f *fsFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // fsLookup implements Lookup interface using fs.File. @@ -124,7 +124,7 @@ func (l *fsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptio if err != nil { return nil, err } - defer d.DecRef() + defer d.DecRef(ctx) if !resolveFinal && fs.IsSymlink(d.Inode.StableAttr) { return nil, syserror.ELOOP diff --git a/pkg/sentry/fsbridge/vfs.go b/pkg/sentry/fsbridge/vfs.go index 89168220a..323506d33 100644 --- a/pkg/sentry/fsbridge/vfs.go +++ b/pkg/sentry/fsbridge/vfs.go @@ -43,7 +43,7 @@ func NewVFSFile(file *vfs.FileDescription) File { // PathnameWithDeleted implements File. func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) @@ -86,8 +86,8 @@ func (f *VFSFile) IncRef() { } // DecRef implements File. -func (f *VFSFile) DecRef() { - f.file.DecRef() +func (f *VFSFile) DecRef(ctx context.Context) { + f.file.DecRef(ctx) } // FileDescription returns the FileDescription represented by f. It does not diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go index e6fda2b4f..7169e91af 100644 --- a/pkg/sentry/fsimpl/devpts/devpts.go +++ b/pkg/sentry/fsimpl/devpts/devpts.go @@ -103,9 +103,9 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // rootInode is the root directory inode for the devpts mounts. diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go index 1081fff52..3bb397f71 100644 --- a/pkg/sentry/fsimpl/devpts/master.go +++ b/pkg/sentry/fsimpl/devpts/master.go @@ -60,7 +60,7 @@ func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vf } fd.LockFD.Init(&mi.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - mi.DecRef() + mi.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -98,9 +98,9 @@ type masterFileDescription struct { var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (mfd *masterFileDescription) Release() { +func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(mfd.t) - mfd.inode.DecRef() + mfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go index a91cae3ef..32e4e1908 100644 --- a/pkg/sentry/fsimpl/devpts/slave.go +++ b/pkg/sentry/fsimpl/devpts/slave.go @@ -56,7 +56,7 @@ func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs } fd.LockFD.Init(&si.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil { - si.DecRef() + si.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -103,8 +103,8 @@ type slaveFileDescription struct { var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil) // Release implements fs.FileOperations.Release. -func (sfd *slaveFileDescription) Release() { - sfd.inode.DecRef() +func (sfd *slaveFileDescription) Release(ctx context.Context) { + sfd.inode.DecRef(ctx) } // EventRegister implements waiter.Waitable.EventRegister. diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go index d0e06cdc0..2ed5fa8a9 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go @@ -92,9 +92,9 @@ func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth } // Release must be called when a is no longer in use. -func (a *Accessor) Release() { - a.root.DecRef() - a.mntns.DecRef() +func (a *Accessor) Release(ctx context.Context) { + a.root.DecRef(ctx) + a.mntns.DecRef(ctx) } // accessorContext implements context.Context by extending an existing diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go index b6d52c015..747867cca 100644 --- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go +++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go @@ -30,7 +30,7 @@ func TestDevtmpfs(t *testing.T) { creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Register tmpfs just so that we can have a root filesystem that isn't @@ -48,9 +48,9 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) devpop := vfs.PathOperation{ Root: root, Start: root, @@ -69,7 +69,7 @@ func TestDevtmpfs(t *testing.T) { if err != nil { t.Fatalf("failed to create devtmpfs.Accessor: %v", err) } - defer a.Release() + defer a.Release(ctx) // Create "userspace-initialized" files using a devtmpfs.Accessor. if err := a.UserspaceInit(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go index d12d78b84..812171fa3 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd.go @@ -59,9 +59,9 @@ type EventFileDescription struct { var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil) // New creates a new event fd. -func New(vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[eventfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) efd := &EventFileDescription{ val: initVal, semMode: semMode, @@ -107,7 +107,7 @@ func (efd *EventFileDescription) HostFD() (int, error) { } // Release implements FileDescriptionImpl.Release() -func (efd *EventFileDescription) Release() { +func (efd *EventFileDescription) Release(context.Context) { efd.mu.Lock() defer efd.mu.Unlock() if efd.hostfd >= 0 { diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go index 20e3adffc..49916fa81 100644 --- a/pkg/sentry/fsimpl/eventfd/eventfd_test.go +++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go @@ -36,16 +36,16 @@ func TestEventFD(t *testing.T) { for _, initVal := range initVals { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, initVal, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) // Register a callback for a write event. w, ch := waiter.NewChannelEntry(nil) @@ -74,16 +74,16 @@ func TestEventFD(t *testing.T) { func TestEventFDStat(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } // Make a new eventfd that is writable. - eventfd, err := New(vfsObj, 0, false, linux.O_RDWR) + eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR) if err != nil { t.Fatalf("New() failed: %v", err) } - defer eventfd.DecRef() + defer eventfd.DecRef(ctx) statx, err := eventfd.Stat(ctx, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go index 89caee3df..8f7d5a9bb 100644 --- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go +++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go @@ -53,7 +53,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, nil, nil, nil, err } vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -68,7 +68,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { b.Fatalf("tearDown failed: %v", err) @@ -169,7 +169,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create extfs submount. mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop) diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go index 55902322a..7a1b4219f 100644 --- a/pkg/sentry/fsimpl/ext/dentry.go +++ b/pkg/sentry/fsimpl/ext/dentry.go @@ -15,6 +15,7 @@ package ext import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" ) @@ -55,7 +56,7 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { // FIXME(b/134676337): filesystem.mu may not be locked as required by // inode.decRef(). d.inode.decRef() @@ -64,7 +65,7 @@ func (d *dentry) DecRef() { // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. // // TODO(b/134676337): Implement inotify. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. // @@ -76,4 +77,4 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.Dentry.OnZeroWatches. // // TODO(b/134676337): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go index 357512c7e..0fc01668d 100644 --- a/pkg/sentry/fsimpl/ext/directory.go +++ b/pkg/sentry/fsimpl/ext/directory.go @@ -142,7 +142,7 @@ type directoryFD struct { var _ vfs.FileDescriptionImpl = (*directoryFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter == nil { return } diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go index dac6effbf..08ffc2834 100644 --- a/pkg/sentry/fsimpl/ext/ext.go +++ b/pkg/sentry/fsimpl/ext/ext.go @@ -123,32 +123,32 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt fs.vfsfs.Init(vfsObj, &fsType, &fs) fs.sb, err = readSuperBlock(dev) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } if fs.sb.Magic() != linux.EXT_SUPER_MAGIC { // mount(2) specifies that EINVAL should be returned if the superblock is // invalid. - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } // Refuse to mount if the filesystem is incompatible. if !isCompatible(fs.sb) { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } fs.bgs, err = readBlockGroups(dev, fs.sb) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode) if err != nil { - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } rootInode.incRef() diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go index 64e9a579f..2dbaee287 100644 --- a/pkg/sentry/fsimpl/ext/ext_test.go +++ b/pkg/sentry/fsimpl/ext/ext_test.go @@ -65,7 +65,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -80,7 +80,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys root := mntns.Root() tearDown := func() { - root.DecRef() + root.DecRef(ctx) if err := f.Close(); err != nil { t.Fatalf("tearDown failed: %v", err) diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go index 557963e03..c714ddf73 100644 --- a/pkg/sentry/fsimpl/ext/filesystem.go +++ b/pkg/sentry/fsimpl/ext/filesystem.go @@ -84,7 +84,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil) // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). // - inode == vfsd.Impl().(*Dentry).inode. -func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) { if !inode.isDir() { return nil, nil, syserror.ENOTDIR } @@ -100,7 +100,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo } d := vfsd.Impl().(*dentry) if name == ".." { - isRoot, err := rp.CheckRoot(vfsd) + isRoot, err := rp.CheckRoot(ctx, vfsd) if err != nil { return nil, nil, err } @@ -108,7 +108,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo rp.Advance() return vfsd, inode, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, nil, err } rp.Advance() @@ -143,7 +143,7 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo child.name = name dir.childCache[name] = child } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, nil, err } if child.inode.isSymlink() && rp.ShouldFollowSymlink() { @@ -167,12 +167,12 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo // // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). -func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Done() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -196,12 +196,12 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) // Preconditions: // - filesystem.mu must be locked (for writing if write param is true). // - !rp.Done(). -func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { +func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) { vfsd := rp.Start() inode := vfsd.Impl().(*dentry).inode for !rp.Final() { var err error - vfsd, inode, err = stepLocked(rp, vfsd, inode, write) + vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write) if err != nil { return nil, nil, err } @@ -216,7 +216,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e // the rp till the parent of the last component which should be an existing // directory. If parent is false then resolves rp entirely. Attemps to resolve // the path as far as it can with a read lock and upgrades the lock if needed. -func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { +func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) { var ( vfsd *vfs.Dentry inode *inode @@ -227,9 +227,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // of disk. This reduces congestion (allows concurrent walks). fs.mu.RLock() if parent { - vfsd, inode, err = walkParentLocked(rp, false) + vfsd, inode, err = walkParentLocked(ctx, rp, false) } else { - vfsd, inode, err = walkLocked(rp, false) + vfsd, inode, err = walkLocked(ctx, rp, false) } fs.mu.RUnlock() @@ -238,9 +238,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in // walk is fine as this is a read only filesystem. fs.mu.Lock() if parent { - vfsd, inode, err = walkParentLocked(rp, true) + vfsd, inode, err = walkParentLocked(ctx, rp, true) } else { - vfsd, inode, err = walkLocked(rp, true) + vfsd, inode, err = walkLocked(ctx, rp, true) } fs.mu.Unlock() } @@ -283,7 +283,7 @@ func (fs *filesystem) statTo(stat *linux.Statfs) { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -292,7 +292,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -312,7 +312,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { - vfsd, inode, err := fs.walk(rp, true) + vfsd, inode, err := fs.walk(ctx, rp, true) if err != nil { return nil, err } @@ -322,7 +322,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { - vfsd, inode, err := fs.walk(rp, false) + vfsd, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -336,7 +336,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -349,7 +349,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return linux.Statx{}, err } @@ -360,7 +360,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { - if _, _, err := fs.walk(rp, false); err != nil { + if _, _, err := fs.walk(ctx, rp, false); err != nil { return linux.Statfs{}, err } @@ -370,7 +370,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -390,7 +390,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -403,7 +403,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - if _, _, err := fs.walk(rp, true); err != nil { + if _, _, err := fs.walk(ctx, rp, true); err != nil { return err } @@ -416,7 +416,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -430,7 +430,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return syserror.ENOENT } - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -440,7 +440,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -454,7 +454,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -468,7 +468,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ return syserror.EEXIST } - _, _, err := fs.walk(rp, true) + _, _, err := fs.walk(ctx, rp, true) if err != nil { return err } @@ -478,7 +478,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -492,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { - _, inode, err := fs.walk(rp, false) + _, inode, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -506,7 +506,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath // ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return nil, err } @@ -515,7 +515,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si // GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return "", err } @@ -524,7 +524,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } @@ -533,7 +533,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { - _, _, err := fs.walk(rp, false) + _, _, err := fs.walk(ctx, rp, false) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go index 66d14bb95..e73e740d6 100644 --- a/pkg/sentry/fsimpl/ext/regular_file.go +++ b/pkg/sentry/fsimpl/ext/regular_file.go @@ -79,7 +79,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() {} +func (fd *regularFileFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go index 62efd4095..2fd0d1fa8 100644 --- a/pkg/sentry/fsimpl/ext/symlink.go +++ b/pkg/sentry/fsimpl/ext/symlink.go @@ -73,7 +73,7 @@ type symlinkFD struct { var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil) // Release implements vfs.FileDescriptionImpl.Release. -func (fd *symlinkFD) Release() {} +func (fd *symlinkFD) Release(context.Context) {} // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go index 2225076bc..e522ff9a0 100644 --- a/pkg/sentry/fsimpl/fuse/dev.go +++ b/pkg/sentry/fsimpl/fuse/dev.go @@ -99,7 +99,7 @@ type DeviceFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DeviceFD) Release() { +func (fd *DeviceFD) Release(context.Context) { fd.fs.conn.connected = false } diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go index 84c222ad6..1ffe7ccd2 100644 --- a/pkg/sentry/fsimpl/fuse/dev_test.go +++ b/pkg/sentry/fsimpl/fuse/dev_test.go @@ -356,12 +356,12 @@ func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveReque vfsObj := &vfs.VirtualFilesystem{} fuseDev := &DeviceFD{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(system.Ctx); err != nil { return nil, nil, err } vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(system.Ctx) if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, nil, err } diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index 200a93bbf..a1405f7c3 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -191,9 +191,9 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // inode implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 8c7c8e1b3..1679066ba 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -122,7 +122,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -139,7 +139,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - d.InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 00e3c99cd..e6af37d0d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -55,7 +55,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { err := d.syncSharedHandle(ctx) - d.DecRef() + d.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -65,7 +65,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // handles (so they won't be synced by the above). for _, sffd := range sffds { err := sffd.Sync(ctx) - sffd.vfsfd.DecRef() + sffd.vfsfd.DecRef(ctx) if err != nil && retErr == nil { retErr = err } @@ -133,7 +133,7 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() @@ -146,7 +146,7 @@ afterSymlink: // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } if d != d.parent && !d.cachedMetadataAuthoritative() { @@ -164,7 +164,7 @@ afterSymlink: if child == nil { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -239,7 +239,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // has 0 references, drop it). Wait to update parent.children until we // know what to replace the existing dentry with (i.e. one of the // returns below), to avoid a redundant map access. - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) if child.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for @@ -332,7 +332,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -384,7 +384,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if fs.opts.interop == InteropModeShared { @@ -405,7 +405,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } if child := parent.children[name]; child != nil { @@ -426,7 +426,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir if dir { ev |= linux.IN_ISDIR } - parent.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } @@ -434,7 +434,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -470,7 +470,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -600,17 +600,17 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // Generate inotify events for rmdir or unlink. if dir { - parent.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { cw = &child.watches } - vfs.InotifyRemoveChild(cw, &parent.watches, name) + vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- @@ -637,7 +637,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -645,20 +645,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckCaching(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkCachingLocked() + d.checkCachingLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -668,7 +668,7 @@ func (fs *filesystem) renameMuUnlockAndCheckCaching(ds **[]*dentry) { func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -680,7 +680,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -701,7 +701,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by @@ -812,7 +812,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) if !start.cachedMetadataAuthoritative() { @@ -1126,7 +1126,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } childVFSFD = &fd.vfsfd } - d.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } @@ -1134,7 +1134,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1154,7 +1154,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckCaching(&ds) + defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -1244,7 +1244,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa return nil } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } @@ -1269,7 +1269,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } // Update the dentry tree. - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { @@ -1331,17 +1331,17 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1350,7 +1350,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1367,7 +1367,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1417,7 +1417,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1443,7 +1443,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1455,7 +1455,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckCaching(&ds) + defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1469,16 +1469,16 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1488,16 +1488,16 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } if err := d.removexattr(ctx, rp.Credentials(), name); err != nil { - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } - fs.renameMuRUnlockAndCheckCaching(&ds) + fs.renameMuRUnlockAndCheckCaching(ctx, &ds) - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index e20de84b5..2e5575d8d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -482,7 +482,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the @@ -495,8 +495,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { - ctx := context.Background() +func (fs *filesystem) Release(ctx context.Context) { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() @@ -1089,10 +1088,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") @@ -1109,7 +1108,7 @@ func (d *dentry) decRefLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } @@ -1117,9 +1116,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.watches.Notify(d.name, events, cookie, et, d.isDeleted()) + d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } - d.watches.Notify("", events, cookie, et, d.isDeleted()) + d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } @@ -1131,10 +1130,10 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // If no watches are left on this dentry and it has no references, cache it. -func (d *dentry) OnZeroWatches() { +func (d *dentry) OnZeroWatches(ctx context.Context) { if atomic.LoadInt64(&d.refs) == 0 { d.fs.renameMu.Lock() - d.checkCachingLocked() + d.checkCachingLocked(ctx) d.fs.renameMu.Unlock() } } @@ -1149,7 +1148,7 @@ func (d *dentry) OnZeroWatches() { // do nothing. // // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkCachingLocked() { +func (d *dentry) checkCachingLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -1171,14 +1170,14 @@ func (d *dentry) checkCachingLocked() { // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { if d.isDeleted() { - d.watches.HandleDeletion() + d.watches.HandleDeletion(ctx) } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } - d.destroyLocked() + d.destroyLocked(ctx) return } // If d still has inotify watches and it is not deleted or invalidated, we @@ -1213,7 +1212,7 @@ func (d *dentry) checkCachingLocked() { if !victim.vfsd.IsDead() { // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. - d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(&victim.vfsd) + d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update @@ -1221,7 +1220,7 @@ func (d *dentry) checkCachingLocked() { } victim.parent.dirMu.Unlock() } - victim.destroyLocked() + victim.destroyLocked(ctx) } // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. @@ -1233,7 +1232,7 @@ func (d *dentry) checkCachingLocked() { // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is // not a child dentry. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -1244,7 +1243,6 @@ func (d *dentry) destroyLocked() { panic("dentry.destroyLocked() called with references on the dentry") } - ctx := context.Background() d.handleMu.Lock() if !d.handle.file.isNil() { mf := d.fs.mfp.MemoryFile() @@ -1276,7 +1274,7 @@ func (d *dentry) destroyLocked() { // d.fs.renameMu. if d.parent != nil { if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkCachingLocked() + d.parent.checkCachingLocked(ctx) } else if refs < 0 { panic("gofer.dentry.DecRef() called without holding a reference") } @@ -1514,7 +1512,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - fd.dentry().InotifyWithParent(ev, 0, vfs.InodeEvent) + fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -1535,7 +1533,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -1545,7 +1543,7 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index adff39490..56d80bcf8 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -50,7 +50,7 @@ func TestDestroyIdempotent(t *testing.T) { } parent.cacheNewChildLocked(child, "child") - child.checkCachingLocked() + child.checkCachingLocked(ctx) if got := atomic.LoadInt64(&child.refs); got != -1 { t.Fatalf("child.refs=%d, want: -1", got) } @@ -58,6 +58,6 @@ func TestDestroyIdempotent(t *testing.T) { if got := atomic.LoadInt64(&parent.refs); got != -1 { t.Fatalf("parent.refs=%d, want: -1", got) } - child.checkCachingLocked() - child.checkCachingLocked() + child.checkCachingLocked(ctx) + child.checkCachingLocked(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 09f142cfc..420e8efe2 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -48,7 +48,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go index d6dbe9092..85d2bee72 100644 --- a/pkg/sentry/fsimpl/gofer/socket.go +++ b/pkg/sentry/fsimpl/gofer/socket.go @@ -108,7 +108,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect // We don't need the receiver. c.CloseRecv() - c.Release() + c.Release(ctx) return c, nil } @@ -136,8 +136,8 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla } // Release implements transport.BoundEndpoint.Release. -func (e *endpoint) Release() { - e.dentry.DecRef() +func (e *endpoint) Release(ctx context.Context) { + e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index 811528982..fc269ef2b 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -80,11 +80,11 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *specialFileFD) Release() { +func (fd *specialFileFD) Release(ctx context.Context) { if fd.haveQueue { fdnotifier.RemoveFD(fd.handle.fd) } - fd.handle.close(context.Background()) + fd.handle.close(ctx) fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) fs.syncMu.Lock() delete(fs.specialFileFDs, fd) diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go index b9082a20f..0135e4428 100644 --- a/pkg/sentry/fsimpl/host/control.go +++ b/pkg/sentry/fsimpl/host/control.go @@ -58,7 +58,7 @@ func (c *scmRights) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (c *scmRights) Release() { +func (c *scmRights) Release(ctx context.Context) { for _, fd := range c.fds { syscall.Close(fd) } diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index c894f2ca0..bf922c566 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -117,7 +117,7 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) d.Init(i) // i.open will take a reference on d. - defer d.DecRef() + defer d.DecRef(ctx) // For simplicity, fileDescription.offset is set to 0. Technically, we // should only set to 0 on files that are not seekable (sockets, pipes, @@ -168,9 +168,9 @@ type filesystem struct { devMinor uint32 } -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { @@ -431,12 +431,12 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre } // DecRef implements kernfs.Inode. -func (i *inode) DecRef() { - i.AtomicRefCount.DecRefWithDestructor(i.Destroy) +func (i *inode) DecRef(ctx context.Context) { + i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy) } // Destroy implements kernfs.Inode. -func (i *inode) Destroy() { +func (i *inode) Destroy(context.Context) { if i.wouldBlock { fdnotifier.RemoveFD(int32(i.hostFD)) } @@ -542,7 +542,7 @@ func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux } // Release implements vfs.FileDescriptionImpl. -func (f *fileDescription) Release() { +func (f *fileDescription) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go index fd16bd92d..4979dd0a9 100644 --- a/pkg/sentry/fsimpl/host/socket.go +++ b/pkg/sentry/fsimpl/host/socket.go @@ -139,7 +139,7 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable } // Send implements transport.ConnectedEndpoint.Send. -func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -216,7 +216,7 @@ func (c *ConnectedEndpoint) EventUpdate() { } // Recv implements transport.Receiver.Recv. -func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { c.mu.RLock() defer c.mu.RUnlock() @@ -317,8 +317,8 @@ func (c *ConnectedEndpoint) destroyLocked() { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (c *ConnectedEndpoint) Release() { - c.ref.DecRefWithDestructor(func() { +func (c *ConnectedEndpoint) Release(ctx context.Context) { + c.ref.DecRefWithDestructor(ctx, func(context.Context) { c.mu.Lock() c.destroyLocked() c.mu.Unlock() @@ -347,8 +347,8 @@ func (e *SCMConnectedEndpoint) Init() error { // Release implements transport.ConnectedEndpoint.Release and // transport.Receiver.Release. -func (e *SCMConnectedEndpoint) Release() { - e.ref.DecRefWithDestructor(func() { +func (e *SCMConnectedEndpoint) Release(ctx context.Context) { + e.ref.DecRefWithDestructor(ctx, func(context.Context) { e.mu.Lock() if err := syscall.Close(e.fd); err != nil { log.Warningf("Failed to close host fd %d: %v", err) diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go index 4ee9270cc..d372c60cb 100644 --- a/pkg/sentry/fsimpl/host/tty.go +++ b/pkg/sentry/fsimpl/host/tty.go @@ -67,12 +67,12 @@ func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup { } // Release implements fs.FileOperations.Release. -func (t *TTYFileDescription) Release() { +func (t *TTYFileDescription) Release(ctx context.Context) { t.mu.Lock() t.fgProcessGroup = nil t.mu.Unlock() - t.fileDescription.Release() + t.fileDescription.Release(ctx) } // PRead implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go index c6c4472e7..12adf727a 100644 --- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go +++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go @@ -122,7 +122,7 @@ func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, of } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *DynamicBytesFD) Release() {} +func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go index 1d37ccb98..fcee6200a 100644 --- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go @@ -113,7 +113,7 @@ func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *GenericDirectoryFD) Release() {} +func (fd *GenericDirectoryFD) Release(context.Context) {} func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem { return fd.vfsfd.VirtualDentry().Mount().Filesystem() diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go index 61a36cff9..d7edb6342 100644 --- a/pkg/sentry/fsimpl/kernfs/filesystem.go +++ b/pkg/sentry/fsimpl/kernfs/filesystem.go @@ -56,13 +56,13 @@ afterSymlink: return vfsd, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return vfsd, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -77,7 +77,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&next.vfsd); err != nil { + if err := rp.CheckMount(ctx, &next.vfsd); err != nil { return nil, err } // Resolve any symlink at current path component. @@ -88,7 +88,7 @@ afterSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -116,7 +116,7 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // Cached dentry exists, revalidate. if !child.inode.Valid(ctx) { delete(parent.children, name) - vfsObj.InvalidateDentry(&child.vfsd) + vfsObj.InvalidateDentry(ctx, &child.vfsd) fs.deferDecRef(&child.vfsd) // Reference from Lookup. child = nil } @@ -234,7 +234,7 @@ func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Den } // Release implements vfs.FilesystemImpl.Release. -func (fs *Filesystem) Release() { +func (fs *Filesystem) Release(context.Context) { } // Sync implements vfs.FilesystemImpl.Sync. @@ -246,7 +246,7 @@ func (fs *Filesystem) Sync(ctx context.Context) error { // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, inode, err := fs.walkExistingLocked(ctx, rp) @@ -259,7 +259,7 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -282,7 +282,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, _, err := fs.walkParentDirLocked(ctx, rp) if err != nil { @@ -300,7 +300,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -337,7 +337,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -365,7 +365,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -397,7 +397,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() - defer fs.processDeferredDecRefs() + defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) if err != nil { @@ -429,7 +429,7 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } afterTrailingSymlink: parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return nil, err } @@ -483,7 +483,7 @@ afterTrailingSymlink: } if targetVD.Ok() { err := rp.HandleJump(targetVD) - targetVD.DecRef() + targetVD.DecRef(ctx) if err != nil { return nil, err } @@ -507,7 +507,7 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st fs.mu.RLock() d, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -526,7 +526,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 fs.mu.Lock() - defer fs.processDeferredDecRefsLocked() + defer fs.processDeferredDecRefsLocked(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this @@ -584,7 +584,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from @@ -615,7 +615,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa dstDir.children = make(map[string]*Dentry) } dstDir.children[pc] = src - virtfs.CommitRenameReplaceDentry(srcVFSD, replaced) + virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced) return nil } @@ -624,7 +624,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, inode, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -648,7 +648,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -656,7 +656,7 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -665,7 +665,7 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -680,7 +680,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statx{}, err } @@ -692,7 +692,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return linux.Statfs{}, err } @@ -708,7 +708,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ fs.mu.Lock() defer fs.mu.Unlock() parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -733,7 +733,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error fs.mu.Lock() defer fs.mu.Unlock() vfsd, _, err := fs.walkExistingLocked(ctx, rp) - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) if err != nil { return err } @@ -753,7 +753,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } @@ -761,7 +761,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error virtfs.AbortDeleteDentry(vfsd) return err } - virtfs.CommitDeleteDentry(vfsd) + virtfs.CommitDeleteDentry(ctx, vfsd) return nil } @@ -770,7 +770,7 @@ func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath fs.mu.RLock() _, inode, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -785,7 +785,7 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return nil, err } @@ -798,7 +798,7 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return "", err } @@ -811,7 +811,7 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } @@ -824,7 +824,7 @@ func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, fs.mu.RLock() _, _, err := fs.walkExistingLocked(ctx, rp) fs.mu.RUnlock() - fs.processDeferredDecRefs() + fs.processDeferredDecRefs(ctx) if err != nil { return err } diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go index 579e627f0..c3efcf3ec 100644 --- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go +++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go @@ -40,7 +40,7 @@ func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. -func (InodeNoopRefCount) DecRef() { +func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. @@ -49,7 +49,7 @@ func (InodeNoopRefCount) TryIncRef() bool { } // Destroy implements Inode.Destroy. -func (InodeNoopRefCount) Destroy() { +func (InodeNoopRefCount) Destroy(context.Context) { } // InodeDirectoryNoNewChildren partially implements the Inode interface. @@ -366,12 +366,12 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { } // DecRef implements Inode.DecRef. -func (o *OrderedChildren) DecRef() { - o.AtomicRefCount.DecRefWithDestructor(o.Destroy) +func (o *OrderedChildren) DecRef(ctx context.Context) { + o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy) } // Destroy cleans up resources referenced by this OrderedChildren. -func (o *OrderedChildren) Destroy() { +func (o *OrderedChildren) Destroy(context.Context) { o.mu.Lock() defer o.mu.Unlock() o.order.Reset() diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go index 46f207664..080118841 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs.go @@ -116,17 +116,17 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. -func (fs *Filesystem) processDeferredDecRefs() { +func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { fs.mu.Lock() - fs.processDeferredDecRefsLocked() + fs.processDeferredDecRefsLocked(ctx) fs.mu.Unlock() } // Precondition: fs.mu must be held for writing. -func (fs *Filesystem) processDeferredDecRefsLocked() { +func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { - d.DecRef() + d.DecRef(ctx) } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() @@ -212,16 +212,16 @@ func (d *Dentry) isSymlink() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *Dentry) DecRef() { - d.AtomicRefCount.DecRefWithDestructor(d.destroy) +func (d *Dentry) DecRef(ctx context.Context) { + d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy) } // Precondition: Dentry must be removed from VFS' dentry cache. -func (d *Dentry) destroy() { - d.inode.DecRef() // IncRef from Init. +func (d *Dentry) destroy(ctx context.Context) { + d.inode.DecRef(ctx) // IncRef from Init. d.inode = nil if d.parent != nil { - d.parent.DecRef() // IncRef from Dentry.InsertChild. + d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild. } } @@ -230,7 +230,7 @@ func (d *Dentry) destroy() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) {} +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {} // Watches implements vfs.DentryImpl.Watches. func (d *Dentry) Watches() *vfs.Watches { @@ -238,7 +238,7 @@ func (d *Dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *Dentry) OnZeroWatches() {} +func (d *Dentry) OnZeroWatches(context.Context) {} // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on @@ -326,12 +326,12 @@ type Inode interface { type inodeRefs interface { IncRef() - DecRef() + DecRef(ctx context.Context) TryIncRef() bool // Destroy is called when the inode reaches zero references. Destroy release // all resources (references) on objects referenced by the inode, including // any child dentries. - Destroy() + Destroy(ctx context.Context) } type inodeMetadata interface { diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go index dc407eb1d..c5d5afedf 100644 --- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go +++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go @@ -46,7 +46,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System { ctx := contexttest.Context(t) creds := auth.CredentialsFromContext(ctx) v := &vfs.VirtualFilesystem{} - if err := v.Init(); err != nil { + if err := v.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{ @@ -163,7 +163,7 @@ func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (* dir := d.fs.newDir(creds, opts.Mode, nil) dirVFSD := dir.VFSDentry() if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil { - dir.DecRef() + dir.DecRef(ctx) return nil, err } d.IncLinks(1) @@ -175,7 +175,7 @@ func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (* f := d.fs.newFile(creds, "") fVFSD := f.VFSDentry() if err := d.OrderedChildren.Insert(name, fVFSD); err != nil { - f.DecRef() + f.DecRef(ctx) return nil, err } return fVFSD, nil @@ -213,7 +213,7 @@ func TestBasic(t *testing.T) { }) }) defer sys.Destroy() - sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef() + sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx) } func TestMkdirGetDentry(t *testing.T) { @@ -228,7 +228,7 @@ func TestMkdirGetDentry(t *testing.T) { if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil { t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err) } - sys.GetDentryOrDie(pop).DecRef() + sys.GetDentryOrDie(pop).DecRef(sys.Ctx) } func TestReadStaticFile(t *testing.T) { @@ -246,7 +246,7 @@ func TestReadStaticFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) content, err := sys.ReadToEnd(fd) if err != nil { @@ -273,7 +273,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { } // Close the file. The file should persist. - fd.DecRef() + fd.DecRef(sys.Ctx) fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, @@ -281,7 +281,7 @@ func TestCreateNewFileInStaticDir(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - fd.DecRef() + fd.DecRef(sys.Ctx) } func TestDirFDReadWrite(t *testing.T) { @@ -297,7 +297,7 @@ func TestDirFDReadWrite(t *testing.T) { if err != nil { t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(sys.Ctx) // Read/Write should fail for directory FDs. if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR { diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 8f8dcfafe..b3d19ff82 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -98,7 +98,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer oldFD.DecRef() + defer oldFD.DecRef(ctx) newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{ Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL, Mode: linux.FileMode(d.mode &^ linux.S_IFMT), @@ -106,7 +106,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { if err != nil { return err } - defer newFD.DecRef() + defer newFD.DecRef(ctx) bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size for { readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{}) @@ -241,13 +241,13 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { Mask: linux.STATX_INO, }) if err != nil { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return err } if upperStat.Mask&linux.STATX_INO == 0 { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) d.upperVD = vfs.VirtualDentry{} cleanupUndoCopyUp() return syserror.EREMOTE diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index f5c2462a5..fccb94105 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -46,7 +46,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -108,7 +108,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. @@ -177,7 +177,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { readdirErr = err return false } - defer layerFD.DecRef() + defer layerFD.DecRef(ctx) // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. @@ -282,6 +282,6 @@ func (fd *directoryFD) Sync(ctx context.Context) error { return err } err = upperFD.Sync(ctx) - upperFD.DecRef() + upperFD.DecRef(ctx) return err } diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index 6b705e955..986b36ead 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -77,7 +77,7 @@ func putDentrySlice(ds *[]*dentry) { // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. -func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { fs.renameMu.RUnlock() if *ds == nil { return @@ -85,20 +85,20 @@ func (fs *filesystem) renameMuRUnlockAndCheckDrop(ds **[]*dentry) { if len(**ds) != 0 { fs.renameMu.Lock() for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() } putDentrySlice(*ds) } -func (fs *filesystem) renameMuUnlockAndCheckDrop(ds **[]*dentry) { +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { - d.checkDropLocked() + d.checkDropLocked(ctx) } fs.renameMu.Unlock() putDentrySlice(*ds) @@ -126,13 +126,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -142,7 +142,7 @@ afterSymlink: if err != nil { return nil, err } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { @@ -272,11 +272,11 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str }) if lookupErr != nil { - child.destroyLocked() + child.destroyLocked(ctx) return nil, lookupErr } if !existsOnAnyLayer { - child.destroyLocked() + child.destroyLocked(ctx) return nil, syserror.ENOENT } @@ -430,7 +430,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -501,7 +501,7 @@ func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.V func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -513,7 +513,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -532,7 +532,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -553,7 +553,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -720,7 +720,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) if rp.Done() { @@ -825,7 +825,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf fd.LockFD.Init(&d.locks) layerFDOpts := layerFD.Options() if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil { - layerFD.DecRef() + layerFD.DecRef(ctx) return nil, err } return &fd.vfsfd, nil @@ -920,7 +920,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving fd.LockFD.Init(&child.locks) upperFDOpts := upperFD.Options() if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) // Don't bother with cleanup; the file was created successfully, we // just can't open it anymore for some reason. return nil, err @@ -932,7 +932,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -952,7 +952,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa var ds *[]*dentry fs.renameMu.Lock() - defer fs.renameMuUnlockAndCheckDrop(&ds) + defer fs.renameMuUnlockAndCheckDrop(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err @@ -979,7 +979,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1001,7 +1001,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1086,7 +1086,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) parent.dirents = nil @@ -1097,7 +1097,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1132,7 +1132,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err @@ -1160,7 +1160,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err @@ -1211,7 +1211,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { @@ -1233,7 +1233,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() @@ -1298,7 +1298,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } if child != nil { - vfsObj.CommitDeleteDentry(&child.vfsd) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) delete(parent.children, name) ds = appendDentry(ds, child) } @@ -1310,7 +1310,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err @@ -1324,7 +1324,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err @@ -1336,7 +1336,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err @@ -1348,7 +1348,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() - defer fs.renameMuRUnlockAndCheckDrop(&ds) + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) _, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go index c0749e711..d3060a481 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/non_directory.go @@ -81,11 +81,11 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR) if oldOffErr == nil { if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil { - upperFD.DecRef() + upperFD.DecRef(ctx) return nil, err } } - fd.cachedFD.DecRef() + fd.cachedFD.DecRef(ctx) fd.copiedUp = true fd.cachedFD = upperFD fd.cachedFlags = statusFlags @@ -99,8 +99,8 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nonDirectoryFD) Release() { - fd.cachedFD.DecRef() +func (fd *nonDirectoryFD) Release(ctx context.Context) { + fd.cachedFD.DecRef(ctx) fd.cachedFD = nil } @@ -138,7 +138,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux Mask: layerMask, Sync: opts.Sync, }) - wrappedFD.DecRef() + wrappedFD.DecRef(ctx) if err != nil { return linux.Statx{}, err } @@ -187,7 +187,7 @@ func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, off if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PRead(ctx, dst, offset, opts) } @@ -209,7 +209,7 @@ func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, of if err != nil { return 0, err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.PWrite(ctx, src, offset, opts) } @@ -250,7 +250,7 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { return err } wrappedFD.IncRef() - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) fd.mu.Unlock() return wrappedFD.Sync(ctx) } @@ -261,6 +261,6 @@ func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOp if err != nil { return err } - defer wrappedFD.DecRef() + defer wrappedFD.DecRef(ctx) return wrappedFD.ConfigureMMap(ctx, opts) } diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e720d4825..75cc006bf 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -123,7 +123,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // filesystem with any number of lower layers. } else { vfsroot := vfs.RootFromContext(ctx) - defer vfsroot.DecRef() + defer vfsroot.DecRef(ctx) upperPathname, ok := mopts["upperdir"] if ok { delete(mopts, "upperdir") @@ -147,13 +147,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer upperRoot.DecRef() + defer upperRoot.DecRef(ctx) privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } - defer privateUpperRoot.DecRef() + defer privateUpperRoot.DecRef(ctx) fsopts.UpperRoot = privateUpperRoot } lowerPathnamesStr, ok := mopts["lowerdir"] @@ -190,13 +190,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer lowerRoot.DecRef() + defer lowerRoot.DecRef(ctx) privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) if err != nil { ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } - defer privateLowerRoot.DecRef() + defer privateLowerRoot.DecRef(ctx) fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) } } @@ -264,19 +264,19 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt Mask: rootStatMask, }) if err != nil { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, err } if rootStat.Mask&rootStatMask != rootStatMask { - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EREMOTE } if isWhiteout(&rootStat) { ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") - root.destroyLocked() - fs.vfsfs.DecRef() + root.destroyLocked(ctx) + fs.vfsfs.DecRef(ctx) return nil, nil, syserror.EINVAL } root.mode = uint32(rootStat.Mode) @@ -319,17 +319,17 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor) for _, lowerDevMinor := range fs.lowerDevMinors { vfsObj.PutAnonBlockDevMinor(lowerDevMinor) } if fs.opts.UpperRoot.Ok() { - fs.opts.UpperRoot.DecRef() + fs.opts.UpperRoot.DecRef(ctx) } for _, lowerRoot := range fs.opts.LowerRoots { - lowerRoot.DecRef() + lowerRoot.DecRef(ctx) } } @@ -452,10 +452,10 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { +func (d *dentry) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { d.fs.renameMu.Lock() - d.checkDropLocked() + d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") @@ -466,7 +466,7 @@ func (d *dentry) DecRef() { // becomes deleted. // // Preconditions: d.fs.renameMu must be locked for writing. -func (d *dentry) checkDropLocked() { +func (d *dentry) checkDropLocked(ctx context.Context) { // Dentries with a positive reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will @@ -476,14 +476,14 @@ func (d *dentry) checkDropLocked() { return } // Refs is still zero; destroy it. - d.destroyLocked() + d.destroyLocked(ctx) return } // destroyLocked destroys the dentry. // // Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. -func (d *dentry) destroyLocked() { +func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. @@ -495,10 +495,10 @@ func (d *dentry) destroyLocked() { } if d.upperVD.Ok() { - d.upperVD.DecRef() + d.upperVD.DecRef(ctx) } for _, lowerVD := range d.lowerVDs { - lowerVD.DecRef() + lowerVD.DecRef(ctx) } if d.parent != nil { @@ -510,7 +510,7 @@ func (d *dentry) destroyLocked() { // Drop the reference held by d on its parent without recursively // locking d.fs.renameMu. if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { - d.parent.checkDropLocked() + d.parent.checkDropLocked(ctx) } else if refs < 0 { panic("overlay.dentry.DecRef() called without holding a reference") } @@ -518,7 +518,7 @@ func (d *dentry) destroyLocked() { } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events uint32, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { // TODO(gvisor.dev/issue/1479): Implement inotify. } @@ -531,7 +531,7 @@ func (d *dentry) Watches() *vfs.Watches { // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // TODO(gvisor.dev/issue/1479): Implement inotify. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayer stops iteration. diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go index 811f80a5f..2ca793db9 100644 --- a/pkg/sentry/fsimpl/pipefs/pipefs.go +++ b/pkg/sentry/fsimpl/pipefs/pipefs.go @@ -63,9 +63,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. @@ -160,6 +160,6 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(inode) - defer d.DecRef() + defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags) } diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go index 609210253..2463d51cd 100644 --- a/pkg/sentry/fsimpl/proc/filesystem.go +++ b/pkg/sentry/fsimpl/proc/filesystem.go @@ -77,9 +77,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dynamicInode is an overfitted interface for common Inodes with diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go index fea29e5f0..f0d3f7f5e 100644 --- a/pkg/sentry/fsimpl/proc/task_fds.go +++ b/pkg/sentry/fsimpl/proc/task_fds.go @@ -43,12 +43,12 @@ func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) return file, flags } -func taskFDExists(t *kernel.Task, fd int32) bool { +func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool { file, _ := getTaskFD(t, fd) if file == nil { return false } - file.DecRef() + file.DecRef(ctx) return true } @@ -68,7 +68,7 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off var fds []int32 i.task.WithMuLocked(func(t *kernel.Task) { if fdTable := t.FDTable(); fdTable != nil { - fds = fdTable.GetFDs() + fds = fdTable.GetFDs(ctx) } }) @@ -135,7 +135,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()) @@ -204,9 +204,9 @@ func (s *fdSymlink) Readlink(ctx context.Context) (string, error) { if file == nil { return "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) root := vfs.RootFromContext(ctx) - defer root.DecRef() + defer root.DecRef(ctx) return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } @@ -215,7 +215,7 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil @@ -258,7 +258,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, return nil, syserror.ENOENT } fd := int32(fdInt) - if !taskFDExists(i.task, fd) { + if !taskFDExists(ctx, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ @@ -297,7 +297,7 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { if file == nil { return syserror.ENOENT } - defer file.DecRef() + defer file.DecRef(ctx) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 859b7d727..830b78949 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -677,7 +677,7 @@ func (s *exeSymlink) Readlink(ctx context.Context) (string, error) { if err != nil { return "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) return exec.PathnameWithDeleted(ctx), nil } @@ -692,7 +692,7 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent if err != nil { return vfs.VirtualDentry{}, "", err } - defer exec.DecRef() + defer exec.DecRef(ctx) vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry() vd.IncRef() @@ -748,7 +748,7 @@ func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) return nil } @@ -779,7 +779,7 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { // Root has been destroyed. Don't try to read mounts. return nil } - defer rootDir.DecRef() + defer rootDir.DecRef(ctx) i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } @@ -825,7 +825,7 @@ func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vir dentry.Init(&namespaceInode{}) vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry()) vd.IncRef() - dentry.DecRef() + dentry.DecRef(ctx) return vd, "", nil } @@ -887,8 +887,8 @@ func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) err } // Release implements FileDescriptionImpl. -func (fd *namespaceFD) Release() { - fd.inode.DecRef() +func (fd *namespaceFD) Release(ctx context.Context) { + fd.inode.DecRef(ctx) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go index 6bde27376..a4c884bf9 100644 --- a/pkg/sentry/fsimpl/proc/task_net.go +++ b/pkg/sentry/fsimpl/proc/task_net.go @@ -212,7 +212,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { - s.DecRef() + s.DecRef(ctx) // Not a unix socket. continue } @@ -281,7 +281,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { } fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } @@ -359,7 +359,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { - s.DecRef() + s.DecRef(ctx) // Not tcp4 sockets. continue } @@ -455,7 +455,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil @@ -524,7 +524,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { - s.DecRef() + s.DecRef(ctx) // Not udp4 socket. continue } @@ -600,7 +600,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "\n") - s.DecRef() + s.DecRef(ctx) } return nil } diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go index 19abb5034..3c9297dee 100644 --- a/pkg/sentry/fsimpl/proc/tasks_test.go +++ b/pkg/sentry/fsimpl/proc/tasks_test.go @@ -218,7 +218,7 @@ func TestTasks(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) buf := make([]byte, 1) bufIOSeq := usermem.BytesIOSequence(buf) if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR { @@ -336,7 +336,7 @@ func TestTasksOffset(t *testing.T) { if err != nil { t.Fatalf("vfsfs.OpenAt(/) failed: %v", err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil { t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err) } @@ -441,7 +441,7 @@ func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.F t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err) continue } - defer child.DecRef() + defer child.DecRef(ctx) stat, err := child.Stat(ctx, vfs.StatOptions{}) if err != nil { t.Errorf("Stat(%v) failed: %v", absPath, err) @@ -476,7 +476,7 @@ func TestTree(t *testing.T) { if err != nil { t.Fatalf("failed to create test file: %v", err) } - defer file.DecRef() + defer file.DecRef(s.Ctx) var tasks []*kernel.Task for i := 0; i < 5; i++ { @@ -501,5 +501,5 @@ func TestTree(t *testing.T) { t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err) } iterateDir(ctx, t, s, fd) - fd.DecRef() + fd.DecRef(ctx) } diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go index 242ba9b5d..6297e1df4 100644 --- a/pkg/sentry/fsimpl/signalfd/signalfd.go +++ b/pkg/sentry/fsimpl/signalfd/signalfd.go @@ -54,7 +54,7 @@ var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil) // New creates a new signal fd. func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[signalfd]") - defer vd.DecRef() + defer vd.DecRef(target) sfd := &SignalFileDescription{ target: target, mask: mask, @@ -133,4 +133,4 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) { } // Release implements FileDescriptionImpl.Release() -func (sfd *SignalFileDescription) Release() {} +func (sfd *SignalFileDescription) Release(context.Context) {} diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go index ee0828a15..c61818ff6 100644 --- a/pkg/sentry/fsimpl/sockfs/sockfs.go +++ b/pkg/sentry/fsimpl/sockfs/sockfs.go @@ -67,9 +67,9 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) { } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // PrependPath implements vfs.FilesystemImpl.PrependPath. diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index 01ce30a4d..f81b0c38f 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -87,9 +87,9 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) - fs.Filesystem.Release() + fs.Filesystem.Release(ctx) } // dir implements kernfs.Inode. diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go index 242d5fd12..9fd38b295 100644 --- a/pkg/sentry/fsimpl/sys/sys_test.go +++ b/pkg/sentry/fsimpl/sys/sys_test.go @@ -59,7 +59,7 @@ func TestReadCPUFile(t *testing.T) { if err != nil { t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) content, err := s.ReadToEnd(fd) if err != nil { t.Fatalf("Read failed: %v", err) diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index e743e8114..1e57744e8 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -127,7 +127,7 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns return nil, err } m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) - m.SetExecutable(fsbridge.NewVFSFile(exe)) + m.SetExecutable(ctx, fsbridge.NewVFSFile(exe)) config := &kernel.TaskConfig{ Kernel: k, diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go index 0556af877..568132121 100644 --- a/pkg/sentry/fsimpl/testutil/testutil.go +++ b/pkg/sentry/fsimpl/testutil/testutil.go @@ -97,8 +97,8 @@ func (s *System) WithTemporaryContext(ctx context.Context) *System { // Destroy release resources associated with a test system. func (s *System) Destroy() { - s.Root.DecRef() - s.MntNs.DecRef() // Reference on MntNs passed to NewSystem. + s.Root.DecRef(s.Ctx) + s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem. } // ReadToEnd reads the contents of fd until EOF to a string. @@ -149,7 +149,7 @@ func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector { if err != nil { s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err) } - defer fd.DecRef() + defer fd.DecRef(s.Ctx) collector := &DirentCollector{} if err := fd.IterDirents(s.Ctx, collector); err != nil { diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go index 2dc90d484..86beaa0a8 100644 --- a/pkg/sentry/fsimpl/timerfd/timerfd.go +++ b/pkg/sentry/fsimpl/timerfd/timerfd.go @@ -47,9 +47,9 @@ var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil) var _ ktime.TimerListener = (*TimerFileDescription)(nil) // New returns a new timer fd. -func New(vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { +func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) { vd := vfsObj.NewAnonVirtualDentry("[timerfd]") - defer vd.DecRef() + defer vd.DecRef(ctx) tfd := &TimerFileDescription{} tfd.timer = ktime.NewTimer(clock, tfd) if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{ @@ -129,7 +129,7 @@ func (tfd *TimerFileDescription) ResumeTimer() { } // Release implements FileDescriptionImpl.Release() -func (tfd *TimerFileDescription) Release() { +func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go index 2fb5c4d84..d263147c2 100644 --- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go +++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go @@ -83,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent } err = fn(root, d) - d.DecRef() + d.DecRef(ctx) return err } @@ -105,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) d := root d.IncRef() - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -125,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -136,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -176,7 +176,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -186,14 +186,14 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create nested directories with given depth. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) vd := root vd.IncRef() for i := depth; i > 0; i-- { @@ -212,7 +212,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -228,12 +228,12 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) vd = vfs.VirtualDentry{} if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -278,14 +278,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create mount namespace: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create and mount the submount. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { b.Fatalf("failed to create mount point: %v", err) } @@ -293,7 +293,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) if err != nil { b.Fatalf("failed to create tmpfs submount: %v", err) @@ -309,7 +309,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount root: %v", err) } - defer d.DecRef() + defer d.DecRef(ctx) for i := depth; i > 0; i-- { name := fmt.Sprintf("%d", i) if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { @@ -319,7 +319,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - d.DecRef() + d.DecRef(ctx) d = next filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -330,7 +330,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - file.DecRef() + file.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() @@ -370,7 +370,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { // Create VFS. vfsObj := vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { b.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ @@ -380,14 +380,14 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to create tmpfs root mount: %v", err) } - defer mntns.DecRef() + defer mntns.DecRef(ctx) var filePathBuilder strings.Builder filePathBuilder.WriteByte('/') // Create the mount point. root := mntns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -403,7 +403,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to mount point: %v", err) } - defer mountPoint.DecRef() + defer mountPoint.DecRef(ctx) // Create and mount the submount. if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil { b.Fatalf("failed to mount tmpfs submount: %v", err) @@ -432,7 +432,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { if err != nil { b.Fatalf("failed to walk to directory %q: %v", name, err) } - vd.DecRef() + vd.DecRef(ctx) vd = nextVD filePathBuilder.WriteString(name) filePathBuilder.WriteByte('/') @@ -448,11 +448,11 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) { Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, Mode: 0644, }) - vd.DecRef() + vd.DecRef(ctx) if err != nil { b.Fatalf("failed to create file %q: %v", filename, err) } - fd.DecRef() + fd.DecRef(ctx) filePathBuilder.WriteString(filename) filePath := filePathBuilder.String() diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go index 0a1ad4765..78b4fc5be 100644 --- a/pkg/sentry/fsimpl/tmpfs/directory.go +++ b/pkg/sentry/fsimpl/tmpfs/directory.go @@ -95,7 +95,7 @@ type directoryFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *directoryFD) Release() { +func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() @@ -110,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fs := fd.filesystem() dir := fd.inode().impl.(*directory) - defer fd.dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index ef210a69b..fb77f95cc 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -40,7 +40,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // stepLocked is loosely analogous to fs/namei.c:walk_component(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { +func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) { dir, ok := d.inode.impl.(*directory) if !ok { return nil, syserror.ENOTDIR @@ -55,13 +55,13 @@ afterSymlink: return d, nil } if name == ".." { - if isRoot, err := rp.CheckRoot(&d.vfsd); err != nil { + if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, err } else if isRoot || d.parent == nil { rp.Advance() return d, nil } - if err := rp.CheckMount(&d.parent.vfsd); err != nil { + if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, err } rp.Advance() @@ -74,7 +74,7 @@ afterSymlink: if !ok { return nil, syserror.ENOENT } - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { @@ -98,9 +98,9 @@ afterSymlink: // fs/namei.c:path_parentat(). // // Preconditions: filesystem.mu must be locked. !rp.Done(). -func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { +func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -118,10 +118,10 @@ func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*directory, error) { // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. -func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { +func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { - next, err := stepLocked(rp, d) + next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } @@ -141,10 +141,10 @@ func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) { // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -182,7 +182,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa if dir { ev |= linux.IN_ISDIR } - parentDir.inode.watches.Notify(name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } @@ -191,7 +191,7 @@ func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(pa func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return err } @@ -202,7 +202,7 @@ func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -222,7 +222,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() - dir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return nil, err } @@ -232,7 +232,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { if rp.Mount() != vd.Mount() { return syserror.EXDEV } @@ -251,7 +251,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. return syserror.EMLINK } i.incLinksLocked() - i.watches.Notify("", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) + i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) parentDir.insertChildLocked(fs.newDentry(i), name) return nil }) @@ -259,7 +259,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { - return fs.doCreateAt(rp, true /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() if parentDir.inode.nlink == maxLinks { return syserror.EMLINK @@ -273,7 +273,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() var childInode *inode switch opts.Mode.FileType() { @@ -308,7 +308,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -330,7 +330,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf return start.open(ctx, rp, &opts, false /* afterCreate */) } afterTrailingSymlink: - parentDir, err := walkParentDirLocked(rp, start) + parentDir, err := walkParentDirLocked(ctx, rp, start) if err != nil { return nil, err } @@ -368,7 +368,7 @@ afterTrailingSymlink: if err != nil { return nil, err } - parentDir.inode.watches.Notify(name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) parentDir.inode.touchCMtime() return fd, nil } @@ -376,7 +376,7 @@ afterTrailingSymlink: return nil, syserror.EEXIST } // Is the file mounted over? - if err := rp.CheckMount(&child.vfsd); err != nil { + if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, err } // Do we need to resolve a trailing symlink? @@ -445,7 +445,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -467,7 +467,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa // Resolve newParent first to verify that it's on this Mount. fs.mu.Lock() defer fs.mu.Unlock() - newParentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -555,7 +555,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa } vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd @@ -566,17 +566,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParentDir.inode.decLinksLocked() // from replaced's ".." + newParentDir.inode.decLinksLocked(ctx) // from replaced's ".." } - replaced.inode.decLinksLocked() + replaced.inode.decLinksLocked(ctx) } oldParentDir.removeChildLocked(renamed) newParentDir.insertChildLocked(renamed, newName) - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) oldParentDir.inode.touchCMtime() if oldParentDir != newParentDir { if renamed.inode.isDir() { - oldParentDir.inode.decLinksLocked() + oldParentDir.inode.decLinksLocked(ctx) newParentDir.inode.incLinksLocked() } newParentDir.inode.touchCMtime() @@ -591,7 +591,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -626,17 +626,17 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } parentDir.removeChildLocked(child) - parentDir.inode.watches.Notify(name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) + parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) // Remove links for child, child/., and child/.. - child.inode.decLinksLocked() - child.inode.decLinksLocked() - parentDir.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + child.inode.decLinksLocked(ctx) + parentDir.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -644,7 +644,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -656,7 +656,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts fs.mu.RUnlock() if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -665,7 +665,7 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return linux.Statx{}, err } @@ -678,7 +678,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { fs.mu.RLock() defer fs.mu.RUnlock() - if _, err := resolveLocked(rp); err != nil { + if _, err := resolveLocked(ctx, rp); err != nil { return linux.Statfs{}, err } statfs := linux.Statfs{ @@ -695,7 +695,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { - return fs.doCreateAt(rp, false /* dir */, func(parentDir *directory, name string) error { + return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { creds := rp.Credentials() child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target)) parentDir.insertChildLocked(child, name) @@ -707,7 +707,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.mu.Unlock() - parentDir, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry)) + parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } @@ -738,7 +738,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error defer mnt.EndWrite() vfsObj := rp.VirtualFilesystem() mntns := vfs.MountNamespaceFromContext(ctx) - defer mntns.DecRef() + defer mntns.DecRef(ctx) if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -746,11 +746,11 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error // Generate inotify events. Note that this must take place before the link // count of the child is decremented, or else the watches may be dropped // before these events are added. - vfs.InotifyRemoveChild(&child.inode.watches, &parentDir.inode.watches, name) + vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) parentDir.removeChildLocked(child) - child.inode.decLinksLocked() - vfsObj.CommitDeleteDentry(&child.vfsd) + child.inode.decLinksLocked(ctx) + vfsObj.CommitDeleteDentry(ctx, &child.vfsd) parentDir.inode.touchCMtime() return nil } @@ -759,7 +759,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -778,7 +778,7 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } @@ -789,7 +789,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { fs.mu.RLock() defer fs.mu.RUnlock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { return "", err } @@ -799,7 +799,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt // SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -810,14 +810,14 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { fs.mu.RLock() - d, err := resolveLocked(rp) + d, err := resolveLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return err @@ -828,7 +828,7 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, } fs.mu.RUnlock() - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go index 1614f2c39..ec2701d8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go +++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go @@ -32,7 +32,7 @@ const fileName = "mypipe" func TestSeparateFDs(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side. This is done in a concurrently because opening // One end the pipe blocks until the other end is opened. @@ -55,13 +55,13 @@ func TestSeparateFDs(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) rfd, ok := <-rfdchan if !ok { t.Fatalf("failed to open pipe for reading %q", fileName) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) const msg = "vamos azul" checkEmpty(ctx, t, rfd) @@ -71,7 +71,7 @@ func TestSeparateFDs(t *testing.T) { func TestNonblockingRead(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the read side as nonblocking. pop := vfs.PathOperation{ @@ -85,7 +85,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for reading %q: %v", fileName, err) } - defer rfd.DecRef() + defer rfd.DecRef(ctx) // Open the write side. openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY} @@ -93,7 +93,7 @@ func TestNonblockingRead(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer wfd.DecRef() + defer wfd.DecRef(ctx) const msg = "geh blau" checkEmpty(ctx, t, rfd) @@ -103,7 +103,7 @@ func TestNonblockingRead(t *testing.T) { func TestNonblockingWriteError(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the write side as nonblocking, which should return ENXIO. pop := vfs.PathOperation{ @@ -121,7 +121,7 @@ func TestNonblockingWriteError(t *testing.T) { func TestSingleFD(t *testing.T) { ctx, creds, vfsObj, root := setup(t) - defer root.DecRef() + defer root.DecRef(ctx) // Open the pipe as readable and writable. pop := vfs.PathOperation{ @@ -135,7 +135,7 @@ func TestSingleFD(t *testing.T) { if err != nil { t.Fatalf("failed to open pipe for writing %q: %v", fileName, err) } - defer fd.DecRef() + defer fd.DecRef(ctx) const msg = "forza blu" checkEmpty(ctx, t, fd) @@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy // Create VFS. vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go index abbaa5d60..0710b65db 100644 --- a/pkg/sentry/fsimpl/tmpfs/regular_file.go +++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go @@ -270,7 +270,7 @@ type regularFileFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *regularFileFD) Release() { +func (fd *regularFileFD) Release(context.Context) { // noop } diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 2545d88e9..68e615e8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -185,7 +185,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt case linux.S_IFDIR: root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry default: - fs.vfsfs.DecRef() + fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } return &fs.vfsfs, &root.vfsd, nil @@ -197,7 +197,7 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au } // Release implements vfs.FilesystemImpl.Release. -func (fs *filesystem) Release() { +func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } @@ -249,12 +249,12 @@ func (d *dentry) TryIncRef() bool { } // DecRef implements vfs.DentryImpl.DecRef. -func (d *dentry) DecRef() { - d.inode.decRef() +func (d *dentry) DecRef(ctx context.Context) { + d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. -func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.inode.isDir() { events |= linux.IN_ISDIR } @@ -266,9 +266,9 @@ func (d *dentry) InotifyWithParent(events, cookie uint32, et vfs.EventType) { d.inode.fs.mu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { - d.parent.inode.watches.Notify(d.name, events, cookie, et, deleted) + d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) } - d.inode.watches.Notify("", events, cookie, et, deleted) + d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) d.inode.fs.mu.RUnlock() } @@ -278,7 +278,7 @@ func (d *dentry) Watches() *vfs.Watches { } // OnZeroWatches implements vfs.Dentry.OnZeroWatches. -func (d *dentry) OnZeroWatches() {} +func (d *dentry) OnZeroWatches(context.Context) {} // inode represents a filesystem object. type inode struct { @@ -359,12 +359,12 @@ func (i *inode) incLinksLocked() { // remove a reference on i as well. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. -func (i *inode) decLinksLocked() { +func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { - i.decRef() + i.decRef(ctx) } } @@ -386,9 +386,9 @@ func (i *inode) tryIncRef() bool { } } -func (i *inode) decRef() { +func (i *inode) decRef(ctx context.Context) { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { - i.watches.HandleDeletion() + i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any @@ -701,7 +701,7 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { - d.InotifyWithParent(ev, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } @@ -724,7 +724,7 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } @@ -736,13 +736,13 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error { } // Generate inotify events. - d.InotifyWithParent(linux.IN_ATTRIB, 0, vfs.InodeEvent) + d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // NewMemfd creates a new tmpfs regular file and file description that can back // an anonymous fd created by memfd_create. -func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) { +func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { fs, ok := mount.Filesystem().Impl().(*filesystem) if !ok { panic("NewMemfd() called with non-tmpfs mount") @@ -757,7 +757,7 @@ func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name s } d := fs.newDentry(inode) - defer d.DecRef() + defer d.DecRef(ctx) d.name = name // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go index a240fb276..6f3e3ae6f 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go @@ -34,7 +34,7 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr creds := auth.CredentialsFromContext(ctx) vfsObj := &vfs.VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err) } @@ -47,8 +47,8 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr } root := mntns.Root() return vfsObj, root, func() { - root.DecRef() - mntns.DecRef() + root.DecRef(ctx) + mntns.DecRef(ctx) }, nil } diff --git a/pkg/sentry/kernel/abstract_socket_namespace.go b/pkg/sentry/kernel/abstract_socket_namespace.go index 920fe4329..52ed5cea2 100644 --- a/pkg/sentry/kernel/abstract_socket_namespace.go +++ b/pkg/sentry/kernel/abstract_socket_namespace.go @@ -17,6 +17,7 @@ package kernel import ( "syscall" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" @@ -31,7 +32,7 @@ type abstractEndpoint struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (e *abstractEndpoint) WeakRefGone() { +func (e *abstractEndpoint) WeakRefGone(context.Context) { e.ns.mu.Lock() if e.ns.endpoints[e.name].ep == e.ep { delete(e.ns.endpoints, e.name) @@ -64,9 +65,9 @@ type boundEndpoint struct { } // Release implements transport.BoundEndpoint.Release. -func (e *boundEndpoint) Release() { - e.rc.DecRef() - e.BoundEndpoint.Release() +func (e *boundEndpoint) Release(ctx context.Context) { + e.rc.DecRef(ctx) + e.BoundEndpoint.Release(ctx) } // BoundEndpoint retrieves the endpoint bound to the given name. The return @@ -93,13 +94,13 @@ func (a *AbstractSocketNamespace) BoundEndpoint(name string) transport.BoundEndp // // When the last reference managed by rc is dropped, ep may be removed from the // namespace. -func (a *AbstractSocketNamespace) Bind(name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { +func (a *AbstractSocketNamespace) Bind(ctx context.Context, name string, ep transport.BoundEndpoint, rc refs.RefCounter) error { a.mu.Lock() defer a.mu.Unlock() if ep, ok := a.endpoints[name]; ok { if rc := ep.wr.Get(); rc != nil { - rc.DecRef() + rc.DecRef(ctx) return syscall.EADDRINUSE } } diff --git a/pkg/sentry/kernel/epoll/epoll.go b/pkg/sentry/kernel/epoll/epoll.go index 4c0f1e41f..15519f0df 100644 --- a/pkg/sentry/kernel/epoll/epoll.go +++ b/pkg/sentry/kernel/epoll/epoll.go @@ -76,8 +76,8 @@ type pollEntry struct { // WeakRefGone implements refs.WeakRefUser.WeakRefGone. // weakReferenceGone is called when the file in the weak reference is destroyed. // The poll entry is removed in response to this. -func (p *pollEntry) WeakRefGone() { - p.epoll.RemoveEntry(p.id) +func (p *pollEntry) WeakRefGone(ctx context.Context) { + p.epoll.RemoveEntry(ctx, p.id) } // EventPoll holds all the state associated with an event poll object, that is, @@ -144,14 +144,14 @@ func NewEventPoll(ctx context.Context) *fs.File { // name matches fs/eventpoll.c:epoll_create1. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), fmt.Sprintf("anon_inode:[eventpoll]")) // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{}, &EventPoll{ files: make(map[FileIdentifier]*pollEntry), }) } // Release implements fs.FileOperations.Release. -func (e *EventPoll) Release() { +func (e *EventPoll) Release(ctx context.Context) { // We need to take the lock now because files may be attempting to // remove entries in parallel if they get destroyed. e.mu.Lock() @@ -160,7 +160,7 @@ func (e *EventPoll) Release() { // Go through all entries and clean up. for _, entry := range e.files { entry.id.File.EventUnregister(&entry.waiter) - entry.file.Drop() + entry.file.Drop(ctx) } e.files = nil } @@ -423,7 +423,7 @@ func (e *EventPoll) UpdateEntry(id FileIdentifier, flags EntryFlags, mask waiter } // RemoveEntry a files from the collection of observed files. -func (e *EventPoll) RemoveEntry(id FileIdentifier) error { +func (e *EventPoll) RemoveEntry(ctx context.Context, id FileIdentifier) error { e.mu.Lock() defer e.mu.Unlock() @@ -445,7 +445,7 @@ func (e *EventPoll) RemoveEntry(id FileIdentifier) error { // Remove file from map, and drop weak reference. delete(e.files, id) - entry.file.Drop() + entry.file.Drop(ctx) return nil } diff --git a/pkg/sentry/kernel/epoll/epoll_test.go b/pkg/sentry/kernel/epoll/epoll_test.go index 22630e9c5..55b505593 100644 --- a/pkg/sentry/kernel/epoll/epoll_test.go +++ b/pkg/sentry/kernel/epoll/epoll_test.go @@ -26,7 +26,8 @@ func TestFileDestroyed(t *testing.T) { f := filetest.NewTestFile(t) id := FileIdentifier{f, 12} - efile := NewEventPoll(contexttest.Context(t)) + ctx := contexttest.Context(t) + efile := NewEventPoll(ctx) e := efile.FileOperations.(*EventPoll) if err := e.AddEntry(id, 0, waiter.EventIn, [2]int32{}); err != nil { t.Fatalf("addEntry failed: %v", err) @@ -44,7 +45,7 @@ func TestFileDestroyed(t *testing.T) { } // Destroy the file. Check that we get no more events. - f.DecRef() + f.DecRef(ctx) evt = e.ReadEvents(1) if len(evt) != 0 { diff --git a/pkg/sentry/kernel/eventfd/eventfd.go b/pkg/sentry/kernel/eventfd/eventfd.go index 87951adeb..bbf568dfc 100644 --- a/pkg/sentry/kernel/eventfd/eventfd.go +++ b/pkg/sentry/kernel/eventfd/eventfd.go @@ -70,7 +70,7 @@ func New(ctx context.Context, initVal uint64, semMode bool) *fs.File { // name matches fs/eventfd.c:eventfd_file_create. dirent := fs.NewDirent(ctx, anon.NewInode(ctx), "anon_inode:[eventfd]") // Release the initial dirent reference after NewFile takes a reference. - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{ val: initVal, semMode: semMode, @@ -106,7 +106,7 @@ func (e *EventOperations) HostFD() (int, error) { } // Release implements fs.FileOperations.Release. -func (e *EventOperations) Release() { +func (e *EventOperations) Release(context.Context) { e.mu.Lock() defer e.mu.Unlock() if e.hostfd >= 0 { diff --git a/pkg/sentry/kernel/fd_table.go b/pkg/sentry/kernel/fd_table.go index 4b7d234a4..ce53af69b 100644 --- a/pkg/sentry/kernel/fd_table.go +++ b/pkg/sentry/kernel/fd_table.go @@ -98,7 +98,7 @@ type FDTable struct { func (f *FDTable) saveDescriptorTable() map[int32]descriptor { m := make(map[int32]descriptor) - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { m[fd] = descriptor{ file: file, fileVFS2: fileVFS2, @@ -109,6 +109,7 @@ func (f *FDTable) saveDescriptorTable() map[int32]descriptor { } func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { + ctx := context.Background() f.init() // Initialize table. for fd, d := range m { f.setAll(fd, d.file, d.fileVFS2, d.flags) @@ -118,9 +119,9 @@ func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) { // reference taken by set above. switch { case d.file != nil: - d.file.DecRef() + d.file.DecRef(ctx) case d.fileVFS2 != nil: - d.fileVFS2.DecRef() + d.fileVFS2.DecRef(ctx) } } } @@ -144,14 +145,15 @@ func (f *FDTable) drop(file *fs.File) { d.InotifyEvent(ev, 0) // Drop the table reference. - file.DecRef() + file.DecRef(context.Background()) } // dropVFS2 drops the table reference. func (f *FDTable) dropVFS2(file *vfs.FileDescription) { // Release any POSIX lock possibly held by the FDTable. Range {0, 0} means the // entire file. - err := file.UnlockPOSIX(context.Background(), f, 0, 0, linux.SEEK_SET) + ctx := context.Background() + err := file.UnlockPOSIX(ctx, f, 0, 0, linux.SEEK_SET) if err != nil && err != syserror.ENOLCK { panic(fmt.Sprintf("UnlockPOSIX failed: %v", err)) } @@ -161,10 +163,10 @@ func (f *FDTable) dropVFS2(file *vfs.FileDescription) { if file.IsWritable() { ev = linux.IN_CLOSE_WRITE } - file.Dentry().InotifyWithParent(ev, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(ctx, ev, 0, vfs.PathEvent) // Drop the table's reference. - file.DecRef() + file.DecRef(ctx) } // NewFDTable allocates a new FDTable that may be used by tasks in k. @@ -175,15 +177,15 @@ func (k *Kernel) NewFDTable() *FDTable { } // destroy removes all of the file descriptors from the map. -func (f *FDTable) destroy() { - f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool { +func (f *FDTable) destroy(ctx context.Context) { + f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool { return true }) } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FDTable) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FDTable) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Size returns the number of file descriptor slots currently allocated. @@ -195,7 +197,7 @@ func (f *FDTable) Size() int { // forEach iterates over all non-nil files in sorted order. // // It is the caller's responsibility to acquire an appropriate lock. -func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { +func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) { // retries tracks the number of failed TryIncRef attempts for the same FD. retries := 0 fd := int32(0) @@ -214,7 +216,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, file, nil, flags) - file.DecRef() + file.DecRef(ctx) case fileVFS2 != nil: if !fileVFS2.TryIncRef() { retries++ @@ -224,7 +226,7 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes continue // Race caught. } fn(fd, nil, fileVFS2, flags) - fileVFS2.DecRef() + fileVFS2.DecRef(ctx) } retries = 0 fd++ @@ -234,7 +236,8 @@ func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDes // String is a stringer for FDTable. func (f *FDTable) String() string { var buf strings.Builder - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + ctx := context.Background() + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { switch { case file != nil: n, _ := file.Dirent.FullName(nil /* root */) @@ -242,7 +245,7 @@ func (f *FDTable) String() string { case fileVFS2 != nil: vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem() - name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) + name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry()) if err != nil { fmt.Fprintf(&buf, "\n", err) return @@ -541,9 +544,9 @@ func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) { // // Precondition: The caller must be running on the task goroutine, or Task.mu // must be locked. -func (f *FDTable) GetFDs() []int32 { +func (f *FDTable) GetFDs(ctx context.Context) []int32 { fds := make([]int32, 0, int(atomic.LoadInt32(&f.used))) - f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) { fds = append(fds, fd) }) return fds @@ -552,9 +555,9 @@ func (f *FDTable) GetFDs() []int32 { // GetRefs returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. -func (f *FDTable) GetRefs() []*fs.File { +func (f *FDTable) GetRefs(ctx context.Context) []*fs.File { files := make([]*fs.File, 0, f.Size()) - f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -564,9 +567,9 @@ func (f *FDTable) GetRefs() []*fs.File { // GetRefsVFS2 returns a stable slice of references to all files and bumps the // reference count on each. The caller must use DecRef on each reference when // they're done using the slice. -func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { +func (f *FDTable) GetRefsVFS2(ctx context.Context) []*vfs.FileDescription { files := make([]*vfs.FileDescription, 0, f.Size()) - f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { + f.forEach(ctx, func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) { file.IncRef() // Acquire a reference for caller. files = append(files, file) }) @@ -574,10 +577,10 @@ func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription { } // Fork returns an independent FDTable. -func (f *FDTable) Fork() *FDTable { +func (f *FDTable) Fork(ctx context.Context) *FDTable { clone := f.k.NewFDTable() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { // The set function here will acquire an appropriate table // reference for the clone. We don't need anything else. switch { @@ -622,11 +625,11 @@ func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) { } // RemoveIf removes all FDs where cond is true. -func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { +func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) { f.mu.Lock() defer f.mu.Unlock() - f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { + f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) { if cond(file, fileVFS2, flags) { f.set(fd, nil, FDFlags{}) // Clear from table. // Update current available position. diff --git a/pkg/sentry/kernel/fd_table_test.go b/pkg/sentry/kernel/fd_table_test.go index 29f95a2c4..e3f30ba2a 100644 --- a/pkg/sentry/kernel/fd_table_test.go +++ b/pkg/sentry/kernel/fd_table_test.go @@ -154,7 +154,7 @@ func TestFDTable(t *testing.T) { if ref == nil { t.Fatalf("fdTable.Remove(1) for an existing FD: failed, want success") } - ref.DecRef() + ref.DecRef(ctx) if ref, _ := fdTable.Remove(1); ref != nil { t.Fatalf("r.Remove(1) for a removed FD: got success, want failure") @@ -191,7 +191,7 @@ func BenchmarkFDLookupAndDecRef(b *testing.B) { b.StartTimer() // Benchmark. for i := 0; i < b.N; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }) } @@ -219,7 +219,7 @@ func BenchmarkFDLookupAndDecRefConcurrent(b *testing.B) { defer wg.Done() for i := 0; i < each; i++ { tf, _ := fdTable.Get(fds[i%len(fds)]) - tf.DecRef() + tf.DecRef(ctx) } }() } diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go index 47f78df9a..8f2d36d5a 100644 --- a/pkg/sentry/kernel/fs_context.go +++ b/pkg/sentry/kernel/fs_context.go @@ -17,6 +17,7 @@ package kernel import ( "fmt" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" @@ -89,28 +90,28 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms. -func (f *FSContext) destroy() { +func (f *FSContext) destroy(ctx context.Context) { // Hold f.mu so that we don't race with RootDirectory() and // WorkingDirectory(). f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { - f.rootVFS2.DecRef() + f.rootVFS2.DecRef(ctx) f.rootVFS2 = vfs.VirtualDentry{} - f.cwdVFS2.DecRef() + f.cwdVFS2.DecRef(ctx) f.cwdVFS2 = vfs.VirtualDentry{} } else { - f.root.DecRef() + f.root.DecRef(ctx) f.root = nil - f.cwd.DecRef() + f.cwd.DecRef(ctx) f.cwd = nil } } // DecRef implements RefCounter.DecRef with destructor f.destroy. -func (f *FSContext) DecRef() { - f.DecRefWithDestructor(f.destroy) +func (f *FSContext) DecRef(ctx context.Context) { + f.DecRefWithDestructor(ctx, f.destroy) } // Fork forks this FSContext. @@ -165,7 +166,7 @@ func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { +func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } @@ -180,21 +181,21 @@ func (f *FSContext) SetWorkingDirectory(d *fs.Dirent) { old := f.cwd f.cwd = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // // This is not a valid call after destroy. -func (f *FSContext) SetWorkingDirectoryVFS2(d vfs.VirtualDentry) { +func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // RootDirectory returns the current filesystem root. @@ -226,7 +227,7 @@ func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { // This will take an extra reference on the Dirent. // // This is not a valid call after free. -func (f *FSContext) SetRootDirectory(d *fs.Dirent) { +func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } @@ -241,13 +242,13 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) { old := f.root f.root = d d.IncRef() - old.DecRef() + old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // // This is not a valid call after free. -func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { +func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } @@ -263,7 +264,7 @@ func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) { vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() - old.DecRef() + old.DecRef(ctx) } // Umask returns the current umask. diff --git a/pkg/sentry/kernel/futex/BUILD b/pkg/sentry/kernel/futex/BUILD index c5021f2db..daa2dae76 100644 --- a/pkg/sentry/kernel/futex/BUILD +++ b/pkg/sentry/kernel/futex/BUILD @@ -51,6 +51,7 @@ go_test( srcs = ["futex_test.go"], library = ":futex", deps = [ + "//pkg/context", "//pkg/sync", "//pkg/usermem", ], diff --git a/pkg/sentry/kernel/futex/futex.go b/pkg/sentry/kernel/futex/futex.go index bcc1b29a8..e4dcc4d40 100644 --- a/pkg/sentry/kernel/futex/futex.go +++ b/pkg/sentry/kernel/futex/futex.go @@ -19,6 +19,7 @@ package futex import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -66,9 +67,9 @@ type Key struct { Offset uint64 } -func (k *Key) release() { +func (k *Key) release(t Target) { if k.MappingIdentity != nil { - k.MappingIdentity.DecRef() + k.MappingIdentity.DecRef(t) } k.Mappable = nil k.MappingIdentity = nil @@ -94,6 +95,8 @@ func (k *Key) matches(k2 *Key) bool { // Target abstracts memory accesses and keys. type Target interface { + context.Context + // SwapUint32 gives access to usermem.IO.SwapUint32. SwapUint32(addr usermem.Addr, new uint32) (uint32, error) @@ -296,7 +299,7 @@ func (b *bucket) wakeWaiterLocked(w *Waiter) { // bucket "to". // // Preconditions: b and to must be locked. -func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { +func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { @@ -308,7 +311,7 @@ func (b *bucket) requeueLocked(to *bucket, key, nkey *Key, n int) int { requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) - requeued.key.release() + requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) @@ -456,7 +459,7 @@ func (m *Manager) Wake(t Target, addr usermem.Addr, private bool, bitmask uint32 r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() - k.release() + k.release(t) return r, nil } @@ -465,12 +468,12 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -488,7 +491,7 @@ func (m *Manager) doRequeue(t Target, addr, naddr usermem.Addr, private bool, ch done := b1.wakeLocked(&k1, ^uint32(0), nwake) // Requeue the number required. - b1.requeueLocked(b2, &k1, &k2, nreq) + b1.requeueLocked(t, b2, &k1, &k2, nreq) return done, nil } @@ -515,12 +518,12 @@ func (m *Manager) WakeOp(t Target, addr1, addr2 usermem.Addr, private bool, nwak if err != nil { return 0, err } - defer k1.release() + defer k1.release(t) k2, err := getKey(t, addr2, private) if err != nil { return 0, err } - defer k2.release() + defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer b1.mu.Unlock() @@ -571,7 +574,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // Perform our atomic check. if err := check(t, addr, val); err != nil { b.mu.Unlock() - w.key.release() + w.key.release(t) return err } @@ -585,7 +588,7 @@ func (m *Manager) WaitPrepare(w *Waiter, t Target, addr usermem.Addr, private bo // WaitComplete must be called when a Waiter previously added by WaitPrepare is // no longer eligible to be woken. -func (m *Manager) WaitComplete(w *Waiter) { +func (m *Manager) WaitComplete(w *Waiter, t Target) { // Remove w from the bucket it's in. for { b := w.bucket.Load() @@ -617,7 +620,7 @@ func (m *Manager) WaitComplete(w *Waiter) { } // Release references held by the waiter. - w.key.release() + w.key.release(t) } // LockPI attempts to lock the futex following the Priority-inheritance futex @@ -648,13 +651,13 @@ func (m *Manager) LockPI(w *Waiter, t Target, addr usermem.Addr, tid uint32, pri success, err := m.lockPILocked(w, t, addr, tid, b, try) if err != nil { - w.key.release() + w.key.release(t) b.mu.Unlock() return false, err } if success || try { // Release waiter if it's not going to be a wait. - w.key.release() + w.key.release(t) } b.mu.Unlock() return success, nil @@ -730,7 +733,7 @@ func (m *Manager) UnlockPI(t Target, addr usermem.Addr, tid uint32, private bool err = m.unlockPILocked(t, addr, tid, b, &k) - k.release() + k.release(t) b.mu.Unlock() return err } diff --git a/pkg/sentry/kernel/futex/futex_test.go b/pkg/sentry/kernel/futex/futex_test.go index 7c5c7665b..d0128c548 100644 --- a/pkg/sentry/kernel/futex/futex_test.go +++ b/pkg/sentry/kernel/futex/futex_test.go @@ -22,6 +22,7 @@ import ( "testing" "unsafe" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) @@ -29,28 +30,33 @@ import ( // testData implements the Target interface, and allows us to // treat the address passed for futex operations as an index in // a byte slice for testing simplicity. -type testData []byte +type testData struct { + context.Context + data []byte +} const sizeofInt32 = 4 func newTestData(size uint) testData { - return make([]byte, size) + return testData{ + data: make([]byte, size), + } } func (t testData) SwapUint32(addr usermem.Addr, new uint32) (uint32, error) { - val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t[addr])), new) + val := atomic.SwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), new) return val, nil } func (t testData) CompareAndSwapUint32(addr usermem.Addr, old, new uint32) (uint32, error) { - if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t[addr])), old, new) { + if atomic.CompareAndSwapUint32((*uint32)(unsafe.Pointer(&t.data[addr])), old, new) { return old, nil } - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) LoadUint32(addr usermem.Addr) (uint32, error) { - return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t[addr]))), nil + return atomic.LoadUint32((*uint32)(unsafe.Pointer(&t.data[addr]))), nil } func (t testData) GetSharedKey(addr usermem.Addr) (Key, error) { @@ -83,7 +89,7 @@ func TestFutexWake(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup. if n, err := m.Wake(d, 0, private, ^uint32(0), 1); err != nil || n != 1 { @@ -106,7 +112,7 @@ func TestFutexWakeBitmask(t *testing.T) { // Start waiting for wakeup. w := newPreparedTestWaiter(t, m, d, 0, private, 0, 0x0000ffff) - defer m.WaitComplete(w) + defer m.WaitComplete(w, d) // Perform a wakeup using the wrong bitmask. if n, err := m.Wake(d, 0, private, 0xffff0000, 1); err != nil || n != 0 { @@ -141,7 +147,7 @@ func TestFutexWakeTwo(t *testing.T) { var ws [3]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform two wakeups. @@ -174,9 +180,9 @@ func TestFutexWakeUnrelated(t *testing.T) { // Start two waiters waiting for wakeup on different addresses. w1 := newPreparedTestWaiter(t, m, d, 0*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 1*sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform two wakeups on the second address. if n, err := m.Wake(d, 1*sizeofInt32, private, ^uint32(0), 2); err != nil || n != 1 { @@ -216,9 +222,9 @@ func TestWakeOpFirstNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address 0. if n, err := m.WakeOp(d, 0, sizeofInt32, private, 10, 0, 0); err != nil || n != 2 { @@ -244,9 +250,9 @@ func TestWakeOpSecondNonEmpty(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(0), which should succeed). @@ -273,9 +279,9 @@ func TestWakeOpSecondNonEmptyFailingOp(t *testing.T) { // Add two waiters on address sizeofInt32. w1 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Perform 10 wakeups on address sizeofInt32 (contingent on // d.Op(1), which should fail). @@ -302,15 +308,15 @@ func TestWakeOpAllNonEmpty(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(0), which @@ -344,15 +350,15 @@ func TestWakeOpAllNonEmptyFailingOp(t *testing.T) { // Add two waiters on address 0. w1 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w1) + defer m.WaitComplete(w1, d) w2 := newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(w2) + defer m.WaitComplete(w2, d) // Add two waiters on address sizeofInt32. w3 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w3) + defer m.WaitComplete(w3, d) w4 := newPreparedTestWaiter(t, m, d, sizeofInt32, private, 0, ^uint32(0)) - defer m.WaitComplete(w4) + defer m.WaitComplete(w4, d) // Perform 10 wakeups on address 0 (unconditionally), and 10 // wakeups on address sizeofInt32 (contingent on d.Op(1), which @@ -388,7 +394,7 @@ func TestWakeOpSameAddress(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -422,7 +428,7 @@ func TestWakeOpSameAddressFailingOp(t *testing.T) { var ws [4]*Waiter for i := range ws { ws[i] = newPreparedTestWaiter(t, m, d, 0, private, 0, ^uint32(0)) - defer m.WaitComplete(ws[i]) + defer m.WaitComplete(ws[i], d) } // Perform 1 wakeup on address 0 (unconditionally), and 1 wakeup @@ -472,7 +478,7 @@ func (t *testMutex) Lock() { for { // Attempt to grab the lock. if atomic.CompareAndSwapUint32( - (*uint32)(unsafe.Pointer(&t.d[t.a])), + (*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked, testMutexLocked) { // Lock held. @@ -490,7 +496,7 @@ func (t *testMutex) Lock() { panic("WaitPrepare returned unexpected error: " + err.Error()) } <-w.C - t.m.WaitComplete(w) + t.m.WaitComplete(w, t.d) } } @@ -498,7 +504,7 @@ func (t *testMutex) Lock() { // This will notify any waiters via the futex manager. func (t *testMutex) Unlock() { // Unlock. - atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d[t.a])), testMutexUnlocked) + atomic.StoreUint32((*uint32)(unsafe.Pointer(&t.d.data[t.a])), testMutexUnlocked) // Notify all waiters. t.m.Wake(t.d, t.a, true, ^uint32(0), math.MaxInt32) diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 15dae0f5b..316df249d 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -376,7 +376,8 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.netlinkPorts = port.New() if VFS2Enabled { - if err := k.vfs.Init(); err != nil { + ctx := k.SupervisorContext() + if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } @@ -384,19 +385,19 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } - defer pipeFilesystem.DecRef() + defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount - tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) + tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } - defer tmpfsFilesystem.DecRef() - defer tmpfsRoot.DecRef() + defer tmpfsFilesystem.DecRef(ctx) + defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) @@ -407,7 +408,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } - defer socketFilesystem.DecRef() + defer socketFilesystem.DecRef(ctx) socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) @@ -430,8 +431,8 @@ func (k *Kernel) SaveTo(w wire.Writer) error { defer k.extMu.Unlock() // Stop time. - k.pauseTimeLocked() - defer k.resumeTimeLocked() + k.pauseTimeLocked(ctx) + defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() @@ -447,12 +448,12 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // Remove all epoll waiter objects from underlying wait queues. // NOTE: for programs to resume execution in future snapshot scenarios, // we will need to re-establish these waiter objects after saving. - k.tasks.unregisterEpollWaiters() + k.tasks.unregisterEpollWaiters(ctx) // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. - if err := k.flushMountSourceRefs(); err != nil { + if err := k.flushMountSourceRefs(ctx); err != nil { return err } @@ -505,7 +506,7 @@ func (k *Kernel) SaveTo(w wire.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. -func (k *Kernel) flushMountSourceRefs() error { +func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() @@ -521,7 +522,7 @@ func (k *Kernel) flushMountSourceRefs() error { // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. - return k.tasks.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) @@ -531,7 +532,7 @@ func (k *Kernel) flushMountSourceRefs() error { // each task. // // Precondition: Must be called with the kernel paused. -func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) (err error) { +func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. if VFS2Enabled { return nil @@ -544,7 +545,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) if t.fdTable == nil { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } @@ -555,7 +556,7 @@ func (ts *TaskSet) forEachFDPaused(f func(*fs.File, *vfs.FileDescription) error) func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - return ts.forEachFDPaused(func(file *fs.File, _ *vfs.FileDescription) error { + return ts.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } @@ -602,7 +603,7 @@ func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { return nil } -func (ts *TaskSet) unregisterEpollWaiters() { +func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { // TODO(gvisor.dev/issue/1663): Add save support for VFS2. if VFS2Enabled { return @@ -623,7 +624,7 @@ func (ts *TaskSet) unregisterEpollWaiters() { if _, ok := processed[t.fdTable]; ok { continue } - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } @@ -900,7 +901,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := args.MountNamespaceVFS2.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. wd := root // Default. @@ -918,7 +919,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) @@ -933,7 +934,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. - defer root.DecRef() + defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals @@ -944,7 +945,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } - defer wd.DecRef() + defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) @@ -1054,7 +1055,7 @@ func (k *Kernel) Start() error { // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. - k.resumeTimeLocked() + k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() @@ -1068,7 +1069,7 @@ func (k *Kernel) Start() error { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) pauseTimeLocked() { +func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { @@ -1090,7 +1091,7 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() @@ -1112,7 +1113,7 @@ func (k *Kernel) pauseTimeLocked() { // // Preconditions: Any task goroutines running in k must be stopped. k.extMu // must be locked. -func (k *Kernel) resumeTimeLocked() { +func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } @@ -1126,7 +1127,7 @@ func (k *Kernel) resumeTimeLocked() { } } if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() @@ -1511,7 +1512,7 @@ type SocketEntry struct { } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. -func (s *SocketEntry) WeakRefGone() { +func (s *SocketEntry) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() @@ -1600,7 +1601,7 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return vfs.VirtualDentry{} } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() - defer mntns.DecRef() + defer mntns.DecRef(ctx) // Root() takes a reference on the root dirent for us. return mntns.Root() case vfs.CtxMountNamespace: diff --git a/pkg/sentry/kernel/pipe/node.go b/pkg/sentry/kernel/pipe/node.go index 4b688c627..6497dc4ba 100644 --- a/pkg/sentry/kernel/pipe/node.go +++ b/pkg/sentry/kernel/pipe/node.go @@ -93,7 +93,7 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi if i.p.isNamed && !flags.NonBlocking && !i.p.HasWriters() { if !waitFor(&i.mu, &i.wWakeup, ctx) { - r.DecRef() + r.DecRef(ctx) return nil, syserror.ErrInterrupted } } @@ -111,12 +111,12 @@ func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.Fi // On a nonblocking, write-only open, the open fails with ENXIO if the // read side isn't open yet. if flags.NonBlocking { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ENXIO } if !waitFor(&i.mu, &i.rWakeup, ctx) { - w.DecRef() + w.DecRef(ctx) return nil, syserror.ErrInterrupted } } diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go index ab75a87ff..ce0db5583 100644 --- a/pkg/sentry/kernel/pipe/node_test.go +++ b/pkg/sentry/kernel/pipe/node_test.go @@ -167,7 +167,7 @@ func TestClosedReaderBlocksWriteOpen(t *testing.T) { f := NewInodeOperations(ctx, perms, newNamedPipe(t)) rFile, _ := testOpenOrDie(ctx, t, f, fs.FileFlags{Read: true, NonBlocking: true}, nil) - rFile.DecRef() + rFile.DecRef(ctx) wDone := make(chan struct{}) // This open for write should block because the reader is now gone. diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go index 79645d7d2..297e8f28f 100644 --- a/pkg/sentry/kernel/pipe/pipe.go +++ b/pkg/sentry/kernel/pipe/pipe.go @@ -152,7 +152,7 @@ func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs. d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino)) // The p.Open calls below will each take a reference on the Dirent. We // must drop the one we already have. - defer d.DecRef() + defer d.DecRef(ctx) return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true}) } diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go index bda739dbe..fe97e9800 100644 --- a/pkg/sentry/kernel/pipe/pipe_test.go +++ b/pkg/sentry/kernel/pipe/pipe_test.go @@ -27,8 +27,8 @@ import ( func TestPipeRW(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") wantN := int64(len(msg)) @@ -47,8 +47,8 @@ func TestPipeRW(t *testing.T) { func TestPipeReadBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, 65536, 4096) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) n, err := r.Readv(ctx, usermem.BytesIOSequence(make([]byte, 1))) if n != 0 || err != syserror.ErrWouldBlock { @@ -62,8 +62,8 @@ func TestPipeWriteBlock(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := make([]byte, capacity+1) n, err := w.Writev(ctx, usermem.BytesIOSequence(msg)) @@ -77,8 +77,8 @@ func TestPipeWriteUntilEnd(t *testing.T) { ctx := contexttest.Context(t) r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(ctx) + defer w.DecRef(ctx) msg := []byte("here's some bytes") diff --git a/pkg/sentry/kernel/pipe/pipe_util.go b/pkg/sentry/kernel/pipe/pipe_util.go index aacf28da2..6d58b682f 100644 --- a/pkg/sentry/kernel/pipe/pipe_util.go +++ b/pkg/sentry/kernel/pipe/pipe_util.go @@ -33,7 +33,7 @@ import ( // the old fs architecture. // Release cleans up the pipe's state. -func (p *Pipe) Release() { +func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() diff --git a/pkg/sentry/kernel/pipe/reader.go b/pkg/sentry/kernel/pipe/reader.go index 7724b4452..ac18785c0 100644 --- a/pkg/sentry/kernel/pipe/reader.go +++ b/pkg/sentry/kernel/pipe/reader.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Reader struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (r *Reader) Release() { +func (r *Reader) Release(context.Context) { r.Pipe.rClose() // Wake up writers. diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index 45d4c5fc1..28f998e45 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -101,7 +101,7 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // If this pipe is being opened as blocking and there's no // writer, we have to wait for a writer to open the other end. if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } @@ -112,12 +112,12 @@ func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, s // Non-blocking, write-only opens fail with ENXIO when the read // side isn't open yet. if statusFlags&linux.O_NONBLOCK != 0 { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.ENXIO } // Wait for a reader to open the other end. if !waitFor(&vp.mu, &vp.rWakeup, ctx) { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EINTR } } @@ -169,7 +169,7 @@ type VFSPipeFD struct { } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *VFSPipeFD) Release() { +func (fd *VFSPipeFD) Release(context.Context) { var event waiter.EventMask if fd.vfsfd.IsReadable() { fd.pipe.rClose() diff --git a/pkg/sentry/kernel/pipe/writer.go b/pkg/sentry/kernel/pipe/writer.go index 5bc6aa931..ef4b70ca3 100644 --- a/pkg/sentry/kernel/pipe/writer.go +++ b/pkg/sentry/kernel/pipe/writer.go @@ -15,6 +15,7 @@ package pipe import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/waiter" ) @@ -29,7 +30,7 @@ type Writer struct { // Release implements fs.FileOperations.Release. // // This overrides ReaderWriter.Release. -func (w *Writer) Release() { +func (w *Writer) Release(context.Context) { w.Pipe.wClose() // Wake up readers. diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go index 0e19286de..5c4c622c2 100644 --- a/pkg/sentry/kernel/sessions.go +++ b/pkg/sentry/kernel/sessions.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" @@ -70,7 +71,7 @@ func (s *Session) incRef() { // // Precondition: callers must hold TaskSet.mu for writing. func (s *Session) decRef() { - s.refs.DecRefWithDestructor(func() { + s.refs.DecRefWithDestructor(nil, func(context.Context) { // Remove translations from the leader. for ns := s.leader.pidns; ns != nil; ns = ns.parent { id := ns.sids[s] @@ -162,7 +163,7 @@ func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) { } alive := true - pg.refs.DecRefWithDestructor(func() { + pg.refs.DecRefWithDestructor(nil, func(context.Context) { alive = false // don't bother with handleOrphan. // Remove translations from the originator. diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 55b4c2cdb..13ec7afe0 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -431,8 +431,8 @@ func (s *Shm) InodeID() uint64 { // DecRef overrides refs.RefCount.DecRef with a destructor. // // Precondition: Caller must not hold s.mu. -func (s *Shm) DecRef() { - s.DecRefWithDestructor(s.destroy) +func (s *Shm) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, s.destroy) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm @@ -642,7 +642,7 @@ func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { return nil } -func (s *Shm) destroy() { +func (s *Shm) destroy(context.Context) { s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) } @@ -651,7 +651,7 @@ func (s *Shm) destroy() { // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). -func (s *Shm) MarkDestroyed() { +func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() @@ -663,7 +663,7 @@ func (s *Shm) MarkDestroyed() { // // N.B. This cannot be the final DecRef, as the caller also // holds a reference. - s.DecRef() + s.DecRef(ctx) return } } diff --git a/pkg/sentry/kernel/signalfd/signalfd.go b/pkg/sentry/kernel/signalfd/signalfd.go index 8243bb93e..b07e1c1bd 100644 --- a/pkg/sentry/kernel/signalfd/signalfd.go +++ b/pkg/sentry/kernel/signalfd/signalfd.go @@ -76,7 +76,7 @@ func New(ctx context.Context, mask linux.SignalSet) (*fs.File, error) { } // Release implements fs.FileOperations.Release. -func (s *SignalOperations) Release() {} +func (s *SignalOperations) Release(context.Context) {} // Mask returns the signal mask. func (s *SignalOperations) Mask() linux.SignalSet { diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index c4db05bd8..5aee699e7 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -730,17 +730,17 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() - defer realRoot.DecRef() + defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } return root != realRoot } diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index e1ecca99e..fe6ba6041 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -237,7 +237,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { var fdTable *FDTable if opts.NewFiles { - fdTable = t.fdTable.Fork() + fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() @@ -294,7 +294,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { nt, err := t.tg.pidns.owner.NewTask(cfg) if err != nil { if opts.NewThreadGroup { - tg.release() + tg.release(t) } return 0, nil, err } @@ -510,7 +510,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { var oldFDTable *FDTable if opts.NewFiles { oldFDTable = t.fdTable - t.fdTable = oldFDTable.Fork() + t.fdTable = oldFDTable.Fork(t) } var oldFSContext *FSContext if opts.NewFSContext { @@ -519,10 +519,10 @@ func (t *Task) Unshare(opts *SharingOptions) error { } t.mu.Unlock() if oldFDTable != nil { - oldFDTable.DecRef() + oldFDTable.DecRef(t) } if oldFSContext != nil { - oldFSContext.DecRef() + oldFSContext.DecRef(t) } return nil } diff --git a/pkg/sentry/kernel/task_exec.go b/pkg/sentry/kernel/task_exec.go index 7803b98d0..47c28b8ff 100644 --- a/pkg/sentry/kernel/task_exec.go +++ b/pkg/sentry/kernel/task_exec.go @@ -199,11 +199,11 @@ func (r *runSyscallAfterExecStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Unlock() oldFDTable := t.fdTable - t.fdTable = t.fdTable.Fork() - oldFDTable.DecRef() + t.fdTable = t.fdTable.Fork(t) + oldFDTable.DecRef(t) // Remove FDs with the CloseOnExec flag set. - t.fdTable.RemoveIf(func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { + t.fdTable.RemoveIf(t, func(_ *fs.File, _ *vfs.FileDescription, flags FDFlags) bool { return flags.CloseOnExec }) diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index 231ac548a..c165d6cb1 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -269,12 +269,12 @@ func (*runExitMain) execute(t *Task) taskRunState { // Releasing the MM unblocks a blocked CLONE_VFORK parent. t.unstopVforkParent() - t.fsContext.DecRef() - t.fdTable.DecRef() + t.fsContext.DecRef(t) + t.fdTable.DecRef(t) t.mu.Lock() if t.mountNamespaceVFS2 != nil { - t.mountNamespaceVFS2.DecRef() + t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } t.mu.Unlock() @@ -282,7 +282,7 @@ func (*runExitMain) execute(t *Task) taskRunState { // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release() + t.tg.release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_log.go b/pkg/sentry/kernel/task_log.go index eeccaa197..ab86ceedc 100644 --- a/pkg/sentry/kernel/task_log.go +++ b/pkg/sentry/kernel/task_log.go @@ -203,6 +203,6 @@ func (t *Task) traceExecEvent(tc *TaskContext) { trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>") return } - defer file.DecRef() + defer file.DecRef(t) trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t)) } diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 8485fb4b6..64c1e120a 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -102,10 +102,10 @@ func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef() - cfg.FDTable.DecRef() + cfg.FSContext.DecRef(t) + cfg.FDTable.DecRef(t) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef() + cfg.MountNamespaceVFS2.DecRef(t) } return nil, err } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 4dfd2c990..0b34c0099 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -308,7 +308,7 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { } // release releases the thread group's resources. -func (tg *ThreadGroup) release() { +func (tg *ThreadGroup) release(t *Task) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() @@ -325,7 +325,7 @@ func (tg *ThreadGroup) release() { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef() + tg.mounts.DecRef(t) } } diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go index ddeaff3db..20dd1cc21 100644 --- a/pkg/sentry/loader/elf.go +++ b/pkg/sentry/loader/elf.go @@ -281,7 +281,7 @@ func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr } defer func() { if mopts.MappingIdentity != nil { - mopts.MappingIdentity.DecRef() + mopts.MappingIdentity.DecRef(ctx) } }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { @@ -663,7 +663,7 @@ func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } - defer intFile.DecRef() + defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go index 986c7fb4d..8d6802ea3 100644 --- a/pkg/sentry/loader/loader.go +++ b/pkg/sentry/loader/loader.go @@ -154,7 +154,7 @@ func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context return loadedELF{}, nil, nil, nil, err } // Ensure file is release in case the code loops or errors out. - defer args.File.DecRef() + defer args.File.DecRef(ctx) } else { if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil { return loadedELF{}, nil, nil, nil, err @@ -223,7 +223,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V if err != nil { return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux()) } - defer file.DecRef() + defer file.DecRef(ctx) // Load the VDSO. vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded) @@ -292,7 +292,7 @@ func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *V m.SetEnvvStart(sl.EnvvStart) m.SetEnvvEnd(sl.EnvvEnd) m.SetAuxv(auxv) - m.SetExecutable(file) + m.SetExecutable(ctx, file) symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn") if err != nil { diff --git a/pkg/sentry/memmap/memmap.go b/pkg/sentry/memmap/memmap.go index c188f6c29..59c92c7e8 100644 --- a/pkg/sentry/memmap/memmap.go +++ b/pkg/sentry/memmap/memmap.go @@ -238,7 +238,7 @@ type MappingIdentity interface { IncRef() // DecRef decrements the MappingIdentity's reference count. - DecRef() + DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go index 1999ec706..16fea53c4 100644 --- a/pkg/sentry/mm/aio_context.go +++ b/pkg/sentry/mm/aio_context.go @@ -258,8 +258,8 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { } // DecRef implements refs.RefCounter.DecRef. -func (m *aioMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *aioMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } @@ -367,7 +367,7 @@ func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, diff --git a/pkg/sentry/mm/lifecycle.go b/pkg/sentry/mm/lifecycle.go index aac56679b..4d7773f8b 100644 --- a/pkg/sentry/mm/lifecycle.go +++ b/pkg/sentry/mm/lifecycle.go @@ -258,7 +258,7 @@ func (mm *MemoryManager) DecUsers(ctx context.Context) { mm.executable = nil mm.metadataMu.Unlock() if exe != nil { - exe.DecRef() + exe.DecRef(ctx) } mm.activeMu.Lock() diff --git a/pkg/sentry/mm/metadata.go b/pkg/sentry/mm/metadata.go index 28e5057f7..0cfd60f6c 100644 --- a/pkg/sentry/mm/metadata.go +++ b/pkg/sentry/mm/metadata.go @@ -15,6 +15,7 @@ package mm import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/usermem" @@ -147,7 +148,7 @@ func (mm *MemoryManager) Executable() fsbridge.File { // SetExecutable sets the executable. // // This takes a reference on d. -func (mm *MemoryManager) SetExecutable(file fsbridge.File) { +func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. @@ -164,7 +165,7 @@ func (mm *MemoryManager) SetExecutable(file fsbridge.File) { // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. if orig != nil { - orig.DecRef() + orig.DecRef(ctx) } } diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go index 0e142fb11..4cdb52eb6 100644 --- a/pkg/sentry/mm/special_mappable.go +++ b/pkg/sentry/mm/special_mappable.go @@ -50,8 +50,8 @@ func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.F } // DecRef implements refs.RefCounter.DecRef. -func (m *SpecialMappable) DecRef() { - m.AtomicRefCount.DecRefWithDestructor(func() { +func (m *SpecialMappable) DecRef(ctx context.Context) { + m.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) { m.mfp.MemoryFile().DecRef(m.fr) }) } diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go index 3f496aa9f..e74d4e1c1 100644 --- a/pkg/sentry/mm/syscalls.go +++ b/pkg/sentry/mm/syscalls.go @@ -101,7 +101,7 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme if err != nil { return 0, err } - defer m.DecRef() + defer m.DecRef(ctx) opts.MappingIdentity = m opts.Mappable = m } @@ -1191,7 +1191,7 @@ func (mm *MemoryManager) MSync(ctx context.Context, addr usermem.Addr, length ui mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) - id.DecRef() + id.DecRef(ctx) if err != nil { return err } diff --git a/pkg/sentry/mm/vma.go b/pkg/sentry/mm/vma.go index 16d8207e9..bd751d696 100644 --- a/pkg/sentry/mm/vma.go +++ b/pkg/sentry/mm/vma.go @@ -377,7 +377,7 @@ func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar usermem.AddrRa vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { - vma.id.DecRef() + vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { @@ -446,7 +446,7 @@ func (vmaSetFunctions) Merge(ar1 usermem.AddrRange, vma1 vma, ar2 usermem.AddrRa } if vma2.id != nil { - vma2.id.DecRef() + vma2.id.DecRef(context.Background()) } return vma1, true } diff --git a/pkg/sentry/socket/control/control.go b/pkg/sentry/socket/control/control.go index 8b439a078..70ccf77a7 100644 --- a/pkg/sentry/socket/control/control.go +++ b/pkg/sentry/socket/control/control.go @@ -68,7 +68,7 @@ func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { for _, fd := range fds { file := t.GetFile(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -100,9 +100,9 @@ func (fs *RightsFiles) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFiles) Release() { +func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -115,7 +115,7 @@ func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32 fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/control/control_vfs2.go b/pkg/sentry/socket/control/control_vfs2.go index fd08179be..d9621968c 100644 --- a/pkg/sentry/socket/control/control_vfs2.go +++ b/pkg/sentry/socket/control/control_vfs2.go @@ -46,7 +46,7 @@ func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) { for _, fd := range fds { file := t.GetFileVFS2(fd) if file == nil { - files.Release() + files.Release(t) return nil, syserror.EBADF } files = append(files, file) @@ -78,9 +78,9 @@ func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage { } // Release implements transport.RightsControlMessage.Release. -func (fs *RightsFilesVFS2) Release() { +func (fs *RightsFilesVFS2) Release(ctx context.Context) { for _, f := range *fs { - f.DecRef() + f.DecRef(ctx) } *fs = nil } @@ -93,7 +93,7 @@ func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) - files[0].DecRef() + files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go index 532a1ea5d..242e6bf76 100644 --- a/pkg/sentry/socket/hostinet/socket.go +++ b/pkg/sentry/socket/hostinet/socket.go @@ -100,12 +100,12 @@ func newSocketFile(ctx context.Context, family int, stype linux.SockType, protoc return nil, syserr.FromError(err) } dirent := socket.NewDirent(ctx, socketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { fdnotifier.RemoveFD(int32(s.fd)) syscall.Close(s.fd) } @@ -269,7 +269,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, @@ -281,7 +281,7 @@ func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, syscall.Close(fd) return 0, nil, 0, err } - defer f.DecRef() + defer f.DecRef(t) kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{ CloseOnExec: flags&syscall.SOCK_CLOEXEC != 0, diff --git a/pkg/sentry/socket/netlink/provider.go b/pkg/sentry/socket/netlink/provider.go index 0d45e5053..31e374833 100644 --- a/pkg/sentry/socket/netlink/provider.go +++ b/pkg/sentry/socket/netlink/provider.go @@ -97,7 +97,7 @@ func (*socketProvider) Socket(t *kernel.Task, stype linux.SockType, protocol int } d := socket.NewDirent(t, netlinkSocketDevice) - defer d.DecRef() + defer d.DecRef(t) return fs.NewFile(t, d, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, s), nil } diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 98ca7add0..68a9b9a96 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -140,14 +140,14 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } @@ -164,9 +164,9 @@ func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socke } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { - s.connection.Release() - s.ep.Close() +func (s *socketOpsCommon) Release(ctx context.Context) { + s.connection.Release(ctx) + s.ep.Close(ctx) if s.bound { s.ports.Release(s.protocol.Protocol(), s.portID) @@ -621,7 +621,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys if len(bufs) > 0 { // RecvMsg never receives the address, so we don't need to send // one. - _, notify, err := s.connection.Send(bufs, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{}) // If the buffer is full, we simply drop messages, just like // Linux. if err != nil && err != syserr.ErrWouldBlock { @@ -648,7 +648,7 @@ func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *sys // Add the dump_done_errno payload. m.Put(int64(0)) - _, notify, err := s.connection.Send([][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) + _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } diff --git a/pkg/sentry/socket/netlink/socket_vfs2.go b/pkg/sentry/socket/netlink/socket_vfs2.go index dbcd8b49a..a38d25da9 100644 --- a/pkg/sentry/socket/netlink/socket_vfs2.go +++ b/pkg/sentry/socket/netlink/socket_vfs2.go @@ -57,14 +57,14 @@ func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketV // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { - ep.Close() + ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { - ep.Close() + ep.Close(t) return nil, err } diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 31a168f7e..e4846bc0b 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -330,7 +330,7 @@ func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue } dirent := socket.NewDirent(t, netstackDevice) - defer dirent.DecRef() + defer dirent.DecRef(t) return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ socketOpsCommon: socketOpsCommon{ Queue: queue, @@ -479,7 +479,7 @@ func (s *socketOpsCommon) fetchReadView() *syserr.Error { } // Release implements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(context.Context) { s.Endpoint.Close() } @@ -854,7 +854,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go index a9025b0ec..3335e7430 100644 --- a/pkg/sentry/socket/netstack/netstack_vfs2.go +++ b/pkg/sentry/socket/netstack/netstack_vfs2.go @@ -169,7 +169,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go index d112757fb..04b259d27 100644 --- a/pkg/sentry/socket/socket.go +++ b/pkg/sentry/socket/socket.go @@ -46,8 +46,8 @@ type ControlMessages struct { } // Release releases Unix domain socket credentials and rights. -func (c *ControlMessages) Release() { - c.Unix.Release() +func (c *ControlMessages) Release(ctx context.Context) { + c.Unix.Release(ctx) } // Socket is an interface combining fs.FileOperations and SocketOps, diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go index a1e49cc57..c67b602f0 100644 --- a/pkg/sentry/socket/unix/transport/connectioned.go +++ b/pkg/sentry/socket/unix/transport/connectioned.go @@ -211,7 +211,7 @@ func (e *connectionedEndpoint) Listening() bool { // The socket will be a fresh state after a call to close and may be reused. // That is, close may be used to "unbind" or "disconnect" the socket in error // paths. -func (e *connectionedEndpoint) Close() { +func (e *connectionedEndpoint) Close(ctx context.Context) { e.Lock() var c ConnectedEndpoint var r Receiver @@ -233,7 +233,7 @@ func (e *connectionedEndpoint) Close() { case e.Listening(): close(e.acceptedChan) for n := range e.acceptedChan { - n.Close() + n.Close(ctx) } e.acceptedChan = nil e.path = "" @@ -241,11 +241,11 @@ func (e *connectionedEndpoint) Close() { e.Unlock() if c != nil { c.CloseNotify() - c.Release() + c.Release(ctx) } if r != nil { r.CloseNotify() - r.Release() + r.Release(ctx) } } @@ -340,7 +340,7 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn return nil default: // Busy; return ECONNREFUSED per spec. - ne.Close() + ne.Close(ctx) e.Unlock() ce.Unlock() return syserr.ErrConnectionRefused diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go index 4b06d63ac..70ee8f9b8 100644 --- a/pkg/sentry/socket/unix/transport/connectionless.go +++ b/pkg/sentry/socket/unix/transport/connectionless.go @@ -54,10 +54,10 @@ func (e *connectionlessEndpoint) isBound() bool { // Close puts the endpoint in a closed state and frees all resources associated // with it. -func (e *connectionlessEndpoint) Close() { +func (e *connectionlessEndpoint) Close(ctx context.Context) { e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) e.connected = nil } @@ -71,7 +71,7 @@ func (e *connectionlessEndpoint) Close() { e.Unlock() r.CloseNotify() - r.Release() + r.Release(ctx) } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. @@ -108,10 +108,10 @@ func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c C if err != nil { return 0, syserr.ErrInvalidEndpointState } - defer connected.Release() + defer connected.Release(ctx) e.Lock() - n, notify, err := connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -135,7 +135,7 @@ func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoi e.Lock() if e.connected != nil { - e.connected.Release() + e.connected.Release(ctx) } e.connected = connected e.Unlock() diff --git a/pkg/sentry/socket/unix/transport/queue.go b/pkg/sentry/socket/unix/transport/queue.go index d8f3ad63d..ef6043e19 100644 --- a/pkg/sentry/socket/unix/transport/queue.go +++ b/pkg/sentry/socket/unix/transport/queue.go @@ -15,6 +15,7 @@ package transport import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" @@ -57,10 +58,10 @@ func (q *queue) Close() { // Both the read and write queues must be notified after resetting: // q.ReaderQueue.Notify(waiter.EventIn) // q.WriterQueue.Notify(waiter.EventOut) -func (q *queue) Reset() { +func (q *queue) Reset(ctx context.Context) { q.mu.Lock() for cur := q.dataList.Front(); cur != nil; cur = cur.Next() { - cur.Release() + cur.Release(ctx) } q.dataList.Reset() q.used = 0 @@ -68,8 +69,8 @@ func (q *queue) Reset() { } // DecRef implements RefCounter.DecRef with destructor q.Reset. -func (q *queue) DecRef() { - q.DecRefWithDestructor(q.Reset) +func (q *queue) DecRef(ctx context.Context) { + q.DecRefWithDestructor(ctx, q.Reset) // We don't need to notify after resetting because no one cares about // this queue after all references have been dropped. } @@ -111,7 +112,7 @@ func (q *queue) IsWritable() bool { // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.EventIn) -func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { +func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { @@ -124,7 +125,7 @@ func (q *queue) Enqueue(data [][]byte, c ControlMessages, from tcpip.FullAddress } if discardEmpty && l == 0 { q.mu.Unlock() - c.Release() + c.Release(ctx) return 0, false, nil } diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go index 2f1b127df..475d7177e 100644 --- a/pkg/sentry/socket/unix/transport/unix.go +++ b/pkg/sentry/socket/unix/transport/unix.go @@ -37,7 +37,7 @@ type RightsControlMessage interface { Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. - Release() + Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. @@ -74,9 +74,9 @@ func (c *ControlMessages) Clone() ControlMessages { } // Release releases both the credentials and the rights. -func (c *ControlMessages) Release() { +func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { - c.Rights.Release() + c.Rights.Release(ctx) } *c = ControlMessages{} } @@ -90,7 +90,7 @@ type Endpoint interface { // Close puts the endpoint in a closed state and frees all resources // associated with it. - Close() + Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. @@ -252,7 +252,7 @@ type BoundEndpoint interface { // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. - Release() + Release(ctx context.Context) } // message represents a message passed over a Unix domain socket. @@ -281,8 +281,8 @@ func (m *message) Length() int64 { } // Release releases any resources held by the message. -func (m *message) Release() { - m.Control.Release() +func (m *message) Release(ctx context.Context) { + m.Control.Release(ctx) } // Peek returns a copy of the message. @@ -304,7 +304,7 @@ type Receiver interface { // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. - Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) + Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. @@ -333,7 +333,7 @@ type Receiver interface { // Release releases any resources owned by the Receiver. It should be // called before droping all references to a Receiver. - Release() + Release(ctx context.Context) } // queueReceiver implements Receiver for datagram sockets. @@ -344,7 +344,7 @@ type queueReceiver struct { } // Recv implements Receiver.Recv. -func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error @@ -398,8 +398,8 @@ func (q *queueReceiver) RecvMaxQueueSize() int64 { } // Release implements Receiver.Release. -func (q *queueReceiver) Release() { - q.readQueue.DecRef() +func (q *queueReceiver) Release(ctx context.Context) { + q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. @@ -456,7 +456,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { } // Recv implements Receiver.Recv. -func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { +func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() @@ -502,7 +502,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, var cmTruncated bool if c.Rights != nil && numRights == 0 { - c.Rights.Release() + c.Rights.Release(ctx) c.Rights = nil cmTruncated = true } @@ -557,7 +557,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, // Consume rights. if numRights == 0 { cmTruncated = true - q.control.Rights.Release() + q.control.Rights.Release(ctx) } else { c.Rights = q.control.Rights haveRights = true @@ -582,7 +582,7 @@ type ConnectedEndpoint interface { // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. - Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) + Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. @@ -616,7 +616,7 @@ type ConnectedEndpoint interface { // Release releases any resources owned by the ConnectedEndpoint. It should // be called before droping all references to a ConnectedEndpoint. - Release() + Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. @@ -654,7 +654,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) } // Send implements ConnectedEndpoint.Send. -func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { +func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { @@ -669,7 +669,7 @@ func (e *connectedEndpoint) Send(data [][]byte, c ControlMessages, from tcpip.Fu truncate = true } - return e.writeQueue.Enqueue(data, c, from, discardEmpty, truncate) + return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. @@ -707,8 +707,8 @@ func (e *connectedEndpoint) SendMaxQueueSize() int64 { } // Release implements ConnectedEndpoint.Release. -func (e *connectedEndpoint) Release() { - e.writeQueue.DecRef() +func (e *connectedEndpoint) Release(ctx context.Context) { + e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. @@ -798,7 +798,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } - recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(data, creds, numRights, peek) + recvLen, msgLen, cms, cmt, a, notify, err := e.receiver.Recv(ctx, data, creds, numRights, peek) e.Unlock() if err != nil { return 0, 0, ControlMessages{}, false, err @@ -827,7 +827,7 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess return 0, syserr.ErrAlreadyConnected } - n, notify, err := e.connected.Send(data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) + n, notify, err := e.connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { @@ -1001,6 +1001,6 @@ func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { } // Release implements BoundEndpoint.Release. -func (*baseEndpoint) Release() { +func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index 0482d33cf..2b8454edb 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -62,7 +62,7 @@ type SocketOperations struct { // New creates a new unix socket. func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File { dirent := socket.NewDirent(ctx, unixSocketDevice) - defer dirent.DecRef() + defer dirent.DecRef(ctx) return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true}) } @@ -97,17 +97,17 @@ type socketOpsCommon struct { } // DecRef implements RefCounter.DecRef. -func (s *socketOpsCommon) DecRef() { - s.DecRefWithDestructor(func() { - s.ep.Close() +func (s *socketOpsCommon) DecRef(ctx context.Context) { + s.DecRefWithDestructor(ctx, func(context.Context) { + s.ep.Close(ctx) }) } // Release implemements fs.FileOperations.Release. -func (s *socketOpsCommon) Release() { +func (s *socketOpsCommon) Release(ctx context.Context) { // Release only decrements a reference on s because s may be referenced in // the abstract socket namespace. - s.DecRef() + s.DecRef(ctx) } func (s *socketOpsCommon) isPacket() bool { @@ -234,7 +234,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, } ns := New(t, ep, s.stype) - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() @@ -284,7 +284,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } @@ -294,7 +294,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { var name string cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) // Is there no slash at all? if !strings.Contains(p, "/") { @@ -302,7 +302,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { name = p } else { root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Find the last path component, we know that something follows // that final slash, otherwise extractPath() would have failed. lastSlash := strings.LastIndex(p, "/") @@ -318,7 +318,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { // No path available. return syserr.ErrNoSuchFile } - defer d.DecRef() + defer d.DecRef(t) name = p[lastSlash+1:] } @@ -332,7 +332,7 @@ func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if err != nil { return syserr.ErrPortInUse } - childDir.DecRef() + childDir.DecRef(t) } return nil @@ -378,9 +378,9 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) - root.DecRef() + root.DecRef(t) if relPath { - start.DecRef() + start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) @@ -393,15 +393,15 @@ func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, cwd := t.FSContext().WorkingDirectory() remainingTraversals := uint(fs.DefaultTraversalLimit) d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) - cwd.DecRef() - root.DecRef() + cwd.DecRef(t) + root.DecRef(t) if e != nil { return nil, syserr.FromError(e) } // Extract the endpoint if one is there. ep := d.Inode.BoundEndpoint(path) - d.DecRef() + d.DecRef(t) if ep == nil { // No socket! return nil, syserr.ErrConnectionRefused @@ -415,7 +415,7 @@ func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool if err != nil { return err } - defer ep.Release() + defer ep.Release(t) // Connect the server endpoint. err = s.ep.Connect(t, ep) @@ -473,7 +473,7 @@ func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []b if err != nil { return 0, err } - defer ep.Release() + defer ep.Release(t) w.To = ep if ep.Passcred() && w.Control.Credentials == nil { diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 05c16fcfe..dfa25241a 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -136,7 +136,7 @@ func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, block if err != nil { return 0, nil, 0, err } - defer ns.DecRef() + defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) @@ -183,19 +183,19 @@ func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } - if err := t.AbstractSockets().Bind(p[1:], bep, s); err != nil { + if err := t.AbstractSockets().Bind(t, p[1:], bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, @@ -333,7 +333,7 @@ func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) f, err := NewSockfsFile(t, ep, stype) if err != nil { - ep.Close() + ep.Close(t) return nil, err } return f, nil @@ -357,14 +357,14 @@ func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (* ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { - ep1.Close() - ep2.Close() + ep1.Close(t) + ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { - s1.DecRef() - ep2.Close() + s1.DecRef(t) + ep2.Close(t) return nil, nil, err } diff --git a/pkg/sentry/strace/strace.go b/pkg/sentry/strace/strace.go index 68ca537c8..87b239730 100644 --- a/pkg/sentry/strace/strace.go +++ b/pkg/sentry/strace/strace.go @@ -147,14 +147,14 @@ func fd(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectory() if root != nil { - defer root.DecRef() + defer root.DecRef(t) } if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectory() var name string if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) name, _ = wd.FullName(root) } else { name = "(unknown cwd)" @@ -167,7 +167,7 @@ func fd(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := file.Dirent.FullName(root) return fmt.Sprintf("%#x %s", fd, name) @@ -175,12 +175,12 @@ func fd(t *kernel.Task, fd int32) string { func fdVFS2(t *kernel.Task, fd int32) string { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) vfsObj := root.Mount().Filesystem().VirtualFilesystem() if fd == linux.AT_FDCWD { wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, wd) return fmt.Sprintf("AT_FDCWD %s", name) @@ -191,7 +191,7 @@ func fdVFS2(t *kernel.Task, fd int32) string { // Cast FD to uint64 to avoid printing negative hex. return fmt.Sprintf("%#x (bad FD)", uint64(fd)) } - defer file.DecRef() + defer file.DecRef(t) name, _ := vfsObj.PathnameWithDeleted(t, root, file.VirtualDentry()) return fmt.Sprintf("%#x %s", fd, name) diff --git a/pkg/sentry/syscalls/epoll.go b/pkg/sentry/syscalls/epoll.go index d9fb808c0..d23a0068a 100644 --- a/pkg/sentry/syscalls/epoll.go +++ b/pkg/sentry/syscalls/epoll.go @@ -28,7 +28,7 @@ import ( // CreateEpoll implements the epoll_create(2) linux syscall. func CreateEpoll(t *kernel.Task, closeOnExec bool) (int32, error) { file := epoll.NewEventPoll(t) - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: closeOnExec, @@ -47,14 +47,14 @@ func AddEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, mask if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -73,14 +73,14 @@ func UpdateEpoll(t *kernel.Task, epfd int32, fd int32, flags epoll.EntryFlags, m if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -99,14 +99,14 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { if epollfile == nil { return syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Get the target file id. file := t.GetFile(fd) if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) @@ -115,7 +115,7 @@ func RemoveEpoll(t *kernel.Task, epfd int32, fd int32) error { } // Try to remove the entry. - return e.RemoveEntry(epoll.FileIdentifier{file, fd}) + return e.RemoveEntry(t, epoll.FileIdentifier{file, fd}) } // WaitEpoll implements the epoll_wait(2) linux syscall. @@ -125,7 +125,7 @@ func WaitEpoll(t *kernel.Task, fd int32, max int, timeout int) ([]linux.EpollEve if epollfile == nil { return nil, syserror.EBADF } - defer epollfile.DecRef() + defer epollfile.DecRef(t) // Extract the epollPoll operations. e, ok := epollfile.FileOperations.(*epoll.EventPoll) diff --git a/pkg/sentry/syscalls/linux/sys_aio.go b/pkg/sentry/syscalls/linux/sys_aio.go index ba2557c52..e9d64dec5 100644 --- a/pkg/sentry/syscalls/linux/sys_aio.go +++ b/pkg/sentry/syscalls/linux/sys_aio.go @@ -247,7 +247,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - file.DecRef() + file.DecRef(ctx) // Queue the result for delivery. actx.FinishRequest(ev) @@ -257,7 +257,7 @@ func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr usermem.Addr, cb *linu // wake up. if eventFile != nil { eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) - eventFile.DecRef() + eventFile.DecRef(ctx) } } } @@ -269,7 +269,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // File not found. return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Was there an eventFD? Extract it. var eventFile *fs.File @@ -279,7 +279,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user // Bad FD. return syserror.EBADF } - defer eventFile.DecRef() + defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index ed3413ca6..3b4f879e4 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -37,7 +37,7 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc event.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) - defer event.DecRef() + defer event.DecRef(t) fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/sys_file.go b/pkg/sentry/syscalls/linux/sys_file.go index 8cf6401e7..1bc9b184e 100644 --- a/pkg/sentry/syscalls/linux/sys_file.go +++ b/pkg/sentry/syscalls/linux/sys_file.go @@ -40,7 +40,7 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, // Common case: we are accessing a file in the root. root := t.FSContext().RootDirectory() err := fn(root, root, name, linux.MaxSymlinkTraversals) - root.DecRef() + root.DecRef(t) return err } else if dir == "." && dirFD == linux.AT_FDCWD { // Common case: we are accessing a file relative to the current @@ -48,8 +48,8 @@ func fileOpAt(t *kernel.Task, dirFD int32, path string, fn func(root *fs.Dirent, wd := t.FSContext().WorkingDirectory() root := t.FSContext().RootDirectory() err := fn(root, wd, name, linux.MaxSymlinkTraversals) - wd.DecRef() - root.DecRef() + wd.DecRef(t) + root.DecRef(t) return err } @@ -97,19 +97,19 @@ func fileOpOn(t *kernel.Task, dirFD int32, path string, resolve bool, fn func(ro } else { d, err = t.MountNamespace().FindLink(t, root, rel, path, &remainingTraversals) } - root.DecRef() + root.DecRef(t) if wd != nil { - wd.DecRef() + wd.DecRef(t) } if f != nil { - f.DecRef() + f.DecRef(t) } if err != nil { return err } err = fn(root, d, remainingTraversals) - d.DecRef() + d.DecRef(t) return err } @@ -186,7 +186,7 @@ func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uint if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer file.DecRef() + defer file.DecRef(t) // Success. newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ @@ -242,7 +242,7 @@ func mknodAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - file.DecRef() + file.DecRef(t) return nil case linux.ModeNamedPipe: @@ -332,7 +332,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer found.DecRef() + defer found.DecRef(t) // We found something (possibly a symlink). If the // O_EXCL flag was passed, then we can immediately @@ -357,7 +357,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l resolved, err = found.Inode.Getlink(t) if err == nil { // No more resolution necessary. - defer resolved.DecRef() + defer resolved.DecRef(t) break } if err != fs.ErrResolveViaReadlink { @@ -384,7 +384,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { break } - defer newParent.DecRef() + defer newParent.DecRef(t) // Repeat the process with the parent and name of the // symlink target. @@ -416,7 +416,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l if err != nil { return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } - defer newFile.DecRef() + defer newFile.DecRef(t) case syserror.ENOENT: // File does not exist. Proceed with creation. @@ -432,7 +432,7 @@ func createAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint, mode l // No luck, bail. return err } - defer newFile.DecRef() + defer newFile.DecRef(t) found = newFile.Dirent default: return err @@ -596,7 +596,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Shared flags between file and socket. switch request { @@ -671,9 +671,9 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal addr := args[0].Pointer() size := args[1].SizeT() cwd := t.FSContext().WorkingDirectory() - defer cwd.DecRef() + defer cwd.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) // Get our fullname from the root and preprend unreachable if the root was // unreachable from our current dirent this is the same behavior as on linux. @@ -722,7 +722,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return err } - t.FSContext().SetRootDirectory(d) + t.FSContext().SetRootDirectory(t, d) return nil }) } @@ -747,7 +747,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return err } - t.FSContext().SetWorkingDirectory(d) + t.FSContext().SetWorkingDirectory(t, d) return nil }) } @@ -760,7 +760,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is it a directory? if !fs.IsDir(file.Dirent.Inode.StableAttr) { @@ -772,7 +772,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err } - t.FSContext().SetWorkingDirectory(file.Dirent) + t.FSContext().SetWorkingDirectory(t, file.Dirent) return 0, nil, nil } @@ -791,7 +791,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Flush(t) return 0, nil, handleIOError(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -805,7 +805,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{}) if err != nil { @@ -826,7 +826,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) return uintptr(newfd), nil, nil } @@ -850,7 +850,7 @@ func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if oldFile == nil { return 0, nil, syserror.EBADF } - defer oldFile.DecRef() + defer oldFile.DecRef(t) err := t.NewFDAt(newfd, oldFile, kernel.FDFlags{CloseOnExec: flags&linux.O_CLOEXEC != 0}) if err != nil { @@ -925,7 +925,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -1132,7 +1132,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if fs.IsPipe(file.Dirent.Inode.StableAttr) { @@ -1171,7 +1171,7 @@ func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode switch err { case nil: // The directory existed. - defer f.DecRef() + defer f.DecRef(t) return syserror.EEXIST case syserror.EACCES: // Permission denied while walking to the directory. @@ -1349,7 +1349,7 @@ func linkAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32 if target == nil { return syserror.EBADF } - defer target.DecRef() + defer target.DecRef(t) if err := mayLinkAt(t, target.Dirent.Inode); err != nil { return err } @@ -1602,7 +1602,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Reject truncation if the file flags do not permit this operation. // This is different from truncate(2) above. @@ -1730,7 +1730,7 @@ func chownAt(t *kernel.Task, fd int32, addr usermem.Addr, resolve, allowEmpty bo if file == nil { return syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return chown(t, file.Dirent, uid, gid) } @@ -1768,7 +1768,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chown(t, file.Dirent, uid, gid) } @@ -1833,7 +1833,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, chmod(t, file.Dirent, mode) } @@ -1893,10 +1893,10 @@ func utimes(t *kernel.Task, dirFD int32, addr usermem.Addr, ts fs.TimeSpec, reso if f == nil { return syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) return setTimestamp(root, f.Dirent, linux.MaxSymlinkTraversals) } @@ -2088,7 +2088,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if offset < 0 || length <= 0 { return 0, nil, syserror.EINVAL @@ -2141,7 +2141,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB @@ -2224,8 +2224,8 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, err } - defer dirent.DecRef() - defer file.DecRef() + defer dirent.DecRef(t) + defer file.DecRef(t) newFD, err := t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: cloExec, diff --git a/pkg/sentry/syscalls/linux/sys_futex.go b/pkg/sentry/syscalls/linux/sys_futex.go index f04d78856..9d1b2edb1 100644 --- a/pkg/sentry/syscalls/linux/sys_futex.go +++ b/pkg/sentry/syscalls/linux/sys_futex.go @@ -73,7 +73,7 @@ func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, fo err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts)) } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return 0, syserror.ConvertIntr(err, kernel.ERESTARTSYS) } @@ -95,7 +95,7 @@ func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, add } remaining, err := t.BlockWithTimeout(w.C, !forever, duration) - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) if err == nil { return 0, nil } @@ -148,7 +148,7 @@ func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr usermem.A timer.Destroy() } - t.Futex().WaitComplete(w) + t.Futex().WaitComplete(w, t) return syserror.ConvertIntr(err, kernel.ERESTARTSYS) } diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go index b126fecc0..f5699e55d 100644 --- a/pkg/sentry/syscalls/linux/sys_getdents.go +++ b/pkg/sentry/syscalls/linux/sys_getdents.go @@ -68,7 +68,7 @@ func getdents(t *kernel.Task, fd int32, addr usermem.Addr, size int, f func(*dir if dir == nil { return 0, syserror.EBADF } - defer dir.DecRef() + defer dir.DecRef(t) w := &usermem.IOReadWriter{ Ctx: t, diff --git a/pkg/sentry/syscalls/linux/sys_inotify.go b/pkg/sentry/syscalls/linux/sys_inotify.go index b2c7b3444..cf47bb9dd 100644 --- a/pkg/sentry/syscalls/linux/sys_inotify.go +++ b/pkg/sentry/syscalls/linux/sys_inotify.go @@ -40,7 +40,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. NonBlocking: flags&linux.IN_NONBLOCK != 0, } n := fs.NewFile(t, dirent, fileFlags, fs.NewInotify(t)) - defer n.DecRef() + defer n.DecRef(t) fd, err := t.NewFDFrom(0, n, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -71,7 +71,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*fs.Inotify, *fs.File, error) { ino, ok := file.FileOperations.(*fs.Inotify) if !ok { // Not an inotify fd. - file.DecRef() + file.DecRef(t) return nil, nil, syserror.EINVAL } @@ -98,7 +98,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) path, _, err := copyInPath(t, addr, false /* allowEmpty */) if err != nil { @@ -128,6 +128,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer file.DecRef() - return 0, nil, ino.RmWatch(wd) + defer file.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/sys_lseek.go b/pkg/sentry/syscalls/linux/sys_lseek.go index 3f7691eae..1c38f8f4f 100644 --- a/pkg/sentry/syscalls/linux/sys_lseek.go +++ b/pkg/sentry/syscalls/linux/sys_lseek.go @@ -33,7 +33,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var sw fs.SeekWhence switch whence { diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 91694d374..72786b032 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -75,7 +75,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -85,7 +85,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go index eb5ff48f5..bd0633564 100644 --- a/pkg/sentry/syscalls/linux/sys_mount.go +++ b/pkg/sentry/syscalls/linux/sys_mount.go @@ -115,7 +115,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall }); err != nil { // Something went wrong. Drop our ref on rootInode before // returning the error. - rootInode.DecRef() + rootInode.DecRef(t) return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go index 43c510930..3149e4aad 100644 --- a/pkg/sentry/syscalls/linux/sys_pipe.go +++ b/pkg/sentry/syscalls/linux/sys_pipe.go @@ -34,10 +34,10 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize) r.SetFlags(linuxToFlags(flags).Settable()) - defer r.DecRef() + defer r.DecRef(t) w.SetFlags(linuxToFlags(flags).Settable()) - defer w.DecRef() + defer w.DecRef(t) fds, err := t.NewFDs(0, []*fs.File{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -49,7 +49,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, err diff --git a/pkg/sentry/syscalls/linux/sys_poll.go b/pkg/sentry/syscalls/linux/sys_poll.go index f0198141c..3435bdf77 100644 --- a/pkg/sentry/syscalls/linux/sys_poll.go +++ b/pkg/sentry/syscalls/linux/sys_poll.go @@ -70,7 +70,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -82,11 +82,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -107,7 +107,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -266,7 +266,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/sys_prctl.go b/pkg/sentry/syscalls/linux/sys_prctl.go index f92bf8096..64a725296 100644 --- a/pkg/sentry/syscalls/linux/sys_prctl.go +++ b/pkg/sentry/syscalls/linux/sys_prctl.go @@ -128,7 +128,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // They trying to set exe to a non-file? if !fs.IsFile(file.Dirent.Inode.StableAttr) { @@ -136,7 +136,7 @@ func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } // Set the underlying executable. - t.MemoryManager().SetExecutable(fsbridge.NewFSFile(file)) + t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file)) case linux.PR_SET_MM_AUXV, linux.PR_SET_MM_START_CODE, diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go index 071b4bacc..3bbc3fa4b 100644 --- a/pkg/sentry/syscalls/linux/sys_read.go +++ b/pkg/sentry/syscalls/linux/sys_read.go @@ -48,7 +48,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -84,7 +84,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -118,7 +118,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -164,7 +164,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.Flags().Read { @@ -195,7 +195,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -244,7 +244,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_shm.go b/pkg/sentry/syscalls/linux/sys_shm.go index 4a8bc24a2..f0ae8fa8e 100644 --- a/pkg/sentry/syscalls/linux/sys_shm.go +++ b/pkg/sentry/syscalls/linux/sys_shm.go @@ -39,7 +39,7 @@ func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer segment.DecRef() + defer segment.DecRef(t) return uintptr(segment.ID), nil, nil } @@ -66,7 +66,7 @@ func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, @@ -108,7 +108,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { @@ -132,7 +132,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, syserror.EINVAL } - defer segment.DecRef() + defer segment.DecRef(t) switch cmd { case linux.IPC_SET: @@ -145,7 +145,7 @@ func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal return 0, nil, err case linux.IPC_RMID: - segment.MarkDestroyed() + segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: diff --git a/pkg/sentry/syscalls/linux/sys_signal.go b/pkg/sentry/syscalls/linux/sys_signal.go index d2b0012ae..20cb1a5cb 100644 --- a/pkg/sentry/syscalls/linux/sys_signal.go +++ b/pkg/sentry/syscalls/linux/sys_signal.go @@ -536,7 +536,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { @@ -553,7 +553,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Set appropriate flags. file.SetFlags(fs.SettableFileFlags{ diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 414fce8e3..fec1c1974 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -200,7 +200,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal s.SetFlags(fs.SettableFileFlags{ NonBlocking: stype&linux.SOCK_NONBLOCK != 0, }) - defer s.DecRef() + defer s.DecRef(t) fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, @@ -235,8 +235,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy } s1.SetFlags(fileFlags) s2.SetFlags(fileFlags) - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) // Create the FDs for the sockets. fds, err := t.NewFDs(0, []*fs.File{s1, s2}, kernel.FDFlags{ @@ -250,7 +250,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(socks, fds); err != nil { for _, fd := range fds { if file, _ := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -270,7 +270,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -301,7 +301,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -360,7 +360,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -387,7 +387,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -416,7 +416,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -447,7 +447,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -529,7 +529,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -567,7 +567,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -595,7 +595,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -628,7 +628,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -681,7 +681,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -775,7 +775,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -795,7 +795,7 @@ func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr usermem.Addr, flags i if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -851,7 +851,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -880,7 +880,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -924,7 +924,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -962,7 +962,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) @@ -1066,7 +1066,7 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1084,7 +1084,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.FileOperations.(socket.Socket) diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 77c78889d..b8846a10a 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -101,7 +101,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.Flags().Read { return 0, nil, syserror.EBADF @@ -111,7 +111,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.Flags().Write { return 0, nil, syserror.EBADF @@ -192,13 +192,13 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // The operation is non-blocking if anything is non-blocking. // @@ -300,13 +300,13 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) inFile := t.GetFile(inFD) if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) // All files must be pipes. if !fs.IsPipe(inFile.Dirent.Inode.StableAttr) || !fs.IsPipe(outFile.Dirent.Inode.StableAttr) { diff --git a/pkg/sentry/syscalls/linux/sys_stat.go b/pkg/sentry/syscalls/linux/sys_stat.go index 46ebf27a2..a5826f2dd 100644 --- a/pkg/sentry/syscalls/linux/sys_stat.go +++ b/pkg/sentry/syscalls/linux/sys_stat.go @@ -58,7 +58,7 @@ func Fstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -100,7 +100,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, fstat(t, file, statAddr) } @@ -158,7 +158,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) uattr, err := file.UnstableAttr(t) if err != nil { return 0, nil, err @@ -249,7 +249,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, statfsImpl(t, file.Dirent, statfsAddr) } diff --git a/pkg/sentry/syscalls/linux/sys_sync.go b/pkg/sentry/syscalls/linux/sys_sync.go index 5ad465ae3..f2c0e5069 100644 --- a/pkg/sentry/syscalls/linux/sys_sync.go +++ b/pkg/sentry/syscalls/linux/sys_sync.go @@ -39,7 +39,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Use "sync-the-world" for now, it's guaranteed that fd is at least // on the root filesystem. @@ -54,7 +54,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncAll) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -70,7 +70,7 @@ func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.Fsync(t, 0, fs.FileMaxOffset, fs.SyncData) return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS) @@ -103,7 +103,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // SYNC_FILE_RANGE_WAIT_BEFORE waits upon write-out of all pages in the // specified range that have already been submitted to the device diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go index 00915fdde..2d16e4933 100644 --- a/pkg/sentry/syscalls/linux/sys_thread.go +++ b/pkg/sentry/syscalls/linux/sys_thread.go @@ -117,7 +117,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 root := t.FSContext().RootDirectory() - defer root.DecRef() + defer root.DecRef(t) var wd *fs.Dirent var executable fsbridge.File @@ -133,7 +133,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { @@ -155,7 +155,7 @@ func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr user } } if wd != nil { - defer wd.DecRef() + defer wd.DecRef(t) } // Load the new TaskContext. diff --git a/pkg/sentry/syscalls/linux/sys_timerfd.go b/pkg/sentry/syscalls/linux/sys_timerfd.go index cf49b43db..34b03e4ee 100644 --- a/pkg/sentry/syscalls/linux/sys_timerfd.go +++ b/pkg/sentry/syscalls/linux/sys_timerfd.go @@ -43,7 +43,7 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } f := timerfd.NewFile(t, c) - defer f.DecRef() + defer f.DecRef(t) f.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.TFD_NONBLOCK != 0, }) @@ -73,7 +73,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { @@ -107,7 +107,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) tf, ok := f.FileOperations.(*timerfd.TimerOperations) if !ok { diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go index 6ec0de96e..485526e28 100644 --- a/pkg/sentry/syscalls/linux/sys_write.go +++ b/pkg/sentry/syscalls/linux/sys_write.go @@ -48,7 +48,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -85,7 +85,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -131,7 +131,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is writable. if !file.Flags().Write { @@ -162,7 +162,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -215,7 +215,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { diff --git a/pkg/sentry/syscalls/linux/sys_xattr.go b/pkg/sentry/syscalls/linux/sys_xattr.go index c24946160..97474fd3c 100644 --- a/pkg/sentry/syscalls/linux/sys_xattr.go +++ b/pkg/sentry/syscalls/linux/sys_xattr.go @@ -49,7 +49,7 @@ func FGetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := getXattr(t, f.Dirent, nameAddr, valueAddr, size) if err != nil { @@ -153,7 +153,7 @@ func FSetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, setXattr(t, f.Dirent, nameAddr, valueAddr, uint64(size), flags) } @@ -270,7 +270,7 @@ func FListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) n, err := listXattr(t, f.Dirent, listAddr, size) if err != nil { @@ -384,7 +384,7 @@ func FRemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if f == nil { return 0, nil, syserror.EBADF } - defer f.DecRef() + defer f.DecRef(t) return 0, nil, removeXattr(t, f.Dirent, nameAddr) } diff --git a/pkg/sentry/syscalls/linux/vfs2/aio.go b/pkg/sentry/syscalls/linux/vfs2/aio.go index e5cdefc50..399b4f60c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/aio.go +++ b/pkg/sentry/syscalls/linux/vfs2/aio.go @@ -88,7 +88,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if fd == nil { return syserror.EBADF } - defer fd.DecRef() + defer fd.DecRef(t) // Was there an eventFD? Extract it. var eventFD *vfs.FileDescription @@ -97,7 +97,7 @@ func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr user if eventFD == nil { return syserror.EBADF } - defer eventFD.DecRef() + defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { @@ -169,7 +169,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use ev.Result = -int64(kernel.ExtractErrno(err, 0)) } - fd.DecRef() + fd.DecRef(ctx) // Queue the result for delivery. aioCtx.FinishRequest(ev) @@ -179,7 +179,7 @@ func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr use // wake up. if eventFD != nil { eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) - eventFD.DecRef() + eventFD.DecRef(ctx) } } } diff --git a/pkg/sentry/syscalls/linux/vfs2/epoll.go b/pkg/sentry/syscalls/linux/vfs2/epoll.go index 34c90ae3e..c62f03509 100644 --- a/pkg/sentry/syscalls/linux/vfs2/epoll.go +++ b/pkg/sentry/syscalls/linux/vfs2/epoll.go @@ -37,11 +37,11 @@ func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, @@ -62,11 +62,11 @@ func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S return 0, nil, syserror.EINVAL } - file, err := t.Kernel().VFS().NewEpollInstanceFD() + file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -86,7 +86,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL @@ -95,7 +95,7 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if epfile == file { return 0, nil, syserror.EINVAL } @@ -135,7 +135,7 @@ func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if epfile == nil { return 0, nil, syserror.EBADF } - defer epfile.DecRef() + defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, syserror.EINVAL diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go index aff1a2070..807f909da 100644 --- a/pkg/sentry/syscalls/linux/vfs2/eventfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -38,11 +38,11 @@ func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc fileFlags |= linux.O_NONBLOCK } semMode := flags&linux.EFD_SEMAPHORE != 0 - eventfd, err := eventfd.New(vfsObj, initVal, semMode, fileFlags) + eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags) if err != nil { return 0, nil, err } - defer eventfd.DecRef() + defer eventfd.DecRef(t) fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ CloseOnExec: flags&linux.EFD_CLOEXEC != 0, diff --git a/pkg/sentry/syscalls/linux/vfs2/execve.go b/pkg/sentry/syscalls/linux/vfs2/execve.go index aef0078a8..066ee0863 100644 --- a/pkg/sentry/syscalls/linux/vfs2/execve.go +++ b/pkg/sentry/syscalls/linux/vfs2/execve.go @@ -71,7 +71,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) var executable fsbridge.File closeOnExec := false if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute { @@ -90,7 +90,7 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user } start := dirfile.VirtualDentry() start.IncRef() - dirfile.DecRef() + dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, @@ -101,19 +101,19 @@ func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr user Flags: linux.O_RDONLY, FileExec: true, }) - start.DecRef() + start.DecRef(t) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) executable = fsbridge.NewVFSFile(file) } // Load the new TaskContext. mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change - defer mntns.DecRef() + defer mntns.DecRef(t) wd := t.FSContext().WorkingDirectoryVFS2() - defer wd.DecRef() + defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewVFSLookup(mntns, root, wd), diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index 67f191551..72ca916a0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -38,7 +38,7 @@ func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := file.OnClose(t) return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) @@ -52,7 +52,7 @@ func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { @@ -72,7 +72,7 @@ func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - file.DecRef() + file.DecRef(t) return uintptr(newfd), nil, nil } @@ -101,7 +101,7 @@ func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -121,7 +121,7 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: @@ -332,7 +332,7 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { diff --git a/pkg/sentry/syscalls/linux/vfs2/filesystem.go b/pkg/sentry/syscalls/linux/vfs2/filesystem.go index b6d2ddd65..01e0f9010 100644 --- a/pkg/sentry/syscalls/linux/vfs2/filesystem.go +++ b/pkg/sentry/syscalls/linux/vfs2/filesystem.go @@ -56,7 +56,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -66,7 +66,7 @@ func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd i if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } @@ -95,7 +95,7 @@ func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) @@ -127,7 +127,7 @@ func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode linux.FileMode if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // "Zero file type is equivalent to type S_IFREG." - mknod(2) if mode.FileType() == 0 { @@ -174,7 +174,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, @@ -183,7 +183,7 @@ func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mo if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -227,7 +227,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer oldtpop.Release() + defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { @@ -237,7 +237,7 @@ func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd if err != nil { return err } - defer newtpop.Release() + defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, @@ -259,7 +259,7 @@ func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } @@ -278,7 +278,7 @@ func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error { if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } @@ -329,6 +329,6 @@ func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpath if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) } diff --git a/pkg/sentry/syscalls/linux/vfs2/fscontext.go b/pkg/sentry/syscalls/linux/vfs2/fscontext.go index 317409a18..a7d4d2a36 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fscontext.go +++ b/pkg/sentry/syscalls/linux/vfs2/fscontext.go @@ -31,8 +31,8 @@ func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal root := t.FSContext().RootDirectoryVFS2() wd := t.FSContext().WorkingDirectoryVFS2() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) - root.DecRef() - wd.DecRef() + root.DecRef(t) + wd.DecRef(t) if err != nil { return 0, nil, err } @@ -67,7 +67,7 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -75,8 +75,8 @@ func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -88,7 +88,7 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -96,8 +96,8 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetWorkingDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetWorkingDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } @@ -117,7 +117,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, @@ -125,7 +125,7 @@ func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - t.FSContext().SetRootDirectoryVFS2(vd) - vd.DecRef() + t.FSContext().SetRootDirectoryVFS2(t, vd) + vd.DecRef(t) return 0, nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/getdents.go b/pkg/sentry/syscalls/linux/vfs2/getdents.go index c7c7bf7ce..5517595b5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/getdents.go +++ b/pkg/sentry/syscalls/linux/vfs2/getdents.go @@ -44,7 +44,7 @@ func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) cb := getGetdentsCallback(t, addr, size, isGetdents64) err := file.IterDirents(t, cb) diff --git a/pkg/sentry/syscalls/linux/vfs2/inotify.go b/pkg/sentry/syscalls/linux/vfs2/inotify.go index 5d98134a5..11753d8e5 100644 --- a/pkg/sentry/syscalls/linux/vfs2/inotify.go +++ b/pkg/sentry/syscalls/linux/vfs2/inotify.go @@ -35,7 +35,7 @@ func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if err != nil { return 0, nil, err } - defer ino.DecRef() + defer ino.DecRef(t) fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{ CloseOnExec: flags&linux.IN_CLOEXEC != 0, @@ -66,7 +66,7 @@ func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, ino, ok := f.Impl().(*vfs.Inotify) if !ok { // Not an inotify fd. - f.DecRef() + f.DecRef(t) return nil, nil, syserror.EINVAL } @@ -96,7 +96,7 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer f.DecRef() + defer f.DecRef(t) path, err := copyInPath(t, addr) if err != nil { @@ -109,12 +109,12 @@ func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kern if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{}) if err != nil { return 0, nil, err } - defer d.DecRef() + defer d.DecRef(t) fd, err = ino.AddWatch(d.Dentry(), mask) if err != nil { @@ -132,6 +132,6 @@ func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if err != nil { return 0, nil, err } - defer f.DecRef() - return 0, nil, ino.RmWatch(wd) + defer f.DecRef(t) + return 0, nil, ino.RmWatch(t, wd) } diff --git a/pkg/sentry/syscalls/linux/vfs2/ioctl.go b/pkg/sentry/syscalls/linux/vfs2/ioctl.go index fd6ab94b2..38778a388 100644 --- a/pkg/sentry/syscalls/linux/vfs2/ioctl.go +++ b/pkg/sentry/syscalls/linux/vfs2/ioctl.go @@ -29,7 +29,7 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Handle ioctls that apply to all FDs. switch args[1].Int() { diff --git a/pkg/sentry/syscalls/linux/vfs2/lock.go b/pkg/sentry/syscalls/linux/vfs2/lock.go index bf19028c4..b910b5a74 100644 --- a/pkg/sentry/syscalls/linux/vfs2/lock.go +++ b/pkg/sentry/syscalls/linux/vfs2/lock.go @@ -32,7 +32,7 @@ func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // flock(2): EBADF fd is not an open file descriptor. return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) nonblocking := operation&linux.LOCK_NB != 0 operation &^= linux.LOCK_NB diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go index bbe248d17..519583e4e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/memfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go @@ -47,7 +47,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S } shmMount := t.Kernel().ShmMount() - file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name) + file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name) if err != nil { return 0, nil, err } diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go index 60a43f0a0..dc05c2994 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mmap.go +++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go @@ -61,7 +61,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC } defer func() { if opts.MappingIdentity != nil { - opts.MappingIdentity.DecRef() + opts.MappingIdentity.DecRef(t) } }() @@ -71,7 +71,7 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // mmap unconditionally requires that the FD is readable. if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/mount.go b/pkg/sentry/syscalls/linux/vfs2/mount.go index ea337de7c..4bd5c7ca2 100644 --- a/pkg/sentry/syscalls/linux/vfs2/mount.go +++ b/pkg/sentry/syscalls/linux/vfs2/mount.go @@ -108,7 +108,7 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if err != nil { return 0, nil, err } - defer target.Release() + defer target.Release(t) return 0, nil, t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) } @@ -140,7 +140,7 @@ func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) opts := vfs.UmountOptions{ Flags: uint32(flags), diff --git a/pkg/sentry/syscalls/linux/vfs2/path.go b/pkg/sentry/syscalls/linux/vfs2/path.go index 97da6c647..90a511d9a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/path.go +++ b/pkg/sentry/syscalls/linux/vfs2/path.go @@ -42,7 +42,7 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.ENOENT } if dirfd == linux.AT_FDCWD { @@ -51,13 +51,13 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { - root.DecRef() + root.DecRef(t) return taskPathOperation{}, syserror.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true - dirfile.DecRef() + dirfile.DecRef(t) } } return taskPathOperation{ @@ -71,10 +71,10 @@ func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldA }, nil } -func (tpop *taskPathOperation) Release() { - tpop.pop.Root.DecRef() +func (tpop *taskPathOperation) Release(t *kernel.Task) { + tpop.pop.Root.DecRef(t) if tpop.haveStartRef { - tpop.pop.Start.DecRef() + tpop.pop.Start.DecRef(t) tpop.haveStartRef = false } } diff --git a/pkg/sentry/syscalls/linux/vfs2/pipe.go b/pkg/sentry/syscalls/linux/vfs2/pipe.go index 4a01e4209..9b4848d9e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/pipe.go +++ b/pkg/sentry/syscalls/linux/vfs2/pipe.go @@ -42,8 +42,8 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { return syserror.EINVAL } r, w := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK)) - defer r.DecRef() - defer w.DecRef() + defer r.DecRef(t) + defer w.DecRef(t) fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, @@ -54,7 +54,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags int32) error { if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return err diff --git a/pkg/sentry/syscalls/linux/vfs2/poll.go b/pkg/sentry/syscalls/linux/vfs2/poll.go index ff1b25d7b..7b9d5e18a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/poll.go +++ b/pkg/sentry/syscalls/linux/vfs2/poll.go @@ -73,7 +73,7 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } if ch == nil { - defer file.DecRef() + defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) @@ -85,11 +85,11 @@ func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan } // releaseState releases all the pollState in "state". -func releaseState(state []pollState) { +func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) - state[i].file.DecRef() + state[i].file.DecRef(t) } } } @@ -110,7 +110,7 @@ func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time. // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) - defer releaseState(state) + defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) @@ -269,7 +269,7 @@ func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Add if file == nil { return 0, syserror.EBADF } - file.DecRef() + file.DecRef(t) var mask int16 if (rV & m) != 0 { diff --git a/pkg/sentry/syscalls/linux/vfs2/read_write.go b/pkg/sentry/syscalls/linux/vfs2/read_write.go index cd25597a7..a905dae0a 100644 --- a/pkg/sentry/syscalls/linux/vfs2/read_write.go +++ b/pkg/sentry/syscalls/linux/vfs2/read_write.go @@ -44,7 +44,7 @@ func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -75,7 +75,7 @@ func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -94,7 +94,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -102,7 +102,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -135,7 +135,7 @@ func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opt file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -151,7 +151,7 @@ func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -188,7 +188,7 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -226,7 +226,7 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -258,7 +258,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -266,7 +266,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -299,7 +299,7 @@ func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, of file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -314,7 +314,7 @@ func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the size is legitimate. si := int(size) @@ -345,7 +345,7 @@ func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ @@ -364,7 +364,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -372,7 +372,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -405,7 +405,7 @@ func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, op file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return total, err } @@ -421,7 +421,7 @@ func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { @@ -458,7 +458,7 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { @@ -496,7 +496,7 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { @@ -528,7 +528,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o n, err := file.PWrite(t, src, offset, opts) if err != syserror.ErrWouldBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) } return n, err } @@ -536,7 +536,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { if n > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return n, err } @@ -569,7 +569,7 @@ func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, o file.EventUnregister(&w) if total > 0 { - file.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) } return total, err } @@ -601,7 +601,7 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err @@ -617,7 +617,7 @@ func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { diff --git a/pkg/sentry/syscalls/linux/vfs2/setstat.go b/pkg/sentry/syscalls/linux/vfs2/setstat.go index 25cdb7a55..5e6eb13ba 100644 --- a/pkg/sentry/syscalls/linux/vfs2/setstat.go +++ b/pkg/sentry/syscalls/linux/vfs2/setstat.go @@ -66,7 +66,7 @@ func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ @@ -151,7 +151,7 @@ func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { @@ -197,7 +197,7 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if !file.IsWritable() { return 0, nil, syserror.EINVAL @@ -224,7 +224,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) if !file.IsWritable() { return 0, nil, syserror.EBADF @@ -258,7 +258,7 @@ func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys return 0, nil, err } - file.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + file.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return 0, nil, nil } @@ -438,7 +438,7 @@ func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, op func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { @@ -446,7 +446,7 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -457,13 +457,13 @@ func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPa // VirtualFilesystem.SetStatAt(), since the former may be able // to use opened file state to expedite the SetStat. err := dirfile.SetStat(t, *opts) - dirfile.DecRef() + dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ diff --git a/pkg/sentry/syscalls/linux/vfs2/signal.go b/pkg/sentry/syscalls/linux/vfs2/signal.go index 623992f6f..b89f34cdb 100644 --- a/pkg/sentry/syscalls/linux/vfs2/signal.go +++ b/pkg/sentry/syscalls/linux/vfs2/signal.go @@ -45,7 +45,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Is this a signalfd? if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok { @@ -68,7 +68,7 @@ func sharedSignalfd(t *kernel.Task, fd int32, sigset usermem.Addr, sigsetsize ui if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) // Create a new descriptor. fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{ diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index 8096a8f9c..4a68c64f3 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -196,7 +196,7 @@ func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if e != nil { return 0, nil, e.ToError() } - defer s.DecRef() + defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err @@ -230,8 +230,8 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. - defer s1.DecRef() - defer s2.DecRef() + defer s1.DecRef(t) + defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { @@ -253,7 +253,7 @@ func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if _, err := t.CopyOut(addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(fd); file != nil { - file.DecRef() + file.DecRef(t) } } return 0, nil, err @@ -273,7 +273,7 @@ func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -304,7 +304,7 @@ func accept(t *kernel.Task, fd int32, addr usermem.Addr, addrLen usermem.Addr, f if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -363,7 +363,7 @@ func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -390,7 +390,7 @@ func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -419,7 +419,7 @@ func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -450,7 +450,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -532,7 +532,7 @@ func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -570,7 +570,7 @@ func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -598,7 +598,7 @@ func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -631,7 +631,7 @@ func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -684,7 +684,7 @@ func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -778,7 +778,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC - cms.Release() + cms.Release(t) } if int(msg.Flags) != mflags { @@ -798,7 +798,7 @@ func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr usermem.Addr, fla if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } - defer cms.Release() + defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) @@ -854,7 +854,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -883,7 +883,7 @@ func recvFrom(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flag } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) - cm.Release() + cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), kernel.ERESTARTSYS) } @@ -927,7 +927,7 @@ func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -965,7 +965,7 @@ func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) @@ -1069,7 +1069,7 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), kernel.ERESTARTSYS, "sendmsg", file) if err != nil { - controlMessages.Release() + controlMessages.Release(t) } return uintptr(n), err } @@ -1087,7 +1087,7 @@ func sendTo(t *kernel.Task, fd int32, bufPtr usermem.Addr, bufLen uint64, flags if file == nil { return 0, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index 63ab11f8c..16f59fce9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -53,12 +53,12 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -175,7 +175,7 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal // On Linux, inotify behavior is not very consistent with splice(2). We try // our best to emulate Linux for very basic calls to splice, where for some // reason, events are generated for output files, but not input files. - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -203,12 +203,12 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { @@ -251,7 +251,7 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if n == 0 { return 0, nil, err } - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } @@ -266,7 +266,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if inFile == nil { return 0, nil, syserror.EBADF } - defer inFile.DecRef() + defer inFile.DecRef(t) if !inFile.IsReadable() { return 0, nil, syserror.EBADF } @@ -275,7 +275,7 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc if outFile == nil { return 0, nil, syserror.EBADF } - defer outFile.DecRef() + defer outFile.DecRef(t) if !outFile.IsWritable() { return 0, nil, syserror.EBADF } @@ -419,8 +419,8 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc return 0, nil, err } - inFile.Dentry().InotifyWithParent(linux.IN_ACCESS, 0, vfs.PathEvent) - outFile.Dentry().InotifyWithParent(linux.IN_MODIFY, 0, vfs.PathEvent) + inFile.Dentry().InotifyWithParent(t, linux.IN_ACCESS, 0, vfs.PathEvent) + outFile.Dentry().InotifyWithParent(t, linux.IN_MODIFY, 0, vfs.PathEvent) return uintptr(n), nil, nil } diff --git a/pkg/sentry/syscalls/linux/vfs2/stat.go b/pkg/sentry/syscalls/linux/vfs2/stat.go index bb1d5cac4..0f5d5189c 100644 --- a/pkg/sentry/syscalls/linux/vfs2/stat.go +++ b/pkg/sentry/syscalls/linux/vfs2/stat.go @@ -65,7 +65,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -73,7 +73,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -85,7 +85,7 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return err } @@ -96,8 +96,8 @@ func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -132,7 +132,7 @@ func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, @@ -177,7 +177,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } root := t.FSContext().RootDirectoryVFS2() - defer root.DecRef() + defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { @@ -185,7 +185,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() - defer start.DecRef() + defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { @@ -197,7 +197,7 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) - dirfile.DecRef() + dirfile.DecRef(t) if err != nil { return 0, nil, err } @@ -207,8 +207,8 @@ func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall } start = dirfile.VirtualDentry() start.IncRef() - defer start.DecRef() - dirfile.DecRef() + defer start.DecRef(t) + dirfile.DecRef(t) } } @@ -282,7 +282,7 @@ func accessAt(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) err if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. @@ -328,7 +328,7 @@ func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, siz if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -358,7 +358,7 @@ func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { @@ -377,7 +377,7 @@ func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { diff --git a/pkg/sentry/syscalls/linux/vfs2/sync.go b/pkg/sentry/syscalls/linux/vfs2/sync.go index 0d0ebf46a..a6491ac37 100644 --- a/pkg/sentry/syscalls/linux/vfs2/sync.go +++ b/pkg/sentry/syscalls/linux/vfs2/sync.go @@ -34,7 +34,7 @@ func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.SyncFS(t) } @@ -47,7 +47,7 @@ func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) return 0, nil, file.Sync(t) } @@ -77,7 +77,7 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support // is a full-file sync, i.e. fsync(2). As a result, there are severe diff --git a/pkg/sentry/syscalls/linux/vfs2/timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go index 5ac79bc09..7a26890ef 100644 --- a/pkg/sentry/syscalls/linux/vfs2/timerfd.go +++ b/pkg/sentry/syscalls/linux/vfs2/timerfd.go @@ -50,11 +50,11 @@ func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel return 0, nil, syserror.EINVAL } vfsObj := t.Kernel().VFS() - file, err := timerfd.New(vfsObj, clock, fileFlags) + file, err := timerfd.New(t, vfsObj, clock, fileFlags) if err != nil { return 0, nil, err } - defer file.DecRef() + defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.TFD_CLOEXEC != 0, }) @@ -79,7 +79,7 @@ func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { @@ -113,7 +113,7 @@ func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kerne if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { diff --git a/pkg/sentry/syscalls/linux/vfs2/xattr.go b/pkg/sentry/syscalls/linux/vfs2/xattr.go index af455d5c1..ef99246ed 100644 --- a/pkg/sentry/syscalls/linux/vfs2/xattr.go +++ b/pkg/sentry/syscalls/linux/vfs2/xattr.go @@ -49,7 +49,7 @@ func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSyml if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) names, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { @@ -72,7 +72,7 @@ func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) names, err := file.Listxattr(t, uint64(size)) if err != nil { @@ -109,7 +109,7 @@ func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return 0, nil, err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -141,7 +141,7 @@ func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -188,7 +188,7 @@ func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymli if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -222,7 +222,7 @@ func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -262,7 +262,7 @@ func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSy if err != nil { return err } - defer tpop.Release() + defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { @@ -281,7 +281,7 @@ func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel. if file == nil { return 0, nil, syserror.EBADF } - defer file.DecRef() + defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 641e3e502..5a0e3e6b5 100644 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go @@ -82,7 +82,7 @@ type anonDentry struct { } // Release implements FilesystemImpl.Release. -func (fs *anonFilesystem) Release() { +func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. @@ -294,7 +294,7 @@ func (d *anonDentry) TryIncRef() bool { } // DecRef implements DentryImpl.DecRef. -func (d *anonDentry) DecRef() { +func (d *anonDentry) DecRef(ctx context.Context) { // no-op } @@ -303,7 +303,7 @@ func (d *anonDentry) DecRef() { // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. -func (d *anonDentry) InotifyWithParent(events, cookie uint32, et EventType) {} +func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. func (d *anonDentry) Watches() *Watches { @@ -311,4 +311,4 @@ func (d *anonDentry) Watches() *Watches { } // OnZeroWatches implements Dentry.OnZeroWatches. -func (d *anonDentry) OnZeroWatches() {} +func (d *anonDentry) OnZeroWatches(context.Context) {} diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index cea3e6955..bc7ea93ea 100644 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go @@ -17,6 +17,7 @@ package vfs import ( "sync/atomic" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) @@ -102,7 +103,7 @@ type DentryImpl interface { TryIncRef() bool // DecRef decrements the Dentry's reference count. - DecRef() + DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. The parent's watches are notified first, followed @@ -113,7 +114,7 @@ type DentryImpl interface { // // Note that the events may not actually propagate up to the user, depending // on the event masks. - InotifyWithParent(events, cookie uint32, et EventType) + InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file @@ -135,7 +136,7 @@ type DentryImpl interface { // The caller does not need to hold a reference on the dentry. OnZeroWatches // may acquire inotify locks, so to prevent deadlock, no inotify locks should // be held by the caller. - OnZeroWatches() + OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. @@ -150,8 +151,8 @@ func (d *Dentry) TryIncRef() bool { } // DecRef decrements d's reference count. -func (d *Dentry) DecRef() { - d.impl.DecRef() +func (d *Dentry) DecRef(ctx context.Context) { + d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning @@ -168,8 +169,8 @@ func (d *Dentry) isMounted() bool { // InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. -func (d *Dentry) InotifyWithParent(events, cookie uint32, et EventType) { - d.impl.InotifyWithParent(events, cookie, et) +func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { + d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. @@ -182,8 +183,8 @@ func (d *Dentry) Watches() *Watches { // OnZeroWatches performs cleanup tasks whenever the number of watches on a // dentry drops to zero. -func (d *Dentry) OnZeroWatches() { - d.impl.OnZeroWatches() +func (d *Dentry) OnZeroWatches(ctx context.Context) { + d.impl.OnZeroWatches(ctx) } // The following functions are exported so that filesystem implementations can @@ -214,11 +215,11 @@ func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. -func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { +func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -226,12 +227,12 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). -func (vfs *VirtualFilesystem) InvalidateDentry(d *Dentry) { +func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { - vfs.forgetDeadMountpoint(d) + vfs.forgetDeadMountpoint(ctx, d) } } @@ -278,13 +279,13 @@ func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. -func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, to *Dentry) { +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { - vfs.forgetDeadMountpoint(to) + vfs.forgetDeadMountpoint(ctx, to) } } } @@ -303,7 +304,7 @@ func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). -func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { +func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount @@ -316,9 +317,9 @@ func (vfs *VirtualFilesystem) forgetDeadMountpoint(d *Dentry) { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 5b009b928..1b5af9f73 100644 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go @@ -93,9 +93,9 @@ type epollInterest struct { // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. -func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { +func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") - defer vd.DecRef() + defer vd.DecRef(ctx) ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } @@ -110,7 +110,7 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { } // Release implements FileDescriptionImpl.Release. -func (ep *EpollInstance) Release() { +func (ep *EpollInstance) Release(ctx context.Context) { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 93861fb4a..576ab3920 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -171,7 +171,7 @@ func (fd *FileDescription) TryIncRef() bool { } // DecRef decrements fd's reference count. -func (fd *FileDescription) DecRef() { +func (fd *FileDescription) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { // Unregister fd from all epoll instances. fd.epollMu.Lock() @@ -196,11 +196,11 @@ func (fd *FileDescription) DecRef() { } // Release implementation resources. - fd.impl.Release() + fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } - fd.vd.DecRef() + fd.vd.DecRef(ctx) fd.flagsMu.Lock() // TODO(gvisor.dev/issue/1663): We may need to unregister during save, as we do in VFS1. if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { @@ -335,7 +335,7 @@ func (fd *FileDescription) Impl() FileDescriptionImpl { type FileDescriptionImpl interface { // Release is called when the associated FileDescription reaches zero // references. - Release() + Release(ctx context.Context) // OnClose is called when a file descriptor representing the // FileDescription is closed. Note that returning a non-nil error does not @@ -526,7 +526,7 @@ func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.St Start: fd.vd, }) stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return stat, err } return fd.impl.Stat(ctx, opts) @@ -541,7 +541,7 @@ func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) err Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.SetStat(ctx, opts) @@ -557,7 +557,7 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { Start: fd.vd, }) statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return statfs, err } return fd.impl.StatFS(ctx) @@ -674,7 +674,7 @@ func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string Start: fd.vd, }) names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return names, err } names, err := fd.impl.Listxattr(ctx, size) @@ -703,7 +703,7 @@ func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) Start: fd.vd, }) val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return val, err } return fd.impl.Getxattr(ctx, *opts) @@ -719,7 +719,7 @@ func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) Start: fd.vd, }) err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Setxattr(ctx, *opts) @@ -735,7 +735,7 @@ func (fd *FileDescription) Removexattr(ctx context.Context, name string) error { Start: fd.vd, }) err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name) - vfsObj.putResolvingPath(rp) + vfsObj.putResolvingPath(ctx, rp) return err } return fd.impl.Removexattr(ctx, name) @@ -752,7 +752,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string { vfsroot := RootFromContext(ctx) s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd) if vfsroot.Ok() { - vfsroot.DecRef() + vfsroot.DecRef(ctx) } return s } diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go index 3b7e1c273..1cd607c0a 100644 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ b/pkg/sentry/vfs/file_description_impl_util_test.go @@ -80,9 +80,9 @@ type testFD struct { data DynamicBytesSource } -func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { +func newTestFD(ctx context.Context, vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() + defer vd.DecRef(ctx) var fd testFD fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) @@ -90,7 +90,7 @@ func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesS } // Release implements FileDescriptionImpl.Release. -func (fd *testFD) Release() { +func (fd *testFD) Release(context.Context) { } // SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. @@ -109,11 +109,11 @@ func TestGenCountFD(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &genCount{}) + defer fd.DecRef(ctx) // The first read causes Generate to be called to fill the FD's buffer. buf := make([]byte, 2) @@ -167,11 +167,11 @@ func TestWritable(t *testing.T) { ctx := contexttest.Context(t) vfsObj := &VirtualFilesystem{} - if err := vfsObj.Init(); err != nil { + if err := vfsObj.Init(ctx); err != nil { t.Fatalf("VFS init: %v", err) } - fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) - defer fd.DecRef() + fd := newTestFD(ctx, vfsObj, linux.O_RDWR, &storeData{data: "init"}) + defer fd.DecRef(ctx) buf := make([]byte, 10) ioseq := usermem.BytesIOSequence(buf) diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 6bb9ca180..df3758fd1 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -100,12 +100,12 @@ func (fs *Filesystem) TryIncRef() bool { } // DecRef decrements fs' reference count. -func (fs *Filesystem) DecRef() { +func (fs *Filesystem) DecRef(ctx context.Context) { if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() - fs.impl.Release() + fs.impl.Release(ctx) } else if refs < 0 { panic("Filesystem.decRef() called without holding a reference") } @@ -149,7 +149,7 @@ func (fs *Filesystem) DecRef() { type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. - Release() + Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go index 167b731ac..aff220a61 100644 --- a/pkg/sentry/vfs/inotify.go +++ b/pkg/sentry/vfs/inotify.go @@ -100,7 +100,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) - defer vd.DecRef() + defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), @@ -118,7 +118,7 @@ func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. -func (i *Inotify) Release() { +func (i *Inotify) Release(ctx context.Context) { var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to @@ -144,7 +144,7 @@ func (i *Inotify) Release() { i.mu.Unlock() for _, d := range ds { - d.OnZeroWatches() + d.OnZeroWatches(ctx) } } @@ -350,7 +350,7 @@ func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. -func (i *Inotify) RmWatch(wd int32) error { +func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. @@ -374,7 +374,7 @@ func (i *Inotify) RmWatch(wd int32) error { i.mu.Unlock() if remaining == 0 { - w.target.OnZeroWatches() + w.target.OnZeroWatches(ctx) } // Generate the event for the removal. @@ -462,7 +462,7 @@ func (w *Watches) Remove(id uint64) { // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. -func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlinked bool) { +func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { var hasExpired bool w.mu.RLock() for _, watch := range w.ws { @@ -476,13 +476,13 @@ func (w *Watches) Notify(name string, events, cookie uint32, et EventType, unlin w.mu.RUnlock() if hasExpired { - w.cleanupExpiredWatches() + w.cleanupExpiredWatches(ctx) } } // This function is relatively expensive and should only be called where there // are expired watches. -func (w *Watches) cleanupExpiredWatches() { +func (w *Watches) cleanupExpiredWatches(ctx context.Context) { // Because of lock ordering, we cannot acquire Inotify.mu for each watch // owner while holding w.mu. As a result, store expired watches locally // before removing. @@ -495,15 +495,15 @@ func (w *Watches) cleanupExpiredWatches() { } w.mu.RUnlock() for _, watch := range toRemove { - watch.owner.RmWatch(watch.wd) + watch.owner.RmWatch(ctx, watch.wd) } } // HandleDeletion is called when the watch target is destroyed. Clear the // watch set, detach watches from the inotify instances they belong to, and // generate the appropriate events. -func (w *Watches) HandleDeletion() { - w.Notify("", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) +func (w *Watches) HandleDeletion(ctx context.Context) { + w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for // the owner of each watch being deleted. Instead, atomically store the @@ -744,12 +744,12 @@ func InotifyEventFromStatMask(mask uint32) uint32 { // InotifyRemoveChild sends the appriopriate notifications to the watch sets of // the child being removed and its parent. Note that unlike most pairs of // parent/child notifications, the child is notified first in this case. -func InotifyRemoveChild(self, parent *Watches, name string) { +func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) { if self != nil { - self.Notify("", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) + self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */) } if parent != nil { - parent.Notify(name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) + parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */) } } @@ -762,13 +762,13 @@ func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, } cookie := uniqueid.InotifyCookie(ctx) if oldParent != nil { - oldParent.Notify(oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) + oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */) } if newParent != nil { - newParent.Notify(newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) + newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */) } // Somewhat surprisingly, self move events do not have a cookie. if renamed != nil { - renamed.Notify("", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) + renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */) } } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 32f901bd8..d1d29d0cd 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -200,8 +200,8 @@ func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth if err != nil { return nil, err } - defer root.DecRef() - defer fs.DecRef() + defer root.DecRef(ctx) + defer fs.DecRef(ctx) return vfs.NewDisconnectedMount(fs, root, opts) } @@ -221,7 +221,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr if vd.dentry.dead { vd.dentry.mu.Unlock() vfs.mountMu.Unlock() - vd.DecRef() + vd.DecRef(ctx) return syserror.ENOENT } // vd might have been mounted over between vfs.GetDentryAt() and @@ -243,7 +243,7 @@ func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Cr // This can't fail since we're holding vfs.mountMu. nextmnt.root.IncRef() vd.dentry.mu.Unlock() - vd.DecRef() + vd.DecRef(ctx) vd = VirtualDentry{ mount: nextmnt, dentry: nextmnt.root, @@ -268,7 +268,7 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia if err != nil { return err } - defer mnt.DecRef() + defer mnt.DecRef(ctx) if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { return err } @@ -293,13 +293,13 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti if err != nil { return err } - defer vd.DecRef() + defer vd.DecRef(ctx) if vd.dentry != vd.mount.root { return syserror.EINVAL } vfs.mountMu.Lock() if mntns := MountNamespaceFromContext(ctx); mntns != nil { - defer mntns.DecRef() + defer mntns.DecRef(ctx) if mntns != vd.mount.ns { vfs.mountMu.Unlock() return syserror.EINVAL @@ -335,10 +335,10 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } return nil } @@ -479,7 +479,7 @@ func (mnt *Mount) IncRef() { } // DecRef decrements mnt's reference count. -func (mnt *Mount) DecRef() { +func (mnt *Mount) DecRef(ctx context.Context) { refs := atomic.AddInt64(&mnt.refs, -1) if refs&^math.MinInt64 == 0 { // mask out MSB var vd VirtualDentry @@ -490,10 +490,10 @@ func (mnt *Mount) DecRef() { mnt.vfs.mounts.seq.EndWrite() mnt.vfs.mountMu.Unlock() } - mnt.root.DecRef() - mnt.fs.DecRef() + mnt.root.DecRef(ctx) + mnt.fs.DecRef(ctx) if vd.Ok() { - vd.DecRef() + vd.DecRef(ctx) } } } @@ -506,7 +506,7 @@ func (mntns *MountNamespace) IncRef() { } // DecRef decrements mntns' reference count. -func (mntns *MountNamespace) DecRef() { +func (mntns *MountNamespace) DecRef(ctx context.Context) { vfs := mntns.root.fs.VirtualFilesystem() if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 { vfs.mountMu.Lock() @@ -517,10 +517,10 @@ func (mntns *MountNamespace) DecRef() { vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { - vd.DecRef() + vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { - mnt.DecRef() + mnt.DecRef(ctx) } } else if refs < 0 { panic("MountNamespace.DecRef() called without holding a reference") @@ -534,7 +534,7 @@ func (mntns *MountNamespace) DecRef() { // getMountAt is analogous to Linux's fs/namei.c:follow_mount(). // // Preconditions: References are held on mnt and d. -func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { +func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount { // The first mount is special-cased: // // - The caller is assumed to have checked d.isMounted() already. (This @@ -565,7 +565,7 @@ retryFirst: // Raced with umount. continue } - mnt.DecRef() + mnt.DecRef(ctx) mnt = next d = next.root } @@ -578,7 +578,7 @@ retryFirst: // // Preconditions: References are held on mnt and root. vfsroot is not (mnt, // mnt.root). -func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry { +func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry { // The first mount is special-cased: // // - The caller must have already checked mnt against vfsroot. @@ -602,12 +602,12 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can only // happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryFirst } mnt = parent @@ -635,16 +635,16 @@ retryFirst: if !point.TryIncRef() { // Since Mount holds a reference on Mount.key.point, this can // only happen due to a racing change to Mount.key. - parent.DecRef() + parent.DecRef(ctx) goto retryNotFirst } if !vfs.mounts.seq.ReadOk(epoch) { - point.DecRef() - parent.DecRef() + point.DecRef(ctx) + parent.DecRef(ctx) goto retryNotFirst } - d.DecRef() - mnt.DecRef() + d.DecRef(ctx) + mnt.DecRef(ctx) mnt = parent d = point } diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index cd78d66bc..e4da15009 100644 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go @@ -47,7 +47,7 @@ func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() @@ -64,12 +64,12 @@ loop: // of FilesystemImpl.PrependPath() may return nil instead. break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -101,7 +101,7 @@ func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() loop: @@ -112,12 +112,12 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd haveRef := false defer func() { if haveRef { - vd.DecRef() + vd.DecRef(ctx) } }() unreachable := false @@ -157,13 +157,13 @@ loop: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } - nextVD := vfs.getMountpointAt(vd.mount, vfsroot) + nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { - vd.DecRef() + vd.DecRef(ctx) } vd = nextVD haveRef = true diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 9d047ff88..3304372d9 100644 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" @@ -136,31 +137,31 @@ func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *Pat return rp } -func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { +func (vfs *VirtualFilesystem) putResolvingPath(ctx context.Context, rp *ResolvingPath) { rp.root = VirtualDentry{} - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil - rp.releaseErrorState() + rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } -func (rp *ResolvingPath) decRefStartAndMount() { +func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { - rp.start.DecRef() + rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { - rp.mount.DecRef() + rp.mount.DecRef(ctx) } } -func (rp *ResolvingPath) releaseErrorState() { +func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { - rp.nextStart.DecRef() + rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { - rp.nextMount.DecRef() + rp.nextMount.DecRef(ctx) rp.nextMount = nil } } @@ -236,13 +237,13 @@ func (rp *ResolvingPath) Advance() { // Restart resets the stream of path components represented by rp to its state // on entry to the current FilesystemImpl method. -func (rp *ResolvingPath) Restart() { +func (rp *ResolvingPath) Restart(ctx context.Context) { rp.pit = rp.origParts[rp.numOrigParts-1] rp.mustBeDir = rp.mustBeDirOrig rp.symlinks = rp.symlinksOrig rp.curPart = rp.numOrigParts - 1 copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) - rp.releaseErrorState() + rp.releaseErrorState(ctx) } func (rp *ResolvingPath) relpathCommit() { @@ -260,13 +261,13 @@ func (rp *ResolvingPath) relpathCommit() { // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). -func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { +func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... - vd := rp.vfs.getMountpointAt(rp.mount, rp.root) + vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... of non-root mount. rp.nextMount = vd.mount @@ -283,11 +284,11 @@ func (rp *ResolvingPath) CheckRoot(d *Dentry) (bool, error) { // to d. If d is a mount point, such that path resolution should switch to // another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount // returns nil. -func (rp *ResolvingPath) CheckMount(d *Dentry) error { +func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error { if !d.isMounted() { return nil } - if mnt := rp.vfs.getMountAt(rp.mount, d); mnt != nil { + if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil { rp.nextMount = mnt return resolveMountPointError{} } @@ -389,11 +390,11 @@ func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { return resolveMountRootOrJumpError{} } -func (rp *ResolvingPath) handleError(err error) bool { +func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -412,7 +413,7 @@ func (rp *ResolvingPath) handleError(err error) bool { case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef @@ -423,12 +424,12 @@ func (rp *ResolvingPath) handleError(err error) bool { // path. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. - rp.decRefStartAndMount() + rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef @@ -440,7 +441,7 @@ func (rp *ResolvingPath) handleError(err error) bool { // path, including the symlink target we just prepended. rp.relpathCommit() // Restart path resolution on the new Mount. - rp.releaseErrorState() + rp.releaseErrorState(ctx) return true default: diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 522e27475..9c2420683 100644 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go @@ -122,7 +122,7 @@ type VirtualFilesystem struct { } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. -func (vfs *VirtualFilesystem) Init() error { +func (vfs *VirtualFilesystem) Init(ctx context.Context) error { if vfs.mountpoints != nil { panic("VFS already initialized") } @@ -145,7 +145,7 @@ func (vfs *VirtualFilesystem) Init() error { devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) - defer anonfs.vfsfs.DecRef() + defer anonfs.vfsfs.DecRef(ctx) anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { // We should not be passing any MountOptions that would cause @@ -192,11 +192,11 @@ func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -214,11 +214,11 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede dentry: d, } rp.mount.IncRef() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return vd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, err } } @@ -236,7 +236,7 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au } rp.mount.IncRef() name := rp.Component() - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return parentVD, name, nil } if checkInvariants { @@ -244,8 +244,8 @@ func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *au panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return VirtualDentry{}, "", err } } @@ -260,14 +260,14 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential } if !newpop.Path.Begin.Ok() { - oldVD.DecRef() + oldVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldVD.DecRef() + oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return syserror.EINVAL } @@ -276,8 +276,8 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { - vfs.putResolvingPath(rp) - oldVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return nil } if checkInvariants { @@ -285,9 +285,9 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldVD.DecRef(ctx) return err } } @@ -313,7 +313,7 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -321,8 +321,8 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -346,7 +346,7 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -354,8 +354,8 @@ func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -408,31 +408,31 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) if opts.FileExec { if fd.Mount().Flags.NoExec { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { - fd.DecRef() + fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { - fd.DecRef() + fd.DecRef(ctx) return nil, syserror.EACCES } } - fd.Dentry().InotifyWithParent(linux.IN_OPEN, 0, PathEvent) + fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -444,11 +444,11 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return target, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -472,19 +472,19 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti return err } if oldName == "." || oldName == ".." { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) return syserror.EBUSY } if !newpop.Path.Begin.Ok() { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return syserror.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { - oldParentVD.DecRef() + oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return syserror.EINVAL } @@ -497,8 +497,8 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return nil } if checkInvariants { @@ -506,9 +506,9 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) - oldParentVD.DecRef() + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) + oldParentVD.DecRef(ctx) return err } } @@ -531,7 +531,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -539,8 +539,8 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -552,11 +552,11 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -568,11 +568,11 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return stat, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statx{}, err } } @@ -585,11 +585,11 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return statfs, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return linux.Statfs{}, err } } @@ -612,7 +612,7 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -620,8 +620,8 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -644,7 +644,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } if checkInvariants { @@ -652,8 +652,8 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -671,7 +671,7 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return bep, nil } if checkInvariants { @@ -679,8 +679,8 @@ func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.C panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -693,7 +693,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede for { names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return names, nil } if err == syserror.ENOTSUP { @@ -701,11 +701,11 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return nil, err } } @@ -718,11 +718,11 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden for { val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return val, nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return "", err } } @@ -735,11 +735,11 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden for { err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -751,11 +751,11 @@ func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Cre for { err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name) if err == nil { - vfs.putResolvingPath(rp) + vfs.putResolvingPath(ctx, rp) return nil } - if !rp.handleError(err) { - vfs.putResolvingPath(rp) + if !rp.handleError(ctx, err) { + vfs.putResolvingPath(ctx, rp) return err } } @@ -777,7 +777,7 @@ func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { if err := fs.impl.Sync(ctx); err != nil && retErr == nil { retErr = err } - fs.DecRef() + fs.DecRef(ctx) } return retErr } @@ -831,9 +831,9 @@ func (vd VirtualDentry) IncRef() { // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. -func (vd VirtualDentry) DecRef() { - vd.dentry.DecRef() - vd.mount.DecRef() +func (vd VirtualDentry) DecRef(ctx context.Context) { + vd.dentry.DecRef(ctx) + vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. It does not take a reference on diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index e0db6cf54..6c137f693 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -12,6 +12,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/abi/linux", + "//pkg/context", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 04ae58e59..22b0a12bd 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -64,14 +65,14 @@ func (d *Device) beforeSave() { } // Release implements fs.FileOperations.Release. -func (d *Device) Release() { +func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. if d.endpoint != nil { d.endpoint.RemoveNotify(d.notifyHandle) - d.endpoint.DecRef() + d.endpoint.DecRef(ctx) d.endpoint = nil } } @@ -341,8 +342,8 @@ type tunEndpoint struct { } // DecRef decrements refcount of e, removes NIC if refcount goes to 0. -func (e *tunEndpoint) DecRef() { - e.DecRefWithDestructor(func() { +func (e *tunEndpoint) DecRef(ctx context.Context) { + e.DecRefWithDestructor(ctx, func(context.Context) { e.stack.RemoveNIC(e.nicID) }) } diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 59639ba19..9dd5b0184 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -640,7 +640,7 @@ func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Confi func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, m := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options) @@ -868,7 +868,7 @@ func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns if err != nil { return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err) } - defer dirent.DecRef() + defer dirent.DecRef(ctx) if err := mns.Mount(ctx, dirent, inode); err != nil { return fmt.Errorf("mount %q error: %v", m.Destination, err) } @@ -889,12 +889,12 @@ func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.Moun if err != nil { return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err) } - defer target.DecRef() + defer target.DecRef(ctx) // Take a ref on the inode that is about to be (re)-mounted. source.root.IncRef() if err := mns.Mount(ctx, target, source.root); err != nil { - source.root.DecRef() + source.root.DecRef(ctx) return fmt.Errorf("bind mount %q error: %v", mount.Destination, err) } @@ -997,12 +997,12 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M switch err { case nil: // Found '/tmp' in filesystem, check if it's empty. - defer tmp.DecRef() + defer tmp.DecRef(ctx) f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true}) if err != nil { return err } - defer f.DecRef() + defer f.DecRef(ctx) serializer := &fs.CollectEntriesSerializer{} if err := f.Readdir(ctx, serializer); err != nil { return err diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 9cd9c5909..e0d077f5a 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -346,7 +346,7 @@ func New(args Args) (*Loader, error) { if err != nil { return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err) } - defer hostFilesystem.DecRef() + defer hostFilesystem.DecRef(k.SupervisorContext()) hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{}) if err != nil { return nil, fmt.Errorf("failed to create hostfs mount: %v", err) @@ -755,7 +755,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn return nil, fmt.Errorf("creating process: %v", err) } // CreateProcess takes a reference on FDTable if successful. - info.procArgs.FDTable.DecRef() + info.procArgs.FDTable.DecRef(ctx) // Set the foreground process group on the TTY to the global init process // group, since that is what we are about to start running. @@ -890,22 +890,20 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) { // Add the HOME environment variable if it is not already set. if kernel.VFS2Enabled { - defer args.MountNamespaceVFS2.DecRef() - root := args.MountNamespaceVFS2.Root() - defer root.DecRef() ctx := vfs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespaceVFS2.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv) if err != nil { return 0, err } args.Envv = envv } else { - defer args.MountNamespace.DecRef() - root := args.MountNamespace.Root() - defer root.DecRef() ctx := fs.WithRoot(l.k.SupervisorContext(), root) + defer args.MountNamespace.DecRef(ctx) + defer root.DecRef(ctx) envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv) if err != nil { return 0, err @@ -1263,7 +1261,7 @@ func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.F fdTable := k.NewFDTable() ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs) if err != nil { - fdTable.DecRef() + fdTable.DecRef(ctx) return nil, nil, nil, err } return fdTable, ttyFile, ttyFileVFS2, nil diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 8e6fe57e1..aa3fdf96c 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -450,13 +450,13 @@ func TestCreateMountNamespace(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { maxTraversals := uint(0) if d, err := mns.FindInode(ctx, root, root, p, &maxTraversals); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) @@ -491,7 +491,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) for _, p := range tc.expectedPaths { target := &vfs.PathOperation{ Root: root, @@ -502,7 +502,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { if d, err := l.k.VFS().GetDentryAt(ctx, l.root.procArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) } else { - d.DecRef() + d.DecRef(ctx) } } }) diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index cfe2d36aa..252ca07e3 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -103,7 +103,7 @@ func registerFilesystems(k *kernel.Kernel) error { if err != nil { return fmt.Errorf("creating devtmpfs accessor: %w", err) } - defer a.Release() + defer a.Release(ctx) if err := a.UserspaceInit(ctx); err != nil { return fmt.Errorf("initializing userspace: %w", err) @@ -252,7 +252,7 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) { func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error { root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) target := &vfs.PathOperation{ Root: root, Start: root, @@ -387,7 +387,7 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) pop := vfs.PathOperation{ Root: root, Start: root, @@ -481,10 +481,10 @@ func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *Co if err != nil { return err } - defer newMnt.DecRef() + defer newMnt.DecRef(ctx) root := mns.Root() - defer root.DecRef() + defer root.DecRef(ctx) if err := c.makeSyntheticMount(ctx, mount.Destination, root, creds); err != nil { return err } -- cgit v1.2.3 From ad7c9fc4c3c027ea7ed211a6bdb35c38156280e6 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Mon, 3 Aug 2020 18:15:11 -0700 Subject: [vfs2] Implement /sys/devices/system/cpu/cpuX. Fixes #3364 PiperOrigin-RevId: 324724614 --- pkg/sentry/fsimpl/sys/sys.go | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go index f81b0c38f..0401726b6 100644 --- a/pkg/sentry/fsimpl/sys/sys.go +++ b/pkg/sentry/fsimpl/sys/sys.go @@ -30,6 +30,7 @@ import ( // Name is the default filesystem name. const Name = "sysfs" +const defaultSysDirMode = linux.FileMode(0755) // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} @@ -57,9 +58,6 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt devMinor: devMinor, } fs.VFSFilesystem().Init(vfsObj, &fsType, fs) - k := kernel.KernelFromContext(ctx) - maxCPUCores := k.ApplicationCores() - defaultSysDirMode := linux.FileMode(0755) root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ "block": fs.newDir(creds, defaultSysDirMode, nil), @@ -70,11 +68,7 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt "dev": fs.newDir(creds, defaultSysDirMode, nil), "devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ "system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "cpu": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{ - "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), - }), + "cpu": cpuDir(ctx, fs, creds), }), }), "firmware": fs.newDir(creds, defaultSysDirMode, nil), @@ -86,6 +80,20 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt return fs.VFSFilesystem(), root.VFSDentry(), nil } +func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry { + k := kernel.KernelFromContext(ctx) + maxCPUCores := k.ApplicationCores() + children := map[string]*kernfs.Dentry{ + "online": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + "present": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)), + } + for i := uint(0); i < maxCPUCores; i++ { + children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil) + } + return fs.newDir(creds, defaultSysDirMode, children) +} + // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) -- cgit v1.2.3 From d64ba89da3a5e8aaf28a9da1b40cb2231a4c6b42 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Tue, 4 Aug 2020 09:29:11 -0700 Subject: Internal change. PiperOrigin-RevId: 324826968 --- pkg/sentry/fsimpl/testutil/kernel.go | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 1e57744e8..1813269e0 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -122,6 +122,10 @@ func Boot() (*kernel.Kernel, error) { // CreateTask creates a new bare bones task for tests. func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) { k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("cannot find kernel from context") + } + exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root) if err != nil { return nil, err -- cgit v1.2.3 From 0500f84b6f2e279ce953faaff20454aca69ae493 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 4 Aug 2020 15:46:33 -0700 Subject: Add reference counting utility to VFS2. The utility has several differences from the VFS1 equivalent: - There are no weak references, which have a significant overhead - In order to print useful debug messages with the type of the reference- counted object, we use a generic Refs object with the owner type as a template parameter. In vfs1, this was accomplished by storing a type name and caller stack directly in the ref count (as in vfs1), which increases the struct size by 6x. (Note that the caller stack was needed because fs types like Dirent were shared by all fs implementations; in vfs2, each impl has its own data structures, so this is no longer necessary.) As an example, the utility is added to tmpfs.inode. Updates #1486. PiperOrigin-RevId: 324906582 --- pkg/refs/refcounter.go | 7 +++ pkg/refs_vfs2/BUILD | 28 +++++++++ pkg/refs_vfs2/refs.go | 36 +++++++++++ pkg/refs_vfs2/refs_template.go | 133 +++++++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/tmpfs/BUILD | 13 ++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 27 ++------ 6 files changed, 223 insertions(+), 21 deletions(-) create mode 100644 pkg/refs_vfs2/BUILD create mode 100644 pkg/refs_vfs2/refs.go create mode 100644 pkg/refs_vfs2/refs_template.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 61790221b..3f39edb66 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -215,6 +215,8 @@ type AtomicRefCount struct { // LeakMode configures the leak checker. type LeakMode uint32 +// TODO(gvisor.dev/issue/1624): Simplify down to two modes once vfs1 ref +// counting is gone. const ( // UninitializedLeakChecking indicates that the leak checker has not yet been initialized. UninitializedLeakChecking LeakMode = iota @@ -244,6 +246,11 @@ func SetLeakMode(mode LeakMode) { atomic.StoreUint32(&leakMode, uint32(mode)) } +// GetLeakMode returns the current leak mode. +func GetLeakMode() LeakMode { + return LeakMode(atomic.LoadUint32(&leakMode)) +} + const maxStackFrames = 40 type fileLine struct { diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD new file mode 100644 index 000000000..7f180c7bd --- /dev/null +++ b/pkg/refs_vfs2/BUILD @@ -0,0 +1,28 @@ +load("//tools:defs.bzl", "go_library") +load("//tools/go_generics:defs.bzl", "go_template") + +package(licenses = ["notice"]) + +go_template( + name = "refs_template", + srcs = [ + "refs_template.go", + ], + types = [ + "T", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/refs", + ], +) + +go_library( + name = "refs", + srcs = [ + "refs.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/context"], +) diff --git a/pkg/refs_vfs2/refs.go b/pkg/refs_vfs2/refs.go new file mode 100644 index 000000000..ee01b17b0 --- /dev/null +++ b/pkg/refs_vfs2/refs.go @@ -0,0 +1,36 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package refs defines an interface for a reference-counted object. +package refs + +import ( + "gvisor.dev/gvisor/pkg/context" +) + +// RefCounter is the interface to be implemented by objects that are reference +// counted. +type RefCounter interface { + // IncRef increments the reference counter on the object. + IncRef() + + // DecRef decrements the object's reference count. Users of refs_template.Refs + // may specify a destructor to be called once the reference count reaches zero. + DecRef(ctx context.Context) + + // TryIncRef attempts to increment the reference count, but may fail if all + // references have already been dropped, in which case it returns false. If + // true is returned, then a valid reference is now held on the object. + TryIncRef() bool +} diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go new file mode 100644 index 000000000..3e5b458c7 --- /dev/null +++ b/pkg/refs_vfs2/refs_template.go @@ -0,0 +1,133 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package refs_template defines a template that can be used by reference counted +// objects. +package refs_template + +import ( + "runtime" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/log" + refs_vfs1 "gvisor.dev/gvisor/pkg/refs" +) + +// T is the type of the reference counted object. It is only used to customize +// debug output when leak checking. +type T interface{} + +// ownerType is used to customize logging. Note that we use a pointer to T so +// that we do not copy the entire object when passed as a format parameter. +var ownerType *T + +// Refs implements refs.RefCounter. It keeps a reference count using atomic +// operations and calls the destructor when the count reaches zero. +// +// Note that the number of references is actually refCount + 1 so that a default +// zero-value Refs object contains one reference. +// +// +stateify savable +type Refs struct { + // refCount is composed of two fields: + // + // [32-bit speculative references]:[32-bit real references] + // + // Speculative references are used for TryIncRef, to avoid a CompareAndSwap + // loop. See IncRef, DecRef and TryIncRef for details of how these fields are + // used. + refCount int64 +} + +func (r *Refs) finalize() { + var note string + switch refs_vfs1.GetLeakMode() { + case refs_vfs1.NoLeakChecking: + return + case refs_vfs1.UninitializedLeakChecking: + note = "(Leak checker uninitialized): " + } + if n := r.ReadRefs(); n != 0 { + log.Warningf("%sAtomicRefCount %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n) + } +} + +// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. +func (r *Refs) EnableLeakCheck() { + if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { + runtime.SetFinalizer(r, (*Refs).finalize) + } +} + +// ReadRefs returns the current number of references. The returned count is +// inherently racy and is unsafe to use without external synchronization. +func (r *Refs) ReadRefs() int64 { + // Account for the internal -1 offset on refcounts. + return atomic.LoadInt64(&r.refCount) + 1 +} + +// IncRef implements refs.RefCounter.IncRef. +// +//go:nosplit +func (r *Refs) IncRef() { + if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { + panic("Incrementing non-positive ref count") + } +} + +// TryIncRef implements refs.RefCounter.TryIncRef. +// +// To do this safely without a loop, a speculative reference is first acquired +// on the object. This allows multiple concurrent TryIncRef calls to distinguish +// other TryIncRef calls from genuine references held. +// +//go:nosplit +func (r *Refs) TryIncRef() bool { + const speculativeRef = 1 << 32 + v := atomic.AddInt64(&r.refCount, speculativeRef) + if int32(v) < 0 { + // This object has already been freed. + atomic.AddInt64(&r.refCount, -speculativeRef) + return false + } + + // Turn into a real reference. + atomic.AddInt64(&r.refCount, -speculativeRef+1) + return true +} + +// DecRef implements refs.RefCounter.DecRef. +// +// Note that speculative references are counted here. Since they were added +// prior to real references reaching zero, they will successfully convert to +// real references. In other words, we see speculative references only in the +// following case: +// +// A: TryIncRef [speculative increase => sees non-negative references] +// B: DecRef [real decrease] +// A: TryIncRef [transform speculative to real] +// +//go:nosplit +func (r *Refs) DecRef(destroy func()) { + switch v := atomic.AddInt64(&r.refCount, -1); { + case v < -1: + panic("Decrementing non-positive ref count") + + case v == -1: + // Call the destructor. + if destroy != nil { + destroy() + } + } +} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index e73732a6b..5cd428d64 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -26,6 +26,17 @@ go_template_instance( }, ) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "tmpfs", + prefix = "inode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "inode", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -34,6 +45,7 @@ go_library( "directory.go", "filesystem.go", "fstree.go", + "inode_refs.go", "named_pipe.go", "regular_file.go", "socket_file.go", @@ -47,6 +59,7 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 68e615e8b..5640380dc 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -285,13 +285,10 @@ type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem - // refs is a reference count. refs is accessed using atomic memory - // operations. - // // A reference is held on all inodes as long as they are reachable in the // filesystem tree, i.e. nlink is nonzero. This reference is dropped when // nlink reaches 0. - refs int64 + refs inodeRefs // xattrs implements extended attributes. // @@ -327,7 +324,6 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth panic("file type is required in FileMode") } i.fs = fs - i.refs = 1 i.mode = uint32(mode) i.uid = uint32(kuid) i.gid = uint32(kgid) @@ -339,6 +335,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth i.mtime = now // i.nlink initialized by caller i.impl = impl + i.refs.EnableLeakCheck() } // incLinksLocked increments i's link count. @@ -369,25 +366,15 @@ func (i *inode) decLinksLocked(ctx context.Context) { } func (i *inode) incRef() { - if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("tmpfs.inode.incRef() called without holding a reference") - } + i.refs.IncRef() } func (i *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&i.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { - return true - } - } + return i.refs.TryIncRef() } func (i *inode) decRef(ctx context.Context) { - if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.refs.DecRef(func() { i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is @@ -395,9 +382,7 @@ func (i *inode) decRef(ctx context.Context) { // metadata. regFile.data.DropAll(regFile.memFile) } - } else if refs < 0 { - panic("tmpfs.inode.decRef() called without holding a reference") - } + }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { -- cgit v1.2.3 From b44408b40e3e8762a77ccf5eeb7f2ef567235c43 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Tue, 4 Aug 2020 18:18:01 -0700 Subject: Automated rollback of changelist 324906582 PiperOrigin-RevId: 324931854 --- pkg/refs/refcounter.go | 7 --- pkg/refs_vfs2/BUILD | 28 --------- pkg/refs_vfs2/refs.go | 36 ----------- pkg/refs_vfs2/refs_template.go | 133 --------------------------------------- pkg/sentry/fsimpl/tmpfs/BUILD | 13 ---- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 27 ++++++-- 6 files changed, 21 insertions(+), 223 deletions(-) delete mode 100644 pkg/refs_vfs2/BUILD delete mode 100644 pkg/refs_vfs2/refs.go delete mode 100644 pkg/refs_vfs2/refs_template.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/refs/refcounter.go b/pkg/refs/refcounter.go index 3f39edb66..61790221b 100644 --- a/pkg/refs/refcounter.go +++ b/pkg/refs/refcounter.go @@ -215,8 +215,6 @@ type AtomicRefCount struct { // LeakMode configures the leak checker. type LeakMode uint32 -// TODO(gvisor.dev/issue/1624): Simplify down to two modes once vfs1 ref -// counting is gone. const ( // UninitializedLeakChecking indicates that the leak checker has not yet been initialized. UninitializedLeakChecking LeakMode = iota @@ -246,11 +244,6 @@ func SetLeakMode(mode LeakMode) { atomic.StoreUint32(&leakMode, uint32(mode)) } -// GetLeakMode returns the current leak mode. -func GetLeakMode() LeakMode { - return LeakMode(atomic.LoadUint32(&leakMode)) -} - const maxStackFrames = 40 type fileLine struct { diff --git a/pkg/refs_vfs2/BUILD b/pkg/refs_vfs2/BUILD deleted file mode 100644 index 7f180c7bd..000000000 --- a/pkg/refs_vfs2/BUILD +++ /dev/null @@ -1,28 +0,0 @@ -load("//tools:defs.bzl", "go_library") -load("//tools/go_generics:defs.bzl", "go_template") - -package(licenses = ["notice"]) - -go_template( - name = "refs_template", - srcs = [ - "refs_template.go", - ], - types = [ - "T", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/log", - "//pkg/refs", - ], -) - -go_library( - name = "refs", - srcs = [ - "refs.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = ["//pkg/context"], -) diff --git a/pkg/refs_vfs2/refs.go b/pkg/refs_vfs2/refs.go deleted file mode 100644 index ee01b17b0..000000000 --- a/pkg/refs_vfs2/refs.go +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package refs defines an interface for a reference-counted object. -package refs - -import ( - "gvisor.dev/gvisor/pkg/context" -) - -// RefCounter is the interface to be implemented by objects that are reference -// counted. -type RefCounter interface { - // IncRef increments the reference counter on the object. - IncRef() - - // DecRef decrements the object's reference count. Users of refs_template.Refs - // may specify a destructor to be called once the reference count reaches zero. - DecRef(ctx context.Context) - - // TryIncRef attempts to increment the reference count, but may fail if all - // references have already been dropped, in which case it returns false. If - // true is returned, then a valid reference is now held on the object. - TryIncRef() bool -} diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go deleted file mode 100644 index 3e5b458c7..000000000 --- a/pkg/refs_vfs2/refs_template.go +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package refs_template defines a template that can be used by reference counted -// objects. -package refs_template - -import ( - "runtime" - "sync/atomic" - - "gvisor.dev/gvisor/pkg/log" - refs_vfs1 "gvisor.dev/gvisor/pkg/refs" -) - -// T is the type of the reference counted object. It is only used to customize -// debug output when leak checking. -type T interface{} - -// ownerType is used to customize logging. Note that we use a pointer to T so -// that we do not copy the entire object when passed as a format parameter. -var ownerType *T - -// Refs implements refs.RefCounter. It keeps a reference count using atomic -// operations and calls the destructor when the count reaches zero. -// -// Note that the number of references is actually refCount + 1 so that a default -// zero-value Refs object contains one reference. -// -// +stateify savable -type Refs struct { - // refCount is composed of two fields: - // - // [32-bit speculative references]:[32-bit real references] - // - // Speculative references are used for TryIncRef, to avoid a CompareAndSwap - // loop. See IncRef, DecRef and TryIncRef for details of how these fields are - // used. - refCount int64 -} - -func (r *Refs) finalize() { - var note string - switch refs_vfs1.GetLeakMode() { - case refs_vfs1.NoLeakChecking: - return - case refs_vfs1.UninitializedLeakChecking: - note = "(Leak checker uninitialized): " - } - if n := r.ReadRefs(); n != 0 { - log.Warningf("%sAtomicRefCount %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n) - } -} - -// EnableLeakCheck checks for reference leaks when Refs gets garbage collected. -func (r *Refs) EnableLeakCheck() { - if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { - runtime.SetFinalizer(r, (*Refs).finalize) - } -} - -// ReadRefs returns the current number of references. The returned count is -// inherently racy and is unsafe to use without external synchronization. -func (r *Refs) ReadRefs() int64 { - // Account for the internal -1 offset on refcounts. - return atomic.LoadInt64(&r.refCount) + 1 -} - -// IncRef implements refs.RefCounter.IncRef. -// -//go:nosplit -func (r *Refs) IncRef() { - if v := atomic.AddInt64(&r.refCount, 1); v <= 0 { - panic("Incrementing non-positive ref count") - } -} - -// TryIncRef implements refs.RefCounter.TryIncRef. -// -// To do this safely without a loop, a speculative reference is first acquired -// on the object. This allows multiple concurrent TryIncRef calls to distinguish -// other TryIncRef calls from genuine references held. -// -//go:nosplit -func (r *Refs) TryIncRef() bool { - const speculativeRef = 1 << 32 - v := atomic.AddInt64(&r.refCount, speculativeRef) - if int32(v) < 0 { - // This object has already been freed. - atomic.AddInt64(&r.refCount, -speculativeRef) - return false - } - - // Turn into a real reference. - atomic.AddInt64(&r.refCount, -speculativeRef+1) - return true -} - -// DecRef implements refs.RefCounter.DecRef. -// -// Note that speculative references are counted here. Since they were added -// prior to real references reaching zero, they will successfully convert to -// real references. In other words, we see speculative references only in the -// following case: -// -// A: TryIncRef [speculative increase => sees non-negative references] -// B: DecRef [real decrease] -// A: TryIncRef [transform speculative to real] -// -//go:nosplit -func (r *Refs) DecRef(destroy func()) { - switch v := atomic.AddInt64(&r.refCount, -1); { - case v < -1: - panic("Decrementing non-positive ref count") - - case v == -1: - // Call the destructor. - if destroy != nil { - destroy() - } - } -} diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index 5cd428d64..e73732a6b 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -26,17 +26,6 @@ go_template_instance( }, ) -go_template_instance( - name = "inode_refs", - out = "inode_refs.go", - package = "tmpfs", - prefix = "inode", - template = "//pkg/refs_vfs2:refs_template", - types = { - "T": "inode", - }, -) - go_library( name = "tmpfs", srcs = [ @@ -45,7 +34,6 @@ go_library( "directory.go", "filesystem.go", "fstree.go", - "inode_refs.go", "named_pipe.go", "regular_file.go", "socket_file.go", @@ -59,7 +47,6 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/log", - "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 5640380dc..68e615e8b 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -285,10 +285,13 @@ type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem + // refs is a reference count. refs is accessed using atomic memory + // operations. + // // A reference is held on all inodes as long as they are reachable in the // filesystem tree, i.e. nlink is nonzero. This reference is dropped when // nlink reaches 0. - refs inodeRefs + refs int64 // xattrs implements extended attributes. // @@ -324,6 +327,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth panic("file type is required in FileMode") } i.fs = fs + i.refs = 1 i.mode = uint32(mode) i.uid = uint32(kuid) i.gid = uint32(kgid) @@ -335,7 +339,6 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth i.mtime = now // i.nlink initialized by caller i.impl = impl - i.refs.EnableLeakCheck() } // incLinksLocked increments i's link count. @@ -366,15 +369,25 @@ func (i *inode) decLinksLocked(ctx context.Context) { } func (i *inode) incRef() { - i.refs.IncRef() + if atomic.AddInt64(&i.refs, 1) <= 1 { + panic("tmpfs.inode.incRef() called without holding a reference") + } } func (i *inode) tryIncRef() bool { - return i.refs.TryIncRef() + for { + refs := atomic.LoadInt64(&i.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { + return true + } + } } func (i *inode) decRef(ctx context.Context) { - i.refs.DecRef(func() { + if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is @@ -382,7 +395,9 @@ func (i *inode) decRef(ctx context.Context) { // metadata. regFile.data.DropAll(regFile.memFile) } - }) + } else if refs < 0 { + panic("tmpfs.inode.decRef() called without holding a reference") + } } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { -- cgit v1.2.3 From 102735bfb45820dd840df14827b42744da77c9a0 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Tue, 4 Aug 2020 19:04:12 -0700 Subject: Inline gofer.regularFileFD.pwriteLocked Go compiler barely inlines anything, so inline by hand pwriteLocked since it's called from a single place. PiperOrigin-RevId: 324937734 --- pkg/sentry/fsimpl/gofer/regular_file.go | 102 ++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 46 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index 420e8efe2..db6bed4f6 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -184,6 +184,7 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off d.metadataMu.Lock() defer d.metadataMu.Unlock() + // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. @@ -194,70 +195,79 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off return 0, offset, err } src = src.TakeFirst64(limit) - n, err := fd.pwriteLocked(ctx, src, offset, opts) - return n, offset + n, err -} -// Preconditions: fd.dentry().metatdataMu must be locked. -func (fd *regularFileFD) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { - d := fd.dentry() if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } - if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { - // Write dirty cached pages that will be touched by the write back to - // the remote file. - if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err - } - // Remove touched pages from the cache. - pgstart := usermem.PageRoundDown(uint64(offset)) - pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) - if !ok { - return 0, syserror.EINVAL - } - mr := memmap.MappableRange{pgstart, pgend} - var freed []memmap.FileRange - d.dataMu.Lock() - cseg := d.cache.LowerBoundSegment(mr.Start) - for cseg.Ok() && cseg.Start() < mr.End { - cseg = d.cache.Isolate(cseg, mr) - freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) - cseg = d.cache.Remove(cseg).NextSegment() - } - d.dataMu.Unlock() - // Invalidate mappings of removed pages. - d.mapsMu.Lock() - d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) - d.mapsMu.Unlock() - // Finally free pages removed from the cache. - mf := d.fs.mfp.MemoryFile() - for _, freedFR := range freed { - mf.DecRef(freedFR) - } - } + rw := getDentryReadWriter(ctx, d, offset) + defer putDentryReadWriter(rw) + if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { + if err := fd.writeCache(ctx, d, offset, src); err != nil { + return 0, offset, err + } + // Require the write to go to the remote file. rw.direct = true } + n, err := src.CopyInTo(ctx, rw) - putDentryReadWriter(rw) - if n != 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { - // Write dirty cached pages touched by the write back to the remote - // file. + if err != nil { + return n, offset, err + } + if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { + // Write dirty cached pages touched by the write back to the remote file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return 0, err + return n, offset, err } // Request the remote filesystem to sync the remote file. - if err := d.handle.file.fsync(ctx); err != nil { - return 0, err + if err := d.handle.sync(ctx); err != nil { + return n, offset, err } } - return n, err + return n, offset + n, nil +} + +func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { + // Write dirty cached pages that will be touched by the write back to + // the remote file. + if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { + return err + } + + // Remove touched pages from the cache. + pgstart := usermem.PageRoundDown(uint64(offset)) + pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes())) + if !ok { + return syserror.EINVAL + } + mr := memmap.MappableRange{pgstart, pgend} + var freed []memmap.FileRange + + d.dataMu.Lock() + cseg := d.cache.LowerBoundSegment(mr.Start) + for cseg.Ok() && cseg.Start() < mr.End { + cseg = d.cache.Isolate(cseg, mr) + freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) + cseg = d.cache.Remove(cseg).NextSegment() + } + d.dataMu.Unlock() + + // Invalidate mappings of removed pages. + d.mapsMu.Lock() + d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) + d.mapsMu.Unlock() + + // Finally free pages removed from the cache. + mf := d.fs.mfp.MemoryFile() + for _, freedFR := range freed { + mf.DecRef(freedFR) + } + return nil } // Write implements vfs.FileDescriptionImpl.Write. -- cgit v1.2.3 From a2e129b540e530c56cb7648ae7e04325c6e763b1 Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 5 Aug 2020 08:33:52 -0700 Subject: Add missing case in tmpfs.inode.direntType. This was discovered by syzkaller. PiperOrigin-RevId: 325025193 --- pkg/sentry/fsimpl/tmpfs/tmpfs.go | 2 ++ 1 file changed, 2 insertions(+) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 68e615e8b..4681a2f52 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -558,6 +558,8 @@ func (i *inode) direntType() uint8 { return linux.DT_LNK case *socketFile: return linux.DT_SOCK + case *namedPipe: + return linux.DT_FIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: -- cgit v1.2.3 From 7ed4b2b5a6928b3a4a88d0117a764dd4795be61a Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 5 Aug 2020 18:15:01 -0700 Subject: Correctly decrement link counts in tmpfs rename operations. When a directory is replaced by a rename operation, its link count should reach zero. We were missing the link from `dir/.` PiperOrigin-RevId: 325141730 --- pkg/sentry/fsimpl/tmpfs/filesystem.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index fb77f95cc..065812065 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -566,7 +566,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if replaced != nil { newParentDir.removeChildLocked(replaced) if replaced.inode.isDir() { - newParentDir.inode.decLinksLocked(ctx) // from replaced's ".." + // Remove links for replaced/. and replaced/.. + replaced.inode.decLinksLocked(ctx) + newParentDir.inode.decLinksLocked(ctx) } replaced.inode.decLinksLocked(ctx) } -- cgit v1.2.3 From 35312a95c4c8626365b4ece5ffb0bcab44b4bede Mon Sep 17 00:00:00 2001 From: Nayana Bidari Date: Wed, 5 Aug 2020 20:45:02 -0700 Subject: Add loss recovery option for TCP. /proc/sys/net/ipv4/tcp_recovery is used to enable RACK loss recovery in TCP. PiperOrigin-RevId: 325157807 --- pkg/sentry/fs/proc/sys_net.go | 95 +++++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/proc/tasks_sys.go | 49 ++++++++++++++++++- pkg/sentry/inet/inet.go | 17 +++++++ pkg/sentry/inet/test_stack.go | 12 +++++ pkg/sentry/socket/hostinet/stack.go | 11 +++++ pkg/sentry/socket/netstack/stack.go | 14 ++++++ pkg/tcpip/transport/tcp/protocol.go | 33 +++++++++++++ test/syscalls/linux/proc_net.cc | 38 +++++++++++++++ 8 files changed, 268 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go index 702fdd392..8615b60f0 100644 --- a/pkg/sentry/fs/proc/sys_net.go +++ b/pkg/sentry/fs/proc/sys_net.go @@ -272,6 +272,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled) } +// +stateify savable +type tcpRecovery struct { + fsutil.SimpleFileInode + + stack inet.Stack `state:"wait"` + recovery inet.TCPLossRecovery +} + +func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ts := &tcpRecovery{ + SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC), + stack: s, + } + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ctx, ts, msrc, sattr) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{ + tcpRecovery: r, + stack: r.stack, + }), nil +} + +// +stateify savable +type tcpRecoveryFile struct { + fsutil.FileGenericSeek `state:"nosave"` + fsutil.FileNoIoctl `state:"nosave"` + fsutil.FileNoMMap `state:"nosave"` + fsutil.FileNoSplice `state:"nosave"` + fsutil.FileNoopRelease `state:"nosave"` + fsutil.FileNoopFlush `state:"nosave"` + fsutil.FileNoopFsync `state:"nosave"` + fsutil.FileNotDirReaddir `state:"nosave"` + fsutil.FileUseInodeUnstableAttr `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + tcpRecovery *tcpRecovery + + stack inet.Stack `state:"wait"` +} + +// Read implements fs.FileOperations.Read. +func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + recovery, err := f.stack.TCPRecovery() + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = recovery + s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery) + n, err := dst.CopyOut(ctx, []byte(s)) + return int64(n), err +} + +// Write implements fs.FileOperations.Write. +func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + f.tcpRecovery.recovery = inet.TCPLossRecovery(v) + if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil { + return 0, err + } + return n, nil +} + func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -351,6 +441,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem) } + // Add tcp_recovery. + if _, err := s.TCPRecovery(); err == nil { + contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s) + } + d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil) } diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go index 6dac2afa4..b71778128 100644 --- a/pkg/sentry/fsimpl/proc/tasks_sys.go +++ b/pkg/sentry/fsimpl/proc/tasks_sys.go @@ -55,7 +55,8 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]*kernfs.Dentry{ "ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{ - "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), + "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}), + "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}), // The following files are simple stubs until they are implemented in // netstack, most of these files are configuration related. We use the @@ -207,3 +208,49 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset *d.enabled = v != 0 return n, d.stack.SetTCPSACKEnabled(*d.enabled) } + +// tcpRecoveryData implements vfs.WritableDynamicBytesSource for +// /proc/sys/net/ipv4/tcp_recovery. +// +// +stateify savable +type tcpRecoveryData struct { + kernfs.DynamicBytesFile + + stack inet.Stack `state:"wait"` +} + +var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) + +// Generate implements vfs.DynamicBytesSource. +func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { + recovery, err := d.stack.TCPRecovery() + if err != nil { + return err + } + + buf.WriteString(fmt.Sprintf("%d\n", recovery)) + return nil +} + +func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + // No need to handle partial writes thus far. + return 0, syserror.EINVAL + } + if src.NumBytes() == 0 { + return 0, nil + } + + // Limit the amount of memory allocated. + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return 0, err + } + if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { + return 0, err + } + return n, nil +} diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go index 2916a0644..c0b4831d1 100644 --- a/pkg/sentry/inet/inet.go +++ b/pkg/sentry/inet/inet.go @@ -56,6 +56,12 @@ type Stack interface { // settings. SetTCPSACKEnabled(enabled bool) error + // TCPRecovery returns the TCP loss detection algorithm. + TCPRecovery() (TCPLossRecovery, error) + + // SetTCPRecovery attempts to change TCP loss detection algorithm. + SetTCPRecovery(recovery TCPLossRecovery) error + // Statistics reports stack statistics. Statistics(stat interface{}, arg string) error @@ -189,3 +195,14 @@ type StatSNMPUDP [8]uint64 // StatSNMPUDPLite describes UdpLite line of /proc/net/snmp. type StatSNMPUDPLite [8]uint64 + +// TCPLossRecovery indicates TCP loss detection and recovery methods to use. +type TCPLossRecovery int32 + +// Loss recovery constants from include/net/tcp.h which are used to set +// /proc/sys/net/ipv4/tcp_recovery. +const ( + TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota + TCP_RACK_STATIC_REO_WND + TCP_RACK_NO_DUPTHRESH +) diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go index d8961fc94..9771f01fc 100644 --- a/pkg/sentry/inet/test_stack.go +++ b/pkg/sentry/inet/test_stack.go @@ -25,6 +25,7 @@ type TestStack struct { TCPRecvBufSize TCPBufferSize TCPSendBufSize TCPBufferSize TCPSACKFlag bool + Recovery TCPLossRecovery } // NewTestStack returns a TestStack with no network interfaces. The value of @@ -91,6 +92,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error { return nil } +// TCPRecovery implements Stack.TCPRecovery. +func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) { + return s.Recovery, nil +} + +// SetTCPRecovery implements Stack.SetTCPRecovery. +func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error { + s.Recovery = recovery + return nil +} + // Statistics implements inet.Stack.Statistics. func (s *TestStack) Statistics(stat interface{}, arg string) error { return nil diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go index a48082631..fda3dcb35 100644 --- a/pkg/sentry/socket/hostinet/stack.go +++ b/pkg/sentry/socket/hostinet/stack.go @@ -53,6 +53,7 @@ type Stack struct { interfaceAddrs map[int32][]inet.InterfaceAddr routes []inet.Route supportsIPv6 bool + tcpRecovery inet.TCPLossRecovery tcpRecvBufSize inet.TCPBufferSize tcpSendBufSize inet.TCPBufferSize tcpSACKEnabled bool @@ -350,6 +351,16 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserror.EACCES } +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + return s.tcpRecovery, nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { + return syserror.EACCES +} + // getLine reads one line from proc file, with specified prefix. // The last argument, withHeader, specifies if it contains line header. func getLine(f *os.File, prefix string, withHeader bool) string { diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go index 67737ae87..f0fe18684 100644 --- a/pkg/sentry/socket/netstack/stack.go +++ b/pkg/sentry/socket/netstack/stack.go @@ -207,6 +207,20 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error { return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError() } +// TCPRecovery implements inet.Stack.TCPRecovery. +func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { + var recovery tcp.Recovery + if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { + return 0, syserr.TranslateNetstackError(err).ToError() + } + return inet.TCPLossRecovery(recovery), nil +} + +// SetTCPRecovery implements inet.Stack.SetTCPRecovery. +func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { + return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError() +} + // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { switch stats := stat.(type) { diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index b34e47bbd..d9abb8d94 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -80,6 +80,25 @@ const ( // enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018. type SACKEnabled bool +// Recovery is used by stack.(*Stack).TransportProtocolOption to +// set loss detection algorithm in TCP. +type Recovery int32 + +const ( + // RACKLossDetection indicates RACK is used for loss detection and + // recovery. + RACKLossDetection Recovery = 1 << iota + + // RACKStaticReoWnd indicates the reordering window should not be + // adjusted when DSACK is received. + RACKStaticReoWnd + + // RACKNoDupTh indicates RACK should not consider the classic three + // duplicate acknowledgements rule to mark the segments as lost. This + // is used when reordering is not detected. + RACKNoDupTh +) + // DelayEnabled is used by stack.(Stack*).TransportProtocolOption to // enable/disable Nagle's algorithm in TCP. type DelayEnabled bool @@ -161,6 +180,7 @@ func (s *synRcvdCounter) Threshold() uint64 { type protocol struct { mu sync.RWMutex sackEnabled bool + recovery Recovery delayEnabled bool sendBufferSize SendBufferSizeOption recvBufferSize ReceiveBufferSizeOption @@ -280,6 +300,12 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { p.mu.Unlock() return nil + case Recovery: + p.mu.Lock() + p.recovery = Recovery(v) + p.mu.Unlock() + return nil + case DelayEnabled: p.mu.Lock() p.delayEnabled = bool(v) @@ -394,6 +420,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil + case *Recovery: + p.mu.RLock() + *v = Recovery(p.recovery) + p.mu.RUnlock() + return nil + case *DelayEnabled: p.mu.RLock() *v = DelayEnabled(p.delayEnabled) @@ -535,6 +567,7 @@ func NewProtocol() stack.TransportProtocol { minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, + recovery: RACKLossDetection, } p.dispatcher.init(runtime.GOMAXPROCS(0)) return &p diff --git a/test/syscalls/linux/proc_net.cc b/test/syscalls/linux/proc_net.cc index 3377b65cf..4fab097f4 100644 --- a/test/syscalls/linux/proc_net.cc +++ b/test/syscalls/linux/proc_net.cc @@ -477,6 +477,44 @@ TEST(ProcNetSnmp, CheckSnmp) { EXPECT_EQ(value_count, 1); } +TEST(ProcSysNetIpv4Recovery, Exists) { + EXPECT_THAT(open("/proc/sys/net/ipv4/tcp_recovery", O_RDONLY), + SyscallSucceeds()); +} + +TEST(ProcSysNetIpv4Recovery, CanReadAndWrite) { + // TODO(b/162988252): Enable save/restore for this test after the bug is + // fixed. + DisableSave ds; + + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability((CAP_DAC_OVERRIDE)))); + + auto const fd = ASSERT_NO_ERRNO_AND_VALUE( + Open("/proc/sys/net/ipv4/tcp_recovery", O_RDWR)); + + char buf[10] = {'\0'}; + char to_write = '2'; + + // Check initial value is set to 1. + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(to_write) + 1)); + EXPECT_EQ(strcmp(buf, "1\n"), 0); + + // Set tcp_recovery to one of the allowed constants. + EXPECT_THAT(PwriteFd(fd.get(), &to_write, sizeof(to_write), 0), + SyscallSucceedsWithValue(sizeof(to_write))); + EXPECT_THAT(PreadFd(fd.get(), &buf, sizeof(buf), 0), + SyscallSucceedsWithValue(sizeof(to_write) + 1)); + EXPECT_EQ(strcmp(buf, "2\n"), 0); + + // Set tcp_recovery to any random value. + char kMessage[] = "100"; + EXPECT_THAT(PwriteFd(fd.get(), kMessage, strlen(kMessage), 0), + SyscallSucceedsWithValue(strlen(kMessage))); + EXPECT_THAT(PreadFd(fd.get(), buf, sizeof(kMessage), 0), + SyscallSucceedsWithValue(sizeof(kMessage))); + EXPECT_EQ(strcmp(buf, "100\n"), 0); +} } // namespace } // namespace testing } // namespace gvisor -- cgit v1.2.3 From f20e63e31b56784c596897e86f03441f9d05f567 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Thu, 6 Aug 2020 18:13:55 -0700 Subject: Add LinkAt support to gofer Updates #1198 PiperOrigin-RevId: 325350818 --- images/basic/linktest/Dockerfile | 7 +++ images/basic/linktest/link_test.c | 93 +++++++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/gofer/filesystem.go | 26 +++++++++- pkg/sentry/fsimpl/gofer/gofer.go | 2 - scripts/docker_tests.sh | 2 +- test/e2e/integration_test.go | 18 +++++++ 6 files changed, 143 insertions(+), 5 deletions(-) create mode 100644 images/basic/linktest/Dockerfile create mode 100644 images/basic/linktest/link_test.c (limited to 'pkg/sentry/fsimpl') diff --git a/images/basic/linktest/Dockerfile b/images/basic/linktest/Dockerfile new file mode 100644 index 000000000..baebc9b76 --- /dev/null +++ b/images/basic/linktest/Dockerfile @@ -0,0 +1,7 @@ +FROM ubuntu:bionic + +WORKDIR /root +COPY . . + +RUN apt-get update && apt-get install -y gcc +RUN gcc -O2 -o link_test link_test.c diff --git a/images/basic/linktest/link_test.c b/images/basic/linktest/link_test.c new file mode 100644 index 000000000..45ab00abe --- /dev/null +++ b/images/basic/linktest/link_test.c @@ -0,0 +1,93 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +// Basic test for linkat(2). Syscall tests requires CAP_DAC_READ_SEARCH and it +// cannot use tricks like userns as root. For this reason, run a basic link test +// to ensure some coverage. +int main(int argc, char** argv) { + const char kOldPath[] = "old.txt"; + int fd = open(kOldPath, O_RDWR | O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + errx(1, "open(%s) failed", kOldPath); + } + const char kData[] = "some random content"; + if (write(fd, kData, sizeof(kData)) < 0) { + err(1, "write failed"); + } + close(fd); + + struct stat old_stat; + if (stat(kOldPath, &old_stat)) { + errx(1, "stat(%s) failed", kOldPath); + } + + const char kNewPath[] = "new.txt"; + if (link(kOldPath, kNewPath)) { + errx(1, "link(%s, %s) failed", kOldPath, kNewPath); + } + + struct stat new_stat; + if (stat(kNewPath, &new_stat)) { + errx(1, "stat(%s) failed", kNewPath); + } + + // Check that files are the same. + if (old_stat.st_dev != new_stat.st_dev) { + errx(1, "files st_dev is different, want: %lu, got: %lu", old_stat.st_dev, + new_stat.st_dev); + } + if (old_stat.st_ino != new_stat.st_ino) { + errx(1, "files st_ino is different, want: %lu, got: %lu", old_stat.st_ino, + new_stat.st_ino); + } + + // Check that link count is correct. + if (new_stat.st_nlink != old_stat.st_nlink + 1) { + errx(1, "wrong nlink, want: %lu, got: %lu", old_stat.st_nlink + 1, + new_stat.st_nlink); + } + + // Check taht contents are the same. + fd = open(kNewPath, O_RDONLY); + if (fd < 0) { + errx(1, "open(%s) failed", kNewPath); + } + char buf[sizeof(kData)] = {}; + if (read(fd, buf, sizeof(buf)) < 0) { + err(1, "read failed"); + } + close(fd); + + if (strcmp(buf, kData) != 0) { + errx(1, "file content mismatch: %s", buf); + } + + // Cleanup. + if (unlink(kNewPath)) { + errx(1, "unlink(%s) failed", kNewPath); + } + if (unlink(kOldPath)) { + errx(1, "unlink(%s) failed", kOldPath); + } + + // Success! + return 0; +} diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index e6af37d0d..582b744bb 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -15,6 +15,7 @@ package gofer import ( + "math" "sync" "sync/atomic" @@ -724,8 +725,29 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. if rp.Mount() != vd.Mount() { return syserror.EXDEV } - // 9P2000.L supports hard links, but we don't. - return syserror.EPERM + d := vd.Dentry().Impl().(*dentry) + if d.isDir() { + return syserror.EPERM + } + gid := auth.KGID(atomic.LoadUint32(&d.gid)) + uid := auth.KUID(atomic.LoadUint32(&d.uid)) + mode := linux.FileMode(atomic.LoadUint32(&d.mode)) + if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil { + return err + } + if d.nlink == 0 { + return syserror.ENOENT + } + if d.nlink == math.MaxUint32 { + return syserror.EMLINK + } + if err := parent.file.link(ctx, d.file, childName); err != nil { + return err + } + + // Success! + atomic.AddUint32(&d.nlink, 1) + return nil }, nil) } diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 2e5575d8d..6ae796c6d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -558,8 +558,6 @@ type dentry struct { // filesystem.renameMu. name string - // We don't support hard links, so each dentry maps 1:1 to an inode. - // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh index 07e9f3109..be0b0a3ec 100755 --- a/scripts/docker_tests.sh +++ b/scripts/docker_tests.sh @@ -23,5 +23,5 @@ test_runsc //test/image:image_test //test/e2e:integration_test install_runsc_for_test docker --vfs2 IMAGE_FILTER="Hello|Httpd|Ruby|Stdio" -INTEGRATION_FILTER="LifeCycle|Pause|Connect|JobControl|Overlay|Exec|DirCreation/root" +INTEGRATION_FILTER="LifeCycle|Pause|Connect|JobControl|Overlay|Exec|DirCreation/root|Link" test_runsc //test/e2e:integration_test //test/image:image_test --test_filter="${IMAGE_FILTER}|${INTEGRATION_FILTER}" diff --git a/test/e2e/integration_test.go b/test/e2e/integration_test.go index 6fe6d304f..71ec4791e 100644 --- a/test/e2e/integration_test.go +++ b/test/e2e/integration_test.go @@ -467,6 +467,24 @@ func TestHostOverlayfsRewindDir(t *testing.T) { } } +// Basic test for linkat(2). Syscall tests requires CAP_DAC_READ_SEARCH and it +// cannot use tricks like userns as root. For this reason, run a basic link test +// to ensure some coverage. +func TestLink(t *testing.T) { + ctx := context.Background() + d := dockerutil.MakeContainer(ctx, t) + defer d.CleanUp(ctx) + + if got, err := d.Run(ctx, dockerutil.RunOpts{ + Image: "basic/linktest", + WorkDir: "/root", + }, "./link_test"); err != nil { + t.Fatalf("docker run failed: %v", err) + } else if got != "" { + t.Errorf("test failed:\n%s", got) + } +} + func TestMain(m *testing.M) { dockerutil.EnsureSupportedDockerVersion() flag.Parse() -- cgit v1.2.3 From 4fa1c304a133297bc6895729d74aa35a015e759e Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 7 Aug 2020 00:10:35 -0700 Subject: Try to update atime and mtime on VFS2 gofer files on dentry eviction. PiperOrigin-RevId: 325388385 --- pkg/sentry/fsimpl/gofer/gofer.go | 150 +++++++++++++++++++++++---------------- pkg/sentry/fsimpl/gofer/time.go | 3 + 2 files changed, 91 insertions(+), 62 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 6ae796c6d..59323086d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -192,10 +192,14 @@ const ( // // - File timestamps are based on client clocks. This ensures that users of // the client observe timestamps that are coherent with their own clocks - // and consistent with Linux's semantics. However, since it is not always - // possible for clients to set arbitrary atimes and mtimes, and never - // possible for clients to set arbitrary ctimes, file timestamp changes are - // stored in the client only and never sent to the remote filesystem. + // and consistent with Linux's semantics (in particular, it is not always + // possible for clients to set arbitrary atimes and mtimes depending on the + // remote filesystem implementation, and never possible for clients to set + // arbitrary ctimes.) If a dentry containing a client-defined atime or + // mtime is evicted from cache, client timestamps will be sent to the + // remote filesystem on a best-effort basis to attempt to ensure that + // timestamps will be preserved when another dentry representing the same + // file is instantiated. InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of @@ -621,6 +625,12 @@ type dentry struct { // File size, protected by both metadataMu and dataMu (i.e. both must be // locked to mutate it; locking either is sufficient to access it). size uint64 + // If this dentry does not represent a synthetic file, deleted is 0, and + // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the + // remote file's timestamps, which should be updated when this dentry is + // evicted. + atimeDirty uint32 + mtimeDirty uint32 // nlink counts the number of hard links to this dentry. It's updated and // accessed using atomic operations. It's not protected by metadataMu like the @@ -801,10 +811,12 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { if attr.BlockSize != 0 { atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize)) } - if mask.ATime { + // Don't override newer client-defined timestamps with old server-defined + // ones. + if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 { atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)) } - if mask.MTime { + if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 { atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)) } if mask.CTime { @@ -901,51 +913,44 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return err } defer mnt.EndWrite() - setLocalAtime := false - setLocalMtime := false + + if stat.Mask&linux.STATX_SIZE != 0 { + // Reject attempts to truncate files other than regular files, since + // filesystem implementations may return the wrong errno. + switch mode.FileType() { + case linux.S_IFREG: + // ok + case linux.S_IFDIR: + return syserror.EISDIR + default: + return syserror.EINVAL + } + } + + var now int64 if d.cachedMetadataAuthoritative() { - // Timestamp updates will be handled locally. - setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 - setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 - stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME - - // Prepare for truncate. - if stat.Mask&linux.STATX_SIZE != 0 { - switch mode.FileType() { - case linux.ModeRegular: - if !setLocalMtime { - // Truncate updates mtime. - setLocalMtime = true - stat.Mtime.Nsec = linux.UTIME_NOW - } - case linux.ModeDirectory: - return syserror.EISDIR - default: - return syserror.EINVAL + // Truncate updates mtime. + if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { + stat.Mask |= linux.STATX_MTIME + stat.Mtime = linux.StatxTimestamp{ + Nsec: linux.UTIME_NOW, } } + + // Use client clocks for timestamps. + now = d.fs.clock.Now().Nanoseconds() + if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { + stat.Atime = statxTimestampFromDentry(now) + } + if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { + stat.Mtime = statxTimestampFromDentry(now) + } } + d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask&linux.STATX_SIZE != 0 { - // The size needs to be changed even when - // !d.cachedMetadataAuthoritative() because d.mappings has to be - // updated. - d.updateFileSizeLocked(stat.Size) - } if !d.isSynthetic() { if stat.Mask != 0 { - if stat.Mask&linux.STATX_SIZE != 0 { - // Check whether to allow a truncate request to be made. - switch d.mode & linux.S_IFMT { - case linux.S_IFREG: - // Allow. - case linux.S_IFDIR: - return syserror.EISDIR - default: - return syserror.EINVAL - } - } if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, @@ -967,6 +972,11 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs }); err != nil { return err } + if stat.Mask&linux.STATX_SIZE != 0 { + // Privatized copy-on-write mappings of truncated pages need to + // be invalidated even if InteropModeShared is in effect. + d.updateFileSizeLocked(stat.Size) + } } if d.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since @@ -976,7 +986,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return nil } } - now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } @@ -986,23 +995,18 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs if stat.Mask&linux.STATX_GID != 0 { atomic.StoreUint32(&d.gid, stat.GID) } - if setLocalAtime { - if stat.Atime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.atime, now) - } else { - atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) - } - // Restore mask bits that we cleared earlier. - stat.Mask |= linux.STATX_ATIME - } - if setLocalMtime { - if stat.Mtime.Nsec == linux.UTIME_NOW { - atomic.StoreInt64(&d.mtime, now) - } else { - atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) - } - // Restore mask bits that we cleared earlier. - stat.Mask |= linux.STATX_MTIME + // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because + // if d.cachedMetadataAuthoritative() then we converted stat.Atime and + // stat.Mtime to client-local timestamps above, and if + // !d.cachedMetadataAuthoritative() then we returned after calling + // d.file.setAttr(). For the same reason, now must have been initialized. + if stat.Mask&linux.STATX_ATIME != 0 { + atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime)) + atomic.StoreUint32(&d.atimeDirty, 0) + } + if stat.Mask&linux.STATX_MTIME != 0 { + atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime)) + atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) return nil @@ -1248,7 +1252,7 @@ func (d *dentry) destroyLocked(ctx context.Context) { // Write dirty pages back to the remote filesystem. if d.handleWritable { if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err) + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } } // Discard cached data. @@ -1261,6 +1265,28 @@ func (d *dentry) destroyLocked(ctx context.Context) { d.handleMu.Unlock() if !d.file.isNil() { + if !d.isDeleted() { + // Write dirty timestamps back to the remote filesystem. + atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0 + mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0 + if atimeDirty || mtimeDirty { + atime := atomic.LoadInt64(&d.atime) + mtime := atomic.LoadInt64(&d.mtime) + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + ATime: atimeDirty, + ATimeNotSystemTime: atimeDirty, + MTime: mtimeDirty, + MTimeNotSystemTime: mtimeDirty, + }, p9.SetAttr{ + ATimeSeconds: uint64(atime / 1e9), + ATimeNanoSeconds: uint64(atime % 1e9), + MTimeSeconds: uint64(mtime / 1e9), + MTimeNanoSeconds: uint64(mtime % 1e9), + }); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err) + } + } + } d.file.close(ctx) d.file = p9file{} // Remove d from the set of syncable dentries. diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go index 0eef4e16e..2cb8191b9 100644 --- a/pkg/sentry/fsimpl/gofer/time.go +++ b/pkg/sentry/fsimpl/gofer/time.go @@ -47,6 +47,7 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) { now := d.fs.clock.Now().Nanoseconds() d.metadataMu.Lock() atomic.StoreInt64(&d.atime, now) + atomic.StoreUint32(&d.atimeDirty, 1) d.metadataMu.Unlock() mnt.EndWrite() } @@ -67,6 +68,7 @@ func (d *dentry) touchCMtime() { d.metadataMu.Lock() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) d.metadataMu.Unlock() } @@ -76,4 +78,5 @@ func (d *dentry) touchCMtimeLocked() { now := d.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&d.mtime, now) atomic.StoreInt64(&d.ctime, now) + atomic.StoreUint32(&d.mtimeDirty, 1) } -- cgit v1.2.3 From 93cb66825bf098f8a19b3d7f34b33272ceed8cb3 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 7 Aug 2020 12:54:14 -0700 Subject: Support separate read/write handles in fsimpl/gofer.dentry. PiperOrigin-RevId: 325490674 --- pkg/sentry/fsimpl/gofer/BUILD | 1 + pkg/sentry/fsimpl/gofer/directory.go | 12 +- pkg/sentry/fsimpl/gofer/filesystem.go | 18 +- pkg/sentry/fsimpl/gofer/gofer.go | 308 +++++++++++++++++++++----------- pkg/sentry/fsimpl/gofer/gofer_test.go | 4 +- pkg/sentry/fsimpl/gofer/handle.go | 19 +- pkg/sentry/fsimpl/gofer/regular_file.go | 118 ++++++------ pkg/sentry/fsimpl/gofer/special_file.go | 11 +- pkg/sentry/vfs/file_description.go | 5 + test/syscalls/BUILD | 1 + 10 files changed, 312 insertions(+), 185 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD index 4a800dcf9..16787116f 100644 --- a/pkg/sentry/fsimpl/gofer/BUILD +++ b/pkg/sentry/fsimpl/gofer/BUILD @@ -85,5 +85,6 @@ go_test( deps = [ "//pkg/p9", "//pkg/sentry/contexttest", + "//pkg/sentry/pgalloc", ], ) diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index 1679066ba..2a8011eb4 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -90,10 +90,8 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: usermem.PageSize, // arbitrary - handle: handle{ - fd: -1, - }, - nlink: uint32(2), + hostFD: -1, + nlink: uint32(2), } switch opts.mode.FileType() { case linux.S_IFDIR: @@ -205,14 +203,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() - if !d.handleReadable { + if d.readFile.isNil() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } for { - p9ds, err := d.handle.file.readdir(ctx, off, count) + p9ds, err := d.readFile.readdir(ctx, off, count) if err != nil { d.handleMu.RUnlock() return nil, err @@ -304,5 +302,5 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *directoryFD) Sync(ctx context.Context) error { - return fd.dentry().handle.sync(ctx) + return fd.dentry().syncRemoteFile(ctx) } diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 582b744bb..eaef2594d 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -55,7 +55,7 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - err := d.syncSharedHandle(ctx) + err := d.syncCachedFile(ctx) d.DecRef(ctx) if err != nil && retErr == nil { retErr = err @@ -1107,12 +1107,18 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { child.handleMu.Lock() - child.handle.file = openFile - if fdobj != nil { - child.handle.fd = int32(fdobj.Release()) + if vfs.MayReadFileWithOpenFlags(opts.Flags) { + child.readFile = openFile + if fdobj != nil { + child.hostFD = int32(fdobj.Release()) + } + } else if fdobj != nil { + // Can't use fdobj if it's not readable. + fdobj.Close() + } + if vfs.MayWriteFileWithOpenFlags(opts.Flags) { + child.writeFile = openFile } - child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags) - child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags) child.handleMu.Unlock() } // Insert the dentry into the tree. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 59323086d..f1d3bf911 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -506,9 +506,9 @@ func (fs *filesystem) Release(ctx context.Context) { for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() - if d.handleWritable { + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty cached data to the remote file. - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? @@ -518,9 +518,9 @@ func (fs *filesystem) Release(ctx context.Context) { d.dirty.RemoveAll() d.dataMu.Unlock() // Close the host fd if one exists. - if d.handle.fd >= 0 { - syscall.Close(int(d.handle.fd)) - d.handle.fd = -1 + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() } @@ -622,7 +622,12 @@ type dentry struct { mtime int64 ctime int64 btime int64 - // File size, protected by both metadataMu and dataMu (i.e. both must be + // File size, which differs from other metadata in two ways: + // + // - We make a best-effort attempt to keep it up to date even if + // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. + // + // - size is protected by both metadataMu and dataMu (i.e. both must be // locked to mutate it; locking either is sufficient to access it). size uint64 // If this dentry does not represent a synthetic file, deleted is 0, and @@ -643,30 +648,28 @@ type dentry struct { // the file into memmap.MappingSpaces. mappings is protected by mapsMu. mappings memmap.MappingSet - // If this dentry represents a regular file or directory: - // - // - handle is the I/O handle used by all regularFileFDs/directoryFDs - // representing this dentry. - // - // - handleReadable is true if handle is readable. - // - // - handleWritable is true if handle is writable. - // - // Invariants: + // - If this dentry represents a regular file or directory, readFile is the + // p9.File used for reads by all regularFileFDs/directoryFDs representing + // this dentry. // - // - If handleReadable == handleWritable == false, then handle.file == nil - // (i.e. there is no open handle). Conversely, if handleReadable || - // handleWritable == true, then handle.file != nil (i.e. there is an open - // handle). + // - If this dentry represents a regular file, writeFile is the p9.File + // used for writes by all regularFileFDs representing this dentry. // - // - handleReadable and handleWritable cannot transition from true to false - // (i.e. handles may not be downgraded). + // - If this dentry represents a regular file, hostFD is the host FD used + // for memory mappings and I/O (when applicable) in preference to readFile + // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must + // also be writable. If hostFD is -1, no such host FD is available. // // These fields are protected by handleMu. - handleMu sync.RWMutex - handle handle - handleReadable bool - handleWritable bool + // + // readFile and writeFile may or may not represent the same p9.File. Once + // either p9.File transitions from closed (isNil() == true) to open + // (isNil() == false), it may be mutated with handleMu locked, but cannot + // be closed until the dentry is destroyed. + handleMu sync.RWMutex + readFile p9file + writeFile p9file + hostFD int32 dataMu sync.RWMutex @@ -680,7 +683,7 @@ type dentry struct { // tracks dirty segments in cache. dirty is protected by dataMu. dirty fsutil.DirtySet - // pf implements platform.File for mappings of handle.fd. + // pf implements platform.File for mappings of hostFD. pf dentryPlatformFile // If this dentry represents a symbolic link, InteropModeShared is not in @@ -742,9 +745,7 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma uid: uint32(fs.opts.dfltuid), gid: uint32(fs.opts.dfltgid), blockSize: usermem.PageSize, - handle: handle{ - fd: -1, - }, + hostFD: -1, } d.pf.dentry = d if mask.UID { @@ -835,9 +836,13 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) { // Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { - // Use d.handle.file, which represents a 9P fid that has been opened, in - // preference to d.file, which represents a 9P fid that has not. This may - // be significantly more efficient in some implementations. + // Use d.readFile or d.writeFile, which represent 9P fids that have been + // opened, in preference to d.file, which represents a 9P fid that has not. + // This may be significantly more efficient in some implementations. Prefer + // d.writeFile over d.readFile since some filesystem implementations may + // update a writable handle's metadata after writes to that handle, without + // making metadata updates immediately visible to read-only handles + // representing the same file. var ( file p9file handleMuRLocked bool @@ -847,8 +852,11 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error { d.metadataMu.Lock() defer d.metadataMu.Unlock() d.handleMu.RLock() - if !d.handle.file.isNil() { - file = d.handle.file + if !d.writeFile.isNil() { + file = d.writeFile + handleMuRLocked = true + } else if !d.readFile.isNil() { + file = d.readFile handleMuRLocked = true } else { file = d.file @@ -973,8 +981,9 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs return err } if stat.Mask&linux.STATX_SIZE != 0 { - // Privatized copy-on-write mappings of truncated pages need to - // be invalidated even if InteropModeShared is in effect. + // d.size should be kept up to date, and privatized + // copy-on-write mappings of truncated pages need to be + // invalidated, even if InteropModeShared is in effect. d.updateFileSizeLocked(stat.Size) } } @@ -1245,22 +1254,31 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("dentry.destroyLocked() called with references on the dentry") } + mf := d.fs.mfp.MemoryFile() d.handleMu.Lock() - if !d.handle.file.isNil() { - mf := d.fs.mfp.MemoryFile() - d.dataMu.Lock() + d.dataMu.Lock() + if h := d.writeHandleLocked(); h.isOpen() { // Write dirty pages back to the remote filesystem. - if d.handleWritable { - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { - log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) - } + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { + log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } - // Discard cached data. - d.cache.DropAll(mf) - d.dirty.RemoveAll() - d.dataMu.Unlock() - // Clunk open fids and close open host FDs. - d.handle.close(ctx) + } + // Discard cached data. + d.cache.DropAll(mf) + d.dirty.RemoveAll() + d.dataMu.Unlock() + // Clunk open fids and close open host FDs. + if !d.readFile.isNil() { + d.readFile.close(ctx) + } + if !d.writeFile.isNil() && d.readFile != d.writeFile { + d.writeFile.close(ctx) + } + d.readFile = p9file{} + d.writeFile = p9file{} + if d.hostFD >= 0 { + syscall.Close(int(d.hostFD)) + d.hostFD = -1 } d.handleMu.Unlock() @@ -1393,80 +1411,120 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool // O_TRUNC). if !trunc { d.handleMu.RLock() - if (!read || d.handleReadable) && (!write || d.handleWritable) { - // The current handle is sufficient. + if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { + // Current handles are sufficient. d.handleMu.RUnlock() return nil } d.handleMu.RUnlock() } - haveOldFD := false + fdToClose := int32(-1) + invalidateTranslations := false d.handleMu.Lock() - if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc { - // Get a new handle. - wantReadable := d.handleReadable || read - wantWritable := d.handleWritable || write - h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc) + if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { + // Get a new handle. If this file has been opened for both reading and + // writing, try to get a single handle that is usable for both: + // + // - Writable memory mappings of a host FD require that the host FD is + // opened for both reading and writing. + // + // - NOTE(b/141991141): Some filesystems may not ensure coherence + // between multiple handles for the same file. + openReadable := !d.readFile.isNil() || read + openWritable := !d.writeFile.isNil() || write + h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) + if err == syserror.EACCES && (openReadable != read || openWritable != write) { + // It may not be possible to use a single handle for both + // reading and writing, since permissions on the file may have + // changed to e.g. disallow reading after previously being + // opened for reading. In this case, we have no choice but to + // use separate handles for reading and writing. + ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) + openReadable = read + openWritable = write + h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) + } if err != nil { d.handleMu.Unlock() return err } - if !d.handle.file.isNil() { - // Check that old and new handles are compatible: If the old handle - // includes a host file descriptor but the new one does not, or - // vice versa, old and new memory mappings may be incoherent. - haveOldFD = d.handle.fd >= 0 - haveNewFD := h.fd >= 0 - if haveOldFD != haveNewFD { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD) - h.close(ctx) - return syserror.EIO - } - if haveOldFD { - // We may have raced with callers of d.pf.FD() that are now - // using the old file descriptor, preventing us from safely - // closing it. We could handle this by invalidating existing - // memmap.Translations, but this is expensive. Instead, use - // dup3 to make the old file descriptor refer to the new file - // description, then close the new file descriptor (which is no - // longer needed). Racing callers may use the old or new file - // description, but this doesn't matter since they refer to the - // same file (unless d.fs.opts.overlayfsStaleRead is true, - // which we handle separately). - if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil { + + if d.hostFD < 0 && openReadable && h.fd >= 0 { + // We have no existing FD; use the new FD for at least reading. + d.hostFD = h.fd + } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable { + // We have an existing read-only FD, but the file has just been + // opened for writing, so we need to start supporting writable memory + // mappings. This may race with callers of d.pf.FD() using the existing + // FD, so in most cases we need to delay closing the old FD until after + // invalidating memmap.Translations that might have observed it. + if !openReadable || h.fd < 0 { + // We don't have a read/write FD, so we have no FD that can be + // used to create writable memory mappings. Switch to using the + // internal page cache. + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = -1 + } else if d.fs.opts.overlayfsStaleRead { + // We do have a read/write FD, but it may not be coherent with + // the existing read-only FD, so we must switch to mappings of + // the new FD in both the application and sentry. + if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err) + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } - syscall.Close(int(h.fd)) - h.fd = d.handle.fd - if d.fs.opts.overlayfsStaleRead { - // Replace sentry mappings of the old FD with mappings of - // the new FD, since the two are not necessarily coherent. - if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { - d.handleMu.Unlock() - ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) - h.close(ctx) - return err - } + invalidateTranslations = true + fdToClose = d.hostFD + d.hostFD = h.fd + } else { + // We do have a read/write FD. To avoid invalidating existing + // memmap.Translations (which is expensive), use dup3 to make + // the old file descriptor refer to the new file description, + // then close the new file descriptor (which is no longer + // needed). Racing callers of d.pf.FD() may use the old or new + // file description, but this doesn't matter since they refer + // to the same file, and any racing mappings must be read-only. + if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil { + oldHostFD := d.hostFD + d.handleMu.Unlock() + ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err) + h.close(ctx) + return err } - // Clunk the old fid before making the new handle visible (by - // unlocking d.handleMu). - d.handle.file.close(ctx) + fdToClose = h.fd } + } else { + // h.fd is not useful. + fdToClose = h.fd + } + + // Switch to new fids. + var oldReadFile p9file + if openReadable { + oldReadFile = d.readFile + d.readFile = h.file + } + var oldWriteFile p9file + if openWritable { + oldWriteFile = d.writeFile + d.writeFile = h.file + } + // NOTE(b/141991141): Clunk old fids before making new fids visible (by + // unlocking d.handleMu). + if !oldReadFile.isNil() { + oldReadFile.close(ctx) + } + if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { + oldWriteFile.close(ctx) } - // Switch to the new handle. - d.handle = h - d.handleReadable = wantReadable - d.handleWritable = wantWritable } d.handleMu.Unlock() - if d.fs.opts.overlayfsStaleRead && haveOldFD { - // Invalidate application mappings that may be using the old FD; they + if invalidateTranslations { + // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls // to d.Translate(). This requires holding d.mapsMu, which precedes // d.handleMu in the lock order. @@ -1474,7 +1532,51 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool d.mappings.InvalidateAll(memmap.InvalidateOpts{}) d.mapsMu.Unlock() } + if fdToClose >= 0 { + syscall.Close(int(fdToClose)) + } + + return nil +} +// Preconditions: d.handleMu must be locked. +func (d *dentry) readHandleLocked() handle { + return handle{ + file: d.readFile, + fd: d.hostFD, + } +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) writeHandleLocked() handle { + return handle{ + file: d.writeFile, + fd: d.hostFD, + } +} + +func (d *dentry) syncRemoteFile(ctx context.Context) error { + d.handleMu.RLock() + defer d.handleMu.RUnlock() + return d.syncRemoteFileLocked(ctx) +} + +// Preconditions: d.handleMu must be locked. +func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if d.hostFD >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(d.hostFD)) + ctx.UninterruptibleSleepFinish(false) + return err + } + if !d.writeFile.isNil() { + return d.writeFile.fsync(ctx) + } + if !d.readFile.isNil() { + return d.readFile.fsync(ctx) + } return nil } diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 56d80bcf8..36cca3625 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -20,10 +20,13 @@ import ( "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/sentry/pgalloc" ) func TestDestroyIdempotent(t *testing.T) { + ctx := contexttest.Context(t) fs := filesystem{ + mfp: pgalloc.MemoryFileProviderFromContext(ctx), syncableDentries: make(map[*dentry]struct{}), opts: filesystemOptions{ // Test relies on no dentry being held in the cache. @@ -31,7 +34,6 @@ func TestDestroyIdempotent(t *testing.T) { }, } - ctx := contexttest.Context(t) attr := &p9.Attr{ Mode: p9.ModeRegular, } diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go index 8792ca4f2..104157512 100644 --- a/pkg/sentry/fsimpl/gofer/handle.go +++ b/pkg/sentry/fsimpl/gofer/handle.go @@ -63,6 +63,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand }, nil } +func (h *handle) isOpen() bool { + return !h.file.isNil() +} + func (h *handle) close(ctx context.Context) { h.file.close(ctx) h.file = p9file{} @@ -124,18 +128,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o } return cp, cperr } - -func (h *handle) sync(ctx context.Context) error { - // Handle most common case first. - if h.fd >= 0 { - ctx.UninterruptibleSleepStart(false) - err := syscall.Fsync(int(h.fd)) - ctx.UninterruptibleSleepFinish(false) - return err - } - if h.file.isNil() { - // File hasn't been touched, there is nothing to sync. - return nil - } - return h.file.fsync(ctx) -} diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go index db6bed4f6..7e1cbf065 100644 --- a/pkg/sentry/fsimpl/gofer/regular_file.go +++ b/pkg/sentry/fsimpl/gofer/regular_file.go @@ -64,34 +64,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() - return d.handle.file.flush(ctx) + if d.writeFile.isNil() { + return nil + } + return d.writeFile.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { - d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() - size := offset + length - // Allocating a smaller size is a noop. - if size <= d.size { + size := offset + length + if d.cachedMetadataAuthoritative() && size <= d.size { return nil } - d.handleMu.Lock() - defer d.handleMu.Unlock() - - err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + d.handleMu.RLock() + err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) + d.handleMu.RUnlock() if err != nil { return err } d.dataMu.Lock() atomic.StoreUint64(&d.size, size) d.dataMu.Unlock() - if !d.cachedMetadataAuthoritative() { + if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } return nil @@ -113,7 +113,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). d := fd.dentry() - if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) { + if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } @@ -217,16 +217,23 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off n, err := src.CopyInTo(ctx, rw) if err != nil { - return n, offset, err + return n, offset + n, err } if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { - // Write dirty cached pages touched by the write back to the remote file. + // Note that if any of the following fail, then we can't guarantee that + // any data was actually written with the semantics of O_DSYNC or + // O_SYNC, so we return zero bytes written. Compare Linux's + // mm/filemap.c:generic_file_write_iter() => + // include/linux/fs.h:generic_write_sync(). + // + // Write dirty cached pages touched by the write back to the remote + // file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { - return n, offset, err + return 0, offset, err } // Request the remote filesystem to sync the remote file. - if err := d.handle.sync(ctx); err != nil { - return n, offset, err + if err := d.syncRemoteFile(ctx); err != nil { + return 0, offset, err } } return n, offset + n, nil @@ -317,10 +324,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from - // dentry.handle without locking dentry.dataMu. + // dentry.readHandleLocked() without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off) + h := rw.d.readHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err @@ -388,7 +396,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) End: gapEnd, } optMR := gap.Range() - err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt) + err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { @@ -403,7 +411,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) } else { // Read directly from the file. gapDsts := dsts.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) + n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) @@ -435,11 +443,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was - // opened with O_DIRECT, write directly to dentry.handle without locking - // dentry.dataMu. + // opened with O_DIRECT, write directly to dentry.writeHandleLocked() + // without locking dentry.dataMu. rw.d.handleMu.RLock() - if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off) + h := rw.d.writeHandleLocked() + if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { + n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size { @@ -501,7 +510,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) - n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) + n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) @@ -527,7 +536,7 @@ exitLoop: if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, - }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil { + }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. rw.off = start done = 0 @@ -545,6 +554,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { } d.handleMu.RLock() defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). @@ -558,7 +568,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error { return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), - }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. @@ -615,24 +625,23 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6 // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) + return fd.dentry().syncCachedFile(ctx) } -func (d *dentry) syncSharedHandle(ctx context.Context) error { +func (d *dentry) syncCachedFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() - if d.handleWritable { + if h := d.writeHandleLocked(); h.isOpen() { d.dataMu.Lock() // Write dirty cached data to the remote file. - err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt) + err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) d.dataMu.Unlock() if err != nil { return err } } - // Sync the remote file. - return d.handle.sync(ctx) + return d.syncRemoteFileLocked(ctx) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. @@ -656,7 +665,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt return syserror.ENODEV } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() if !haveFD { return syserror.ENODEV @@ -677,7 +686,7 @@ func (d *dentry) mayCachePages() bool { return true } d.handleMu.RLock() - haveFD := d.handle.fd >= 0 + haveFD := d.hostFD >= 0 d.handleMu.RUnlock() return haveFD } @@ -735,7 +744,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() - if d.handle.fd >= 0 && !d.fs.opts.forcePageCache { + if d.hostFD >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { @@ -771,7 +780,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab } mf := d.fs.mfp.MemoryFile() - cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt) + h := d.readHandleLocked() + cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 @@ -840,9 +850,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Write the cache's contents back to the remote file so that if we have a // host fd after restore, the remote file's contents are coherent. mf := d.fs.mfp.MemoryFile() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { return err } @@ -857,20 +870,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error { // Evict implements pgalloc.EvictableMemoryUser.Evict. func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { + mr := memmap.MappableRange{er.Start, er.End} + mf := d.fs.mfp.MemoryFile() d.mapsMu.Lock() defer d.mapsMu.Unlock() + d.handleMu.RLock() + defer d.handleMu.RUnlock() + h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() - mr := memmap.MappableRange{er.Start, er.End} - mf := d.fs.mfp.MemoryFile() // Only allow pages that are no longer memory-mapped to be evicted. for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() { mgapMR := mgap.Range().Intersect(mr) if mgapMR.Length() == 0 { continue } - if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil { + if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err) } d.cache.Drop(mgapMR, mf) @@ -882,8 +898,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) { // cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef. // // dentryPlatformFile is only used when a host FD representing the remote file -// is available (i.e. dentry.handle.fd >= 0), and that FD is used for -// application memory mappings (i.e. !filesystem.opts.forcePageCache). +// is available (i.e. dentry.hostFD >= 0), and that FD is used for application +// memory mappings (i.e. !filesystem.opts.forcePageCache). type dentryPlatformFile struct { *dentry @@ -891,8 +907,8 @@ type dentryPlatformFile struct { // by dentry.dataMu. fdRefs fsutil.FrameRefSet - // If this dentry represents a regular file, and handle.fd >= 0, - // hostFileMapper caches mappings of handle.fd. + // If this dentry represents a regular file, and dentry.hostFD >= 0, + // hostFileMapper caches mappings of dentry.hostFD. hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. @@ -916,15 +932,13 @@ func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() - bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write) - d.handleMu.RUnlock() - return bs, err + defer d.handleMu.RUnlock() + return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write) } // FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() - fd := d.handle.fd - d.handleMu.RUnlock() - return int(fd) + defer d.handleMu.RUnlock() + return int(d.hostFD) } diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go index fc269ef2b..a6368fdd0 100644 --- a/pkg/sentry/fsimpl/gofer/special_file.go +++ b/pkg/sentry/fsimpl/gofer/special_file.go @@ -17,6 +17,7 @@ package gofer import ( "sync" "sync/atomic" + "syscall" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" @@ -279,5 +280,13 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) ( // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *specialFileFD) Sync(ctx context.Context) error { - return fd.dentry().syncSharedHandle(ctx) + // If we have a host FD, fsyncing it is likely to be faster than an fsync + // RPC. + if fd.handle.fd >= 0 { + ctx.UninterruptibleSleepStart(false) + err := syscall.Fsync(int(fd.handle.fd)) + ctx.UninterruptibleSleepFinish(false) + return err + } + return fd.handle.file.fsync(ctx) } diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 576ab3920..d3c1197e3 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -356,6 +356,8 @@ type FileDescriptionImpl interface { // Allocate grows the file to offset + length bytes. // Only mode == 0 is supported currently. + // + // Preconditions: The FileDescription was opened for writing. Allocate(ctx context.Context, mode, offset, length uint64) error // waiter.Waitable methods may be used to poll for I/O events. @@ -565,6 +567,9 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { // Allocate grows file represented by FileDescription to offset + length bytes. func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error { + if !fd.IsWritable() { + return syserror.EBADF + } return fd.impl.Allocate(ctx, mode, offset, length) } diff --git a/test/syscalls/BUILD b/test/syscalls/BUILD index c19b30b4a..a31612b41 100644 --- a/test/syscalls/BUILD +++ b/test/syscalls/BUILD @@ -1023,6 +1023,7 @@ syscall_test( syscall_test( add_overlay = True, test = "//test/syscalls/linux:truncate_test", + vfs2 = "True", ) syscall_test( -- cgit v1.2.3 From 0b7cd1e751f70cf6b335754d2b173d9c12b674c7 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 7 Aug 2020 16:33:37 -0700 Subject: Mark dropped pages unevictable in fsimpl/gofer.dentry.destroyLocked. PiperOrigin-RevId: 325531657 --- pkg/sentry/fsimpl/gofer/gofer.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index f1d3bf911..4ac8dd81d 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1264,8 +1264,11 @@ func (d *dentry) destroyLocked(ctx context.Context) { } } // Discard cached data. - d.cache.DropAll(mf) - d.dirty.RemoveAll() + if !d.cache.IsEmpty() { + mf.MarkAllUnevictable(d) + d.cache.DropAll(mf) + d.dirty.RemoveAll() + } d.dataMu.Unlock() // Clunk open fids and close open host FDs. if !d.readFile.isNil() { -- cgit v1.2.3 From 343661770aa96efe3b539a82addff13df235413f Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Fri, 7 Aug 2020 18:32:03 -0700 Subject: Don't hold gofer.filesystem.renameMu during dentry destruction. PiperOrigin-RevId: 325546629 --- pkg/sentry/fsimpl/gofer/gofer.go | 20 +++++++++++++++----- pkg/sentry/fsimpl/gofer/gofer_test.go | 2 ++ 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 4ac8dd81d..63e589859 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -1158,7 +1158,8 @@ func (d *dentry) OnZeroWatches(ctx context.Context) { // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // -// Preconditions: d.fs.renameMu must be locked for writing. +// Preconditions: d.fs.renameMu must be locked for writing; it may be +// temporarily unlocked. func (d *dentry) checkCachingLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path @@ -1238,11 +1239,13 @@ func (d *dentry) checkCachingLocked(ctx context.Context) { } } -// destroyLocked destroys the dentry. It may flushes dirty pages from cache, -// close p9 file and remove reference on parent dentry. +// destroyLocked destroys the dentry. // -// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is -// not a child dentry. +// Preconditions: +// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked. +// * d.refs == 0. +// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal +// from its former parent dentry. func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: @@ -1254,6 +1257,10 @@ func (d *dentry) destroyLocked(ctx context.Context) { panic("dentry.destroyLocked() called with references on the dentry") } + // Allow the following to proceed without renameMu locked to improve + // scalability. + d.fs.renameMu.Unlock() + mf := d.fs.mfp.MemoryFile() d.handleMu.Lock() d.dataMu.Lock() @@ -1315,6 +1322,9 @@ func (d *dentry) destroyLocked(ctx context.Context) { delete(d.fs.syncableDentries, d) d.fs.syncMu.Unlock() } + + d.fs.renameMu.Lock() + // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if d.parent != nil { diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 36cca3625..bfe75dfe4 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -52,6 +52,8 @@ func TestDestroyIdempotent(t *testing.T) { } parent.cacheNewChildLocked(child, "child") + fs.renameMu.Lock() + defer fs.renameMu.Unlock() child.checkCachingLocked(ctx) if got := atomic.LoadInt64(&child.refs); got != -1 { t.Fatalf("child.refs=%d, want: -1", got) -- cgit v1.2.3 From 3be26a271cd0fc9618fbcb34e1a2a84c4f234c86 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Fri, 7 Aug 2020 20:06:39 -0700 Subject: [vfs2] Fix tmpfs mounting. Earlier we were using NLink to decide if /tmp is empty or not. However, NLink at best tells us about the number of subdirectories (via the ".." entries). NLink = n + 2 for n subdirectories. But it does not tell us if the directory is empty. There still might be non-directory files. We could also not rely on NLink because host overlayfs always returned 1. VFS1 uses Readdir to decide if the directory is empty. Used a similar approach. We now use IterDirents to decide if the "/tmp" directory is empty. Fixes #3369 PiperOrigin-RevId: 325554234 --- pkg/sentry/fsimpl/overlay/directory.go | 10 ++++++---- runsc/boot/vfs.go | 27 +++++++++++++++++++-------- 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go index fccb94105..6a79f7ffe 100644 --- a/pkg/sentry/fsimpl/overlay/directory.go +++ b/pkg/sentry/fsimpl/overlay/directory.go @@ -51,7 +51,7 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. maybeWhiteouts = maybeWhiteouts[:0] - if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name == "." || dirent.Name == ".." { return nil } @@ -68,7 +68,8 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string } // Non-whiteout file in the directory prevents rmdir. return syserror.ENOTEMPTY - })); err != nil { + })) + if err != nil { readdirErr = err return false } @@ -182,7 +183,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // Reuse slice allocated for maybeWhiteouts from a previous layer to // reduce allocations. maybeWhiteouts = maybeWhiteouts[:0] - if err := layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { if dirent.Name == "." || dirent.Name == ".." { return nil } @@ -201,7 +202,8 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { dirent.NextOff = int64(len(dirents) + 1) dirents = append(dirents, dirent) return nil - })); err != nil { + })) + if err != nil { readdirErr = err return false } diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index e7d6035bb..08dce8b6c 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -400,21 +400,28 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds Path: fspath.Parse("/tmp"), } // TODO(gvisor.dev/issue/2782): Use O_PATH when available. - statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{}) + fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY}) switch err { case nil: - // Found '/tmp' in filesystem, check if it's empty. - if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory { - // Not a dir?! Leave it be. + defer fd.DecRef(ctx) + + err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error { + if dirent.Name != "." && dirent.Name != ".." { + return syserror.ENOTEMPTY + } return nil - } - if statx.Nlink > 2 { + })) + switch err { + case nil: + log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) + case syserror.ENOTEMPTY: // If more than "." and ".." is found, skip internal tmpfs to prevent // hiding existing files. log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`) return nil + default: + return err } - log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`) fallthrough case syserror.ENOENT: @@ -429,8 +436,12 @@ func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds } return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount}) + case syserror.ENOTDIR: + // Not a dir?! Let it be. + return nil + default: - return fmt.Errorf(`stating "/tmp" inside container: %w`, err) + return fmt.Errorf(`opening "/tmp" inside container: %w`, err) } } -- cgit v1.2.3 From 51e64d2fc590b0271d4e0cbbc75882cf81ada182 Mon Sep 17 00:00:00 2001 From: Craig Chi Date: Mon, 10 Aug 2020 18:15:32 -0700 Subject: Implement FUSE_GETATTR FUSE_GETATTR is called when a stat(2), fstat(2), or lstat(2) is issued from VFS2 layer to a FUSE filesystem. Fixes #3175 --- pkg/abi/linux/fuse.go | 55 +++++++++++++ pkg/sentry/fsimpl/fuse/fusefs.go | 96 ++++++++++++++++++++++ test/fuse/BUILD | 8 ++ test/fuse/linux/BUILD | 21 +++-- test/fuse/linux/fuse_base.cc | 15 ++-- test/fuse/linux/fuse_base.h | 6 +- test/fuse/linux/stat_test.cc | 169 +++++++++++++++++++++++++++++++++++++++ test/runner/defs.bzl | 45 +++++------ 8 files changed, 375 insertions(+), 40 deletions(-) create mode 100644 test/fuse/linux/stat_test.cc (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/abi/linux/fuse.go b/pkg/abi/linux/fuse.go index 5c6ffe4a3..7e30483ee 100644 --- a/pkg/abi/linux/fuse.go +++ b/pkg/abi/linux/fuse.go @@ -246,3 +246,58 @@ type FUSEInitOut struct { _ [8]uint32 } + +// FUSEGetAttrIn is the request sent by the kernel to the daemon, +// to get the attribute of a inode. +// +// +marshal +type FUSEGetAttrIn struct { + // GetAttrFlags specifies whether getattr request is sent with a nodeid or + // with a file handle. + GetAttrFlags uint32 + + _ uint32 + + // Fh is the file handler when GetAttrFlags has FUSE_GETATTR_FH bit. If + // used, the operation is analogous to fstat(2). + Fh uint64 +} + +// FUSEAttr is the struct used in the reponse FUSEGetAttrOut. +// +// +marshal +type FUSEAttr struct { + Ino uint64 + Size uint64 + Blocks uint64 + Atime uint64 + Mtime uint64 + Ctime uint64 + AtimeNsec uint32 + MtimeNsec uint32 + CtimeNsec uint32 + Mode uint32 + Nlink uint32 + UID uint32 + GID uint32 + Rdev uint32 + BlkSize uint32 + _ uint32 +} + +// FUSEGetAttrOut is the reply sent by the daemon to the kernel +// for FUSEGetAttrIn. +// +// +marshal +type FUSEGetAttrOut struct { + // AttrValid and AttrValidNsec describe the attribute cache duration + AttrValid uint64 + + // AttrValidNsec is the nanosecond part of the attribute cache duration + AttrValidNsec uint32 + + _ uint32 + + // Attr contains the metadata returned from the FUSE server + Attr FUSEAttr +} diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go index a1405f7c3..83c24ec25 100644 --- a/pkg/sentry/fsimpl/fuse/fusefs.go +++ b/pkg/sentry/fsimpl/fuse/fusefs.go @@ -226,3 +226,99 @@ func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr } return fd.VFSFileDescription(), nil } + +// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The +// opts.Sync attribute is ignored since the synchronization is handled by the +// FUSE server. +func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { + var stat linux.Statx + stat.Blksize = attr.BlkSize + stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor + + rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev) + stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor + + if mask&linux.STATX_MODE != 0 { + stat.Mode = uint16(attr.Mode) + } + if mask&linux.STATX_NLINK != 0 { + stat.Nlink = attr.Nlink + } + if mask&linux.STATX_UID != 0 { + stat.UID = attr.UID + } + if mask&linux.STATX_GID != 0 { + stat.GID = attr.GID + } + if mask&linux.STATX_ATIME != 0 { + stat.Atime = linux.StatxTimestamp{ + Sec: int64(attr.Atime), + Nsec: attr.AtimeNsec, + } + } + if mask&linux.STATX_MTIME != 0 { + stat.Mtime = linux.StatxTimestamp{ + Sec: int64(attr.Mtime), + Nsec: attr.MtimeNsec, + } + } + if mask&linux.STATX_CTIME != 0 { + stat.Ctime = linux.StatxTimestamp{ + Sec: int64(attr.Ctime), + Nsec: attr.CtimeNsec, + } + } + if mask&linux.STATX_INO != 0 { + stat.Ino = attr.Ino + } + if mask&linux.STATX_SIZE != 0 { + stat.Size = attr.Size + } + if mask&linux.STATX_BLOCKS != 0 { + stat.Blocks = attr.Blocks + } + return stat +} + +// Stat implements kernfs.Inode.Stat. +func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { + fusefs := fs.Impl().(*filesystem) + conn := fusefs.conn + task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) + if task == nil { + log.Warningf("couldn't get kernel task from context") + return linux.Statx{}, syserror.EINVAL + } + + var in linux.FUSEGetAttrIn + // We don't set any attribute in the request, because in VFS2 fstat(2) will + // finally be translated into vfs.FilesystemImpl.StatAt() (see + // pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow + // as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2. + req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in) + if err != nil { + return linux.Statx{}, err + } + + res, err := conn.Call(task, req) + if err != nil { + return linux.Statx{}, err + } + if err := res.Error(); err != nil { + return linux.Statx{}, err + } + + var out linux.FUSEGetAttrOut + if err := res.UnmarshalPayload(&out); err != nil { + return linux.Statx{}, err + } + + // Set all metadata into kernfs.InodeAttrs. + if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{ + Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor), + }); err != nil { + return linux.Statx{}, err + } + + return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil +} diff --git a/test/fuse/BUILD b/test/fuse/BUILD index 34b950644..56157c96b 100644 --- a/test/fuse/BUILD +++ b/test/fuse/BUILD @@ -1 +1,9 @@ +load("//test/runner:defs.bzl", "syscall_test") + package(licenses = ["notice"]) + +syscall_test( + fuse = "True", + test = "//test/fuse/linux:stat_test", + vfs2 = "True", +) diff --git a/test/fuse/linux/BUILD b/test/fuse/linux/BUILD index 49dc96c20..4871bb531 100644 --- a/test/fuse/linux/BUILD +++ b/test/fuse/linux/BUILD @@ -1,20 +1,31 @@ -load("//tools:defs.bzl", "cc_library", "gtest") +load("//tools:defs.bzl", "cc_binary", "cc_library", "gtest") package( default_visibility = ["//:sandbox"], licenses = ["notice"], ) +cc_binary( + name = "stat_test", + testonly = 1, + srcs = ["stat_test.cc"], + deps = [ + gtest, + ":fuse_base", + "//test/util:test_main", + "//test/util:test_util", + ], +) + cc_library( name = "fuse_base", testonly = 1, - srcs = [ - "fuse_base.cc", - "fuse_base.h", - ], + srcs = ["fuse_base.cc"], + hdrs = ["fuse_base.h"], deps = [ gtest, "//test/util:posix_error", + "//test/util:temp_path", "//test/util:test_util", "@com_google_absl//absl/strings:str_format", ], diff --git a/test/fuse/linux/fuse_base.cc b/test/fuse/linux/fuse_base.cc index ce69276c9..4a2c64998 100644 --- a/test/fuse/linux/fuse_base.cc +++ b/test/fuse/linux/fuse_base.cc @@ -12,23 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "fuse_base.h" +#include "test/fuse/linux/fuse_base.h" #include #include -#include #include #include #include +#include #include #include #include -#include "gmock/gmock.h" -#include "gtest/gtest.h" #include "absl/strings/str_format.h" +#include "gtest/gtest.h" #include "test/util/posix_error.h" +#include "test/util/temp_path.h" #include "test/util/test_util.h" namespace gvisor { @@ -78,13 +78,14 @@ void FuseTest::MountFuse() { EXPECT_THAT(dev_fd_ = open("/dev/fuse", O_RDWR), SyscallSucceeds()); std::string mount_opts = absl::StrFormat("fd=%d,%s", dev_fd_, kMountOpts); - EXPECT_THAT(mount("fuse", kMountPoint, "fuse", MS_NODEV | MS_NOSUID, - mount_opts.c_str()), + mount_point_ = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + EXPECT_THAT(mount("fuse", mount_point_.path().c_str(), "fuse", + MS_NODEV | MS_NOSUID, mount_opts.c_str()), SyscallSucceedsWithValue(0)); } void FuseTest::UnmountFuse() { - EXPECT_THAT(umount(kMountPoint), SyscallSucceeds()); + EXPECT_THAT(umount(mount_point_.path().c_str()), SyscallSucceeds()); // TODO(gvisor.dev/issue/3330): ensure the process is terminated successfully. } diff --git a/test/fuse/linux/fuse_base.h b/test/fuse/linux/fuse_base.h index b008778de..3a2f255a9 100644 --- a/test/fuse/linux/fuse_base.h +++ b/test/fuse/linux/fuse_base.h @@ -20,14 +20,13 @@ #include -#include "gmock/gmock.h" #include "gtest/gtest.h" #include "test/util/posix_error.h" +#include "test/util/temp_path.h" namespace gvisor { namespace testing { -constexpr char kMountPoint[] = "/mnt"; constexpr char kMountOpts[] = "rootmode=755,user_id=0,group_id=0"; class FuseTest : public ::testing::Test { @@ -55,6 +54,9 @@ class FuseTest : public ::testing::Test { // complains if the FUSE server responds failure during tests. void WaitCompleted(); + protected: + TempPath mount_point_; + private: void MountFuse(); void UnmountFuse(); diff --git a/test/fuse/linux/stat_test.cc b/test/fuse/linux/stat_test.cc new file mode 100644 index 000000000..172e09867 --- /dev/null +++ b/test/fuse/linux/stat_test.cc @@ -0,0 +1,169 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "gtest/gtest.h" +#include "test/fuse/linux/fuse_base.h" +#include "test/util/test_util.h" + +namespace gvisor { +namespace testing { + +namespace { + +class StatTest : public FuseTest { + public: + bool CompareRequest(void* expected_mem, size_t expected_len, void* real_mem, + size_t real_len) override { + if (expected_len != real_len) return false; + struct fuse_in_header* real_header = + reinterpret_cast(real_mem); + + if (real_header->opcode != FUSE_GETATTR) { + std::cerr << "expect header opcode " << FUSE_GETATTR << " but got " + << real_header->opcode << std::endl; + return false; + } + return true; + } + + bool StatsAreEqual(struct stat expected, struct stat actual) { + // device number will be dynamically allocated by kernel, we cannot know + // in advance + actual.st_dev = expected.st_dev; + return memcmp(&expected, &actual, sizeof(struct stat)) == 0; + } +}; + +TEST_F(StatTest, StatNormal) { + struct iovec iov_in[2]; + struct iovec iov_out[2]; + + struct fuse_in_header in_header = { + .len = sizeof(struct fuse_in_header) + sizeof(struct fuse_getattr_in), + .opcode = FUSE_GETATTR, + .unique = 4, + .nodeid = 1, + .uid = 0, + .gid = 0, + .pid = 4, + .padding = 0, + }; + struct fuse_getattr_in in_payload = {0}; + iov_in[0].iov_len = sizeof(in_header); + iov_in[0].iov_base = &in_header; + iov_in[1].iov_len = sizeof(in_payload); + iov_in[1].iov_base = &in_payload; + + mode_t expected_mode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + struct timespec atime = {.tv_sec = 1595436289, .tv_nsec = 134150844}; + struct timespec mtime = {.tv_sec = 1595436290, .tv_nsec = 134150845}; + struct timespec ctime = {.tv_sec = 1595436291, .tv_nsec = 134150846}; + struct fuse_out_header out_header = { + .len = sizeof(struct fuse_out_header) + sizeof(struct fuse_attr_out), + .error = 0, + .unique = 4, + }; + struct fuse_attr attr = { + .ino = 1, + .size = 512, + .blocks = 4, + .atime = static_cast(atime.tv_sec), + .mtime = static_cast(mtime.tv_sec), + .ctime = static_cast(ctime.tv_sec), + .atimensec = static_cast(atime.tv_nsec), + .mtimensec = static_cast(mtime.tv_nsec), + .ctimensec = static_cast(ctime.tv_nsec), + .mode = expected_mode, + .nlink = 2, + .uid = 1234, + .gid = 4321, + .rdev = 12, + .blksize = 4096, + }; + struct fuse_attr_out out_payload = { + .attr = attr, + }; + iov_out[0].iov_len = sizeof(out_header); + iov_out[0].iov_base = &out_header; + iov_out[1].iov_len = sizeof(out_payload); + iov_out[1].iov_base = &out_payload; + + SetExpected(iov_in, 2, iov_out, 2); + + struct stat stat_buf; + EXPECT_THAT(stat(mount_point_.path().c_str(), &stat_buf), SyscallSucceeds()); + + struct stat expected_stat = { + .st_ino = attr.ino, + .st_nlink = attr.nlink, + .st_mode = expected_mode, + .st_uid = attr.uid, + .st_gid = attr.gid, + .st_rdev = attr.rdev, + .st_size = static_cast(attr.size), + .st_blksize = attr.blksize, + .st_blocks = static_cast(attr.blocks), + .st_atim = atime, + .st_mtim = mtime, + .st_ctim = ctime, + }; + EXPECT_TRUE(StatsAreEqual(stat_buf, expected_stat)); + WaitCompleted(); +} + +TEST_F(StatTest, StatNotFound) { + struct iovec iov_in[2]; + struct iovec iov_out[2]; + + struct fuse_in_header in_header = { + .len = sizeof(struct fuse_in_header) + sizeof(struct fuse_getattr_in), + .opcode = FUSE_GETATTR, + .unique = 4, + }; + struct fuse_getattr_in in_payload = {0}; + iov_in[0].iov_len = sizeof(in_header); + iov_in[0].iov_base = &in_header; + iov_in[1].iov_len = sizeof(in_payload); + iov_in[1].iov_base = &in_payload; + + struct fuse_out_header out_header = { + .len = sizeof(struct fuse_out_header), + .error = -ENOENT, + .unique = 4, + }; + iov_out[0].iov_len = sizeof(out_header); + iov_out[0].iov_base = &out_header; + + SetExpected(iov_in, 2, iov_out, 1); + + struct stat stat_buf; + EXPECT_THAT(stat(mount_point_.path().c_str(), &stat_buf), + SyscallFailsWithErrno(ENOENT)); + WaitCompleted(); +} + +} // namespace + +} // namespace testing +} // namespace gvisor diff --git a/test/runner/defs.bzl b/test/runner/defs.bzl index ba4732ca6..1ae13a4a5 100644 --- a/test/runner/defs.bzl +++ b/test/runner/defs.bzl @@ -150,31 +150,12 @@ def syscall_test( if not tags: tags = [] - _syscall_test( - test = test, - shard_count = shard_count, - size = size, - platform = "native", - use_tmpfs = False, - add_uds_tree = add_uds_tree, - tags = list(tags), - ) - - for (platform, platform_tags) in platforms.items(): - _syscall_test( - test = test, - shard_count = shard_count, - size = size, - platform = platform, - use_tmpfs = use_tmpfs, - add_uds_tree = add_uds_tree, - tags = platform_tags + tags, - ) - vfs2_tags = list(tags) if vfs2: # Add tag to easily run VFS2 tests with --test_tag_filters=vfs2 vfs2_tags.append("vfs2") + if fuse: + vfs2_tags.append("fuse") else: # Don't automatically run tests tests not yet passing. @@ -191,19 +172,31 @@ def syscall_test( add_uds_tree = add_uds_tree, tags = platforms[default_platform] + vfs2_tags, vfs2 = True, + fuse = fuse, + ) + if fuse: + # Only generate *_vfs2_fuse target if fuse parameter is enabled. + return + + _syscall_test( + test = test, + shard_count = shard_count, + size = size, + platform = "native", + use_tmpfs = False, + add_uds_tree = add_uds_tree, + tags = list(tags), ) - if vfs2 and fuse: + for (platform, platform_tags) in platforms.items(): _syscall_test( test = test, shard_count = shard_count, size = size, - platform = default_platform, + platform = platform, use_tmpfs = use_tmpfs, add_uds_tree = add_uds_tree, - tags = platforms[default_platform] + vfs2_tags + ["fuse"], - vfs2 = True, - fuse = True, + tags = platform_tags + tags, ) # TODO(gvisor.dev/issue/1487): Enable VFS2 overlay tests. -- cgit v1.2.3 From d797f2666629737920b39a2fbac369ce5853dc35 Mon Sep 17 00:00:00 2001 From: Fabricio Voznika Date: Wed, 12 Aug 2020 13:12:05 -0700 Subject: Redirect TODO Fixes #2923 PiperOrigin-RevId: 326296589 --- pkg/sentry/fsimpl/host/host.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go index bf922c566..bd6caba06 100644 --- a/pkg/sentry/fsimpl/host/host.go +++ b/pkg/sentry/fsimpl/host/host.go @@ -552,7 +552,7 @@ func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uin return syserror.ESPIPE } - // TODO(gvisor.dev/issue/2923): Implement Allocate for non-pipe hostfds. + // TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds. return syserror.EOPNOTSUPP } -- cgit v1.2.3 From 42b610d56750b4bb8e3d69b680e4fb538f8fb554 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Wed, 12 Aug 2020 17:17:17 -0700 Subject: [vfs2][gofer] Return appropriate errors when opening and creating files. Fixes php test ext/standard/tests/file/touch_variation5.phpt on vfs2. Updates #3516 Also spotted a bug with O_EXCL, where we did not return EEXIST when we tried to open the root of the filesystem with O_EXCL | O_CREAT. Added some more tests for open() corner cases. PiperOrigin-RevId: 326346863 --- pkg/sentry/fsimpl/gofer/filesystem.go | 11 +++++++++++ pkg/sentry/fsimpl/tmpfs/filesystem.go | 2 +- test/syscalls/linux/open.cc | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index eaef2594d..40fec890a 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -844,6 +844,13 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf } } if rp.Done() { + // Reject attempts to open mount root directory with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } return start.openLocked(ctx, rp, &opts) } @@ -856,6 +863,10 @@ afterTrailingSymlink: if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } + // Reject attempts to open directories with O_CREAT. + if mayCreate && rp.MustBeDir() { + return nil, syserror.EISDIR + } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index 065812065..a4864df53 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -320,7 +320,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf fs.mu.Lock() defer fs.mu.Unlock() if rp.Done() { - // Reject attempts to open directories with O_CREAT. + // Reject attempts to open mount root directory with O_CREAT. if rp.MustBeDir() { return nil, syserror.EISDIR } diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index bf350946b..c7147c20b 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -95,6 +95,38 @@ TEST_F(OpenTest, OTruncAndReadOnlyFile) { Open(dirpath.c_str(), O_TRUNC | O_RDONLY, 0666)); } +TEST_F(OpenTest, OCreateDirectory) { + SKIP_IF(IsRunningWithVFS1()); + auto dirpath = GetAbsoluteTestTmpdir(); + + // Normal case: existing directory. + ASSERT_THAT(open(dirpath.c_str(), O_RDWR | O_CREAT, 0666), + SyscallFailsWithErrno(EISDIR)); + // Trailing separator on existing directory. + ASSERT_THAT(open(dirpath.append("/").c_str(), O_RDWR | O_CREAT, 0666), + SyscallFailsWithErrno(EISDIR)); + // Trailing separator on non-existing directory. + ASSERT_THAT(open(JoinPath(dirpath, "non-existent").append("/").c_str(), + O_RDWR | O_CREAT, 0666), + SyscallFailsWithErrno(EISDIR)); + // "." special case. + ASSERT_THAT(open(JoinPath(dirpath, ".").c_str(), O_RDWR | O_CREAT, 0666), + SyscallFailsWithErrno(EISDIR)); +} + +TEST_F(OpenTest, MustCreateExisting) { + auto dirPath = GetAbsoluteTestTmpdir(); + + // Existing directory. + ASSERT_THAT(open(dirPath.c_str(), O_RDWR | O_CREAT | O_EXCL, 0666), + SyscallFailsWithErrno(EEXIST)); + + // Existing file. + auto newFile = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateFileIn(dirPath)); + ASSERT_THAT(open(newFile.path().c_str(), O_RDWR | O_CREAT | O_EXCL, 0666), + SyscallFailsWithErrno(EEXIST)); +} + TEST_F(OpenTest, ReadOnly) { char buf; const FileDescriptor ro_file = -- cgit v1.2.3 From 36134667b289a3e96e90abf9e18e1c2137069d6b Mon Sep 17 00:00:00 2001 From: Dean Deng Date: Wed, 12 Aug 2020 18:09:34 -0700 Subject: Add reference leak checking to vfs2 tmpfs.inode. Updates #1486. PiperOrigin-RevId: 326354750 --- pkg/refs_vfs2/refs_template.go | 2 +- pkg/sentry/fsimpl/tmpfs/BUILD | 13 +++++++++++++ pkg/sentry/fsimpl/tmpfs/tmpfs.go | 27 ++++++--------------------- 3 files changed, 20 insertions(+), 22 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/refs_vfs2/refs_template.go b/pkg/refs_vfs2/refs_template.go index 3e5b458c7..99c43c065 100644 --- a/pkg/refs_vfs2/refs_template.go +++ b/pkg/refs_vfs2/refs_template.go @@ -59,7 +59,7 @@ func (r *Refs) finalize() { note = "(Leak checker uninitialized): " } if n := r.ReadRefs(); n != 0 { - log.Warningf("%sAtomicRefCount %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n) + log.Warningf("%sRefs %p owned by %T garbage collected with ref count of %d (want 0)", note, r, ownerType, n) } } diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD index e73732a6b..5cd428d64 100644 --- a/pkg/sentry/fsimpl/tmpfs/BUILD +++ b/pkg/sentry/fsimpl/tmpfs/BUILD @@ -26,6 +26,17 @@ go_template_instance( }, ) +go_template_instance( + name = "inode_refs", + out = "inode_refs.go", + package = "tmpfs", + prefix = "inode", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "inode", + }, +) + go_library( name = "tmpfs", srcs = [ @@ -34,6 +45,7 @@ go_library( "directory.go", "filesystem.go", "fstree.go", + "inode_refs.go", "named_pipe.go", "regular_file.go", "socket_file.go", @@ -47,6 +59,7 @@ go_library( "//pkg/context", "//pkg/fspath", "//pkg/log", + "//pkg/refs", "//pkg/safemem", "//pkg/sentry/arch", "//pkg/sentry/fs", diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go index 4681a2f52..de2af6d01 100644 --- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go +++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go @@ -285,13 +285,10 @@ type inode struct { // fs is the owning filesystem. fs is immutable. fs *filesystem - // refs is a reference count. refs is accessed using atomic memory - // operations. - // // A reference is held on all inodes as long as they are reachable in the // filesystem tree, i.e. nlink is nonzero. This reference is dropped when // nlink reaches 0. - refs int64 + refs inodeRefs // xattrs implements extended attributes. // @@ -327,7 +324,6 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth panic("file type is required in FileMode") } i.fs = fs - i.refs = 1 i.mode = uint32(mode) i.uid = uint32(kuid) i.gid = uint32(kgid) @@ -339,6 +335,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth i.mtime = now // i.nlink initialized by caller i.impl = impl + i.refs.EnableLeakCheck() } // incLinksLocked increments i's link count. @@ -369,25 +366,15 @@ func (i *inode) decLinksLocked(ctx context.Context) { } func (i *inode) incRef() { - if atomic.AddInt64(&i.refs, 1) <= 1 { - panic("tmpfs.inode.incRef() called without holding a reference") - } + i.refs.IncRef() } func (i *inode) tryIncRef() bool { - for { - refs := atomic.LoadInt64(&i.refs) - if refs == 0 { - return false - } - if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { - return true - } - } + return i.refs.TryIncRef() } func (i *inode) decRef(ctx context.Context) { - if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + i.refs.DecRef(func() { i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is @@ -395,9 +382,7 @@ func (i *inode) decRef(ctx context.Context) { // metadata. regFile.data.DropAll(regFile.memFile) } - } else if refs < 0 { - panic("tmpfs.inode.decRef() called without holding a reference") - } + }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { -- cgit v1.2.3 From d6520e1d0592f99161faedef3eba49439b140917 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Thu, 13 Aug 2020 19:31:17 -0700 Subject: [vfs2][gofer] Fix file creation flags sent to gofer. Fixes php runtime test ext/standard/tests/file/readfile_basic.phpt Fixes #3516 fsgofers only want the access mode in the OpenFlags passed to Create(). If more flags are supplied (like O_APPEND in this case), read/write from that fd will fail with EBADF. See runsc/fsgofer/fsgofer.go:WriteAt() VFS2 was providing more than just access modes. So filtering the flags using p9.OpenFlagsModeMask == linux.O_ACCMODE fixes the issue. Gofer in VFS1 also only extracts the access mode flags while making the create RPC. See pkg/sentry/fs/gofer/path.go:Create() Even in VFS2, when we open a handle, we extract out only the access mode flags + O_TRUNC. See third_party/gvisor/pkg/sentry/fsimpl/gofer/handle.go:openHandle() Added a test for this. PiperOrigin-RevId: 326574829 --- pkg/sentry/fsimpl/gofer/filesystem.go | 6 ++---- test/syscalls/linux/open.cc | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 40fec890a..610a7ed78 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -1082,10 +1082,8 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving } creds := rp.Credentials() name := rp.Component() - // Filter file creation flags and O_LARGEFILE out; the create RPC already - // has the semantics of O_CREAT|O_EXCL, while some servers will choke on - // O_LARGEFILE. - createFlags := p9.OpenFlags(opts.Flags &^ (vfs.FileCreationFlags | linux.O_LARGEFILE)) + // We only want the access mode for creating the file. + createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) if err != nil { dirfile.close(ctx) diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index c7147c20b..8f0c9cb49 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -147,6 +147,26 @@ TEST_F(OpenTest, WriteOnly) { EXPECT_THAT(write(wo_file.get(), &buf, 1), SyscallSucceedsWithValue(1)); } +TEST_F(OpenTest, CreateWithAppend) { + std::string data = "text"; + std::string new_file = NewTempAbsPath(); + const FileDescriptor file = ASSERT_NO_ERRNO_AND_VALUE( + Open(new_file, O_WRONLY | O_APPEND | O_CREAT, 0666)); + EXPECT_THAT(write(file.get(), data.c_str(), data.size()), + SyscallSucceedsWithValue(data.size())); + EXPECT_THAT(lseek(file.get(), 0, SEEK_SET), SyscallSucceeds()); + EXPECT_THAT(write(file.get(), data.c_str(), data.size()), + SyscallSucceedsWithValue(data.size())); + + // Check that the size of the file is correct and that the offset has been + // incremented to that size. + struct stat s0; + EXPECT_THAT(fstat(file.get(), &s0), SyscallSucceeds()); + EXPECT_EQ(s0.st_size, 2 * data.size()); + EXPECT_THAT(lseek(file.get(), 0, SEEK_CUR), + SyscallSucceedsWithValue(2 * data.size())); +} + TEST_F(OpenTest, ReadWrite) { char buf; const FileDescriptor rw_file = -- cgit v1.2.3 From e3e1b36896750e9c4d07ecb6eaf1a738b08cd876 Mon Sep 17 00:00:00 2001 From: Ayush Ranjan Date: Mon, 17 Aug 2020 13:24:09 -0700 Subject: [vfs] Do O_DIRECTORY check after resolving symlinks. Fixes python runtime test test_glob. Updates #3515 We were checking is the to-be-opened dentry is a dir or not before resolving symlinks. We should check that after resolving symlinks. This was preventing us from opening a symlink which pointed to a directory with O_DIRECTORY. Also added this check in tmpfs and removed a duplicate check. PiperOrigin-RevId: 327085895 --- pkg/sentry/fsimpl/gofer/filesystem.go | 6 +++--- pkg/sentry/fsimpl/tmpfs/filesystem.go | 5 ++--- test/syscalls/linux/open.cc | 8 ++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index 610a7ed78..a3903db33 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -886,9 +886,6 @@ afterTrailingSymlink: if mustCreate { return nil, syserror.EEXIST } - if !child.isDir() && rp.MustBeDir() { - return nil, syserror.ENOTDIR - } // Open existing child or follow symlink. if child.isSymlink() && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) @@ -901,6 +898,9 @@ afterTrailingSymlink: start = parent goto afterTrailingSymlink } + if rp.MustBeDir() && !child.isDir() { + return nil, syserror.ENOTDIR + } return child.openLocked(ctx, rp, &opts) } diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go index a4864df53..cb8b2d944 100644 --- a/pkg/sentry/fsimpl/tmpfs/filesystem.go +++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go @@ -389,9 +389,8 @@ afterTrailingSymlink: start = &parentDir.dentry goto afterTrailingSymlink } - // Open existing file. - if mustCreate { - return nil, syserror.EEXIST + if rp.MustBeDir() && !child.inode.isDir() { + return nil, syserror.ENOTDIR } return child.open(ctx, rp, &opts, false) } diff --git a/test/syscalls/linux/open.cc b/test/syscalls/linux/open.cc index 8f0c9cb49..77f390f3c 100644 --- a/test/syscalls/linux/open.cc +++ b/test/syscalls/linux/open.cc @@ -27,6 +27,7 @@ #include "test/util/cleanup.h" #include "test/util/file_descriptor.h" #include "test/util/fs_util.h" +#include "test/util/posix_error.h" #include "test/util/temp_path.h" #include "test/util/test_util.h" #include "test/util/thread_util.h" @@ -408,6 +409,13 @@ TEST_F(OpenTest, FileNotDirectory) { SyscallFailsWithErrno(ENOTDIR)); } +TEST_F(OpenTest, SymlinkDirectory) { + auto dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + std::string link = NewTempAbsPath(); + ASSERT_THAT(symlink(dir.path().c_str(), link.c_str()), SyscallSucceeds()); + ASSERT_NO_ERRNO(Open(link, O_RDONLY | O_DIRECTORY)); +} + TEST_F(OpenTest, Null) { char c = '\0'; ASSERT_THAT(open(&c, O_RDONLY), SyscallFailsWithErrno(ENOENT)); -- cgit v1.2.3 From 6d0c5803d557d453f15ac6f683697eeb46dab680 Mon Sep 17 00:00:00 2001 From: gVisor bot Date: Mon, 17 Aug 2020 16:29:10 -0700 Subject: Add a skeleton for verity file system PiperOrigin-RevId: 327123477 --- pkg/sentry/fsimpl/verity/BUILD | 23 +++ pkg/sentry/fsimpl/verity/filesystem.go | 333 +++++++++++++++++++++++++++++++ pkg/sentry/fsimpl/verity/verity.go | 355 +++++++++++++++++++++++++++++++++ 3 files changed, 711 insertions(+) create mode 100644 pkg/sentry/fsimpl/verity/BUILD create mode 100644 pkg/sentry/fsimpl/verity/filesystem.go create mode 100644 pkg/sentry/fsimpl/verity/verity.go (limited to 'pkg/sentry/fsimpl') diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD new file mode 100644 index 000000000..28d2a4bcb --- /dev/null +++ b/pkg/sentry/fsimpl/verity/BUILD @@ -0,0 +1,23 @@ +load("//tools:defs.bzl", "go_library") + +licenses(["notice"]) + +go_library( + name = "verity", + srcs = [ + "filesystem.go", + "verity.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/context", + "//pkg/fspath", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/vfs", + "//pkg/sync", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go new file mode 100644 index 000000000..78c6074bd --- /dev/null +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -0,0 +1,333 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package verity + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *filesystem) Sync(ctx context.Context) error { + // All files should be read-only. + return nil +} + +var dentrySlicePool = sync.Pool{ + New: func() interface{} { + ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity + return &ds + }, +} + +func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { + if ds == nil { + ds = dentrySlicePool.Get().(*[]*dentry) + } + *ds = append(*ds, d) + return ds +} + +// Preconditions: ds != nil. +func putDentrySlice(ds *[]*dentry) { + // Allow dentries to be GC'd. + for i := range *ds { + (*ds)[i] = nil + } + *ds = (*ds)[:0] + dentrySlicePool.Put(ds) +} + +// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls +// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for +// writing. +// +// ds is a pointer-to-pointer since defer evaluates its arguments immediately, +// but dentry slices are allocated lazily, and it's much easier to say "defer +// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() { +// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this. +func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + fs.renameMu.RUnlock() + if *ds == nil { + return + } + if len(**ds) != 0 { + fs.renameMu.Lock() + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + } + putDentrySlice(*ds) +} + +func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) { + if *ds == nil { + fs.renameMu.Unlock() + return + } + for _, d := range **ds { + d.checkDropLocked(ctx) + } + fs.renameMu.Unlock() + putDentrySlice(*ds) +} + +// resolveLocked resolves rp to an existing file. +func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { + // TODO(b/159261227): Implement resolveLocked. + return nil, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory, starting from the given directory (which is usually +// rp.Start().Impl().(*dentry)). It does not check that the returned directory +// is searchable by the provider of rp. +// +// Preconditions: fs.renameMu must be locked. !rp.Done(). +func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { + // TODO(b/159261227): Implement walkParentDirLocked. + return nil, nil +} + +// AccessAt implements vfs.Filesystem.Impl.AccessAt. +func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { + // Verity file system is read-only. + if ats&vfs.MayWrite != 0 { + return syserror.EROFS + } + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return err + } + return d.checkPermissions(creds, ats) +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !d.isDir() { + return nil, syserror.ENOTDIR + } + if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { + return nil, err + } + } + d.IncRef() + return &d.vfsd, nil +} + +// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. +func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + start := rp.Start().Impl().(*dentry) + d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) + if err != nil { + return nil, err + } + d.IncRef() + return &d.vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + //TODO(b/159261227): Implement OpenAt. + return nil, nil +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + //TODO(b/162787271): Provide integrity check for ReadlinkAt. + return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return linux.Statx{}, err + } + + var stat linux.Statx + stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }, &opts) + if err != nil { + return linux.Statx{}, err + } + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + // TODO(b/159261227): Implement StatFSAt. + return linux.Statfs{}, nil +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. +func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil { + return nil, err + } + return nil, syserror.ECONNREFUSED +} + +// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt. +func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return nil, err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, size) +} + +// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt. +func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) { + var ds *[]*dentry + fs.renameMu.RLock() + defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds) + d, err := fs.resolveLocked(ctx, rp, &ds) + if err != nil { + return "", err + } + lowerVD := d.lowerVD + return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: lowerVD, + Start: lowerVD, + }, &opts) +} + +// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt. +func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt. +func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { + // Verity file system is read-only. + return syserror.EROFS +} + +// PrependPath implements vfs.FilesystemImpl.PrependPath. +func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { + fs.renameMu.RLock() + defer fs.renameMu.RUnlock() + mnt := vd.Mount() + d := vd.Dentry().Impl().(*dentry) + for { + if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { + return vfs.PrependPathAtVFSRootError{} + } + if &d.vfsd == mnt.Root() { + return nil + } + if d.parent == nil { + return vfs.PrependPathAtNonMountRootError{} + } + b.PrependComponent(d.name) + d = d.parent + } +} diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go new file mode 100644 index 000000000..cb29d33a5 --- /dev/null +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -0,0 +1,355 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package verity provides a filesystem implementation that is a wrapper of +// another file system. +// The verity file system provides integrity check for the underlying file +// system by providing verification for path traversals and each read. +// The verity file system is read-only, except for one case: when +// allowRuntimeEnable is true, additional Merkle files can be generated using +// the FS_IOC_ENABLE_VERITY ioctl. +package verity + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Name is the default filesystem name. +const Name = "verity" + +// testOnlyDebugging allows verity file system to return error instead of +// crashing the application when a malicious action is detected. This should +// only be set for tests. +var testOnlyDebugging bool + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// filesystem implements vfs.FilesystemImpl. +type filesystem struct { + vfsfs vfs.Filesystem + + // creds is a copy of the filesystem's creator's credentials, which are + // used for accesses to the underlying file system. creds is immutable. + creds *auth.Credentials + + // allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY + // to build Merkle trees in the verity file system is allowed. If this + // is false, no new Merkle trees can be built, and only the files that + // had Merkle trees before startup (e.g. from a host filesystem mounted + // with gofer fs) can be verified. + allowRuntimeEnable bool + + // lowerMount is the underlying file system mount. + lowerMount *vfs.Mount + + // rootDentry is the mount root Dentry for this file system, which + // stores the root hash of the whole file system in bytes. + rootDentry *dentry + + // renameMu synchronizes renaming with non-renaming operations in order + // to ensure consistent lock ordering between dentry.dirMu in different + // dentries. + renameMu sync.RWMutex +} + +// InternalFilesystemOptions may be passed as +// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. +type InternalFilesystemOptions struct { + // RootMerkleFileName is the name of the verity root Merkle tree file. + RootMerkleFileName string + + // LowerName is the name of the filesystem wrapped by verity fs. + LowerName string + + // RootHash is the root hash of the overall verity file system. + RootHash []byte + + // AllowRuntimeEnable specifies whether the verity file system allows + // enabling verification for files (i.e. building Merkle trees) during + // runtime. + AllowRuntimeEnable bool + + // LowerGetFSOptions is the file system option for the lower layer file + // system wrapped by verity file system. + LowerGetFSOptions vfs.GetFilesystemOptions + + // TestOnlyDebugging allows verity file system to return error instead + // of crashing the application when a malicious action is detected. This + // should only be set for tests. + TestOnlyDebugging bool +} + +// Name implements vfs.FilesystemType.Name. +func (FilesystemType) Name() string { + return Name +} + +// GetFilesystem implements vfs.FilesystemType.GetFilesystem. +func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + //TODO(b/159261227): Implement GetFilesystem. + return nil, nil, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *filesystem) Release(ctx context.Context) { + fs.lowerMount.DecRef(ctx) +} + +// dentry implements vfs.DentryImpl. +type dentry struct { + vfsd vfs.Dentry + + refs int64 + + // fs is the owning filesystem. fs is immutable. + fs *filesystem + + // mode, uid and gid are the file mode, owner, and group of the file in + // the underlying file system. + mode uint32 + uid uint32 + gid uint32 + + // parent is the dentry corresponding to this dentry's parent directory. + // name is this dentry's name in parent. If this dentry is a filesystem + // root, parent is nil and name is the empty string. parent and name are + // protected by fs.renameMu. + parent *dentry + name string + + // If this dentry represents a directory, children maps the names of + // children for which dentries have been instantiated to those dentries, + // and dirents (if not nil) is a cache of dirents as returned by + // directoryFDs representing this directory. children is protected by + // dirMu. + dirMu sync.Mutex + children map[string]*dentry + + // lowerVD is the VirtualDentry in the underlying file system. + lowerVD vfs.VirtualDentry + + // lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree + // in the underlying file system. + lowerMerkleVD vfs.VirtualDentry + + // rootHash is the rootHash for the current file or directory. + rootHash []byte +} + +// newDentry creates a new dentry representing the given verity file. The +// dentry initially has no references; it is the caller's responsibility to set +// the dentry's reference count and/or call dentry.destroy() as appropriate. +// The dentry is initially invalid in that it contains no underlying dentry; +// the caller is responsible for setting them. +func (fs *filesystem) newDentry() *dentry { + d := &dentry{ + fs: fs, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *dentry) IncRef() { + atomic.AddInt64(&d.refs, 1) +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *dentry) TryIncRef() bool { + for { + refs := atomic.LoadInt64(&d.refs) + if refs <= 0 { + return false + } + if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) { + return true + } + } +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *dentry) DecRef(ctx context.Context) { + if refs := atomic.AddInt64(&d.refs, -1); refs == 0 { + d.fs.renameMu.Lock() + d.checkDropLocked(ctx) + d.fs.renameMu.Unlock() + } else if refs < 0 { + panic("verity.dentry.DecRef() called without holding a reference") + } +} + +// checkDropLocked should be called after d's reference count becomes 0 or it +// becomes deleted. +func (d *dentry) checkDropLocked(ctx context.Context) { + // Dentries with a positive reference count must be retained. Dentries + // with a negative reference count have already been destroyed. + if atomic.LoadInt64(&d.refs) != 0 { + return + } + // Refs is still zero; destroy it. + d.destroyLocked(ctx) + return +} + +// destroyLocked destroys the dentry. +// +// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. +func (d *dentry) destroyLocked(ctx context.Context) { + switch atomic.LoadInt64(&d.refs) { + case 0: + // Mark the dentry destroyed. + atomic.StoreInt64(&d.refs, -1) + case -1: + panic("verity.dentry.destroyLocked() called on already destroyed dentry") + default: + panic("verity.dentry.destroyLocked() called with references on the dentry") + } + + if d.lowerVD.Ok() { + d.lowerVD.DecRef(ctx) + } + + if d.lowerMerkleVD.Ok() { + d.lowerMerkleVD.DecRef(ctx) + } + + if d.parent != nil { + d.parent.dirMu.Lock() + if !d.vfsd.IsDead() { + delete(d.parent.children, d.name) + } + d.parent.dirMu.Unlock() + if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 { + d.parent.checkDropLocked(ctx) + } else if refs < 0 { + panic("verity.dentry.DecRef() called without holding a reference") + } + } +} + +// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. +func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { + //TODO(b/159261227): Implement InotifyWithParent. +} + +// Watches implements vfs.DentryImpl.Watches. +func (d *dentry) Watches() *vfs.Watches { + //TODO(b/159261227): Implement Watches. + return nil +} + +// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. +func (d *dentry) OnZeroWatches(context.Context) { + //TODO(b/159261227): Implement OnZeroWatches. +} + +func (d *dentry) isSymlink() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK +} + +func (d *dentry) isDir() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR +} + +func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { + return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) +} + +func (d *dentry) readlink(ctx context.Context) (string, error) { + return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ + Root: d.lowerVD, + Start: d.lowerVD, + }) +} + +// FileDescription implements vfs.FileDescriptionImpl for verity fds. +// FileDescription is a wrapper of the underlying lowerFD, with support to build +// Merkle trees through the Linux fs-verity API to verify contents read from +// lowerFD. +type fileDescription struct { + vfsfd vfs.FileDescription + vfs.FileDescriptionDefaultImpl + vfs.LockFD + + // d is the corresponding dentry to the fileDescription. + d *dentry + + // isDir specifies whehter the fileDescription points to a directory. + isDir bool + + // lowerFD is the FileDescription corresponding to the file in the + // underlying file system. + lowerFD *vfs.FileDescription + + // merkleReader is the read-only FileDescription corresponding to the + // Merkle tree file in the underlying file system. + merkleReader *vfs.FileDescription + + // merkleWriter is the FileDescription corresponding to the Merkle tree + // file in the underlying file system for writing. This should only be + // used when allowRuntimeEnable is set to true. + merkleWriter *vfs.FileDescription + + // parentMerkleWriter is the FileDescription of the Merkle tree for the + // directory that contains the current file/directory. This is only used + // if allowRuntimeEnable is set to true. + parentMerkleWriter *vfs.FileDescription +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *fileDescription) Release(ctx context.Context) { + fd.lowerFD.DecRef(ctx) + fd.merkleReader.DecRef(ctx) + if fd.merkleWriter != nil { + fd.merkleWriter.DecRef(ctx) + } + if fd.parentMerkleWriter != nil { + fd.parentMerkleWriter.DecRef(ctx) + } +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + // TODO(b/162788573): Add integrity check for metadata. + stat, err := fd.lowerFD.Stat(ctx, opts) + if err != nil { + return linux.Statx{}, err + } + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + // Verity files are read-only. + return syserror.EPERM +} + +// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. +func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error { + return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block) +} + +// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. +func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error { + return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence) +} -- cgit v1.2.3