From e1c8eaca8f8413b17dab8f01b2e123e9d4b9ddbc Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 31 Mar 2020 15:00:26 -0700 Subject: Fix /proc/self/mounts and /proc/self/mountinfo in VFS2. Some extra fields were added to the Mount type to expose necessary data to the proc filesystem. PiperOrigin-RevId: 304053361 --- pkg/sentry/fsimpl/proc/task_files.go | 183 +++++---------------------------- pkg/sentry/vfs/mount.go | 192 ++++++++++++++++++++++++++++++++++- 2 files changed, 219 insertions(+), 156 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go index 8c743df8d..df0d1bcc5 100644 --- a/pkg/sentry/fsimpl/proc/task_files.go +++ b/pkg/sentry/fsimpl/proc/task_files.go @@ -18,13 +18,10 @@ import ( "bytes" "fmt" "io" - "sort" - "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/safemem" - "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -634,51 +631,6 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) { return } -// forEachMountSource runs f for the process root mount and each mount that is -// a descendant of the root. -func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) { - var fsctx *kernel.FSContext - t.WithMuLocked(func(t *kernel.Task) { - fsctx = t.FSContext() - }) - if fsctx == nil { - // The task has been destroyed. Nothing to show here. - return - } - - // All mount points must be relative to the rootDir, and mounts outside - // will be excluded. - rootDir := fsctx.RootDirectory() - if rootDir == nil { - // The task has been destroyed. Nothing to show here. - return - } - defer rootDir.DecRef() - - mnt := t.MountNamespace().FindMount(rootDir) - if mnt == nil { - // Has it just been unmounted? - return - } - ms := t.MountNamespace().AllMountsUnder(mnt) - sort.Slice(ms, func(i, j int) bool { - return ms[i].ID < ms[j].ID - }) - for _, m := range ms { - mroot := m.Root() - if mroot == nil { - continue // No longer valid. - } - mountPath, desc := mroot.FullName(rootDir) - mroot.DecRef() - if !desc { - // MountSources that are not descendants of the chroot jail are ignored. - continue - } - fn(mountPath, m) - } -} - // mountInfoData is used to implement /proc/[pid]/mountinfo. // // +stateify savable @@ -692,92 +644,22 @@ var _ dynamicInode = (*mountInfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { - forEachMount(i.task, func(mountPath string, m *fs.Mount) { - mroot := m.Root() - if mroot == nil { - return // No longer valid. - } - defer mroot.DecRef() - - // Format: - // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue - // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) - - // (1) MountSource ID. - fmt.Fprintf(buf, "%d ", m.ID) - - // (2) Parent ID (or this ID if there is no parent). - pID := m.ID - if !m.IsRoot() && !m.IsUndo() { - pID = m.ParentID - } - fmt.Fprintf(buf, "%d ", pID) - - // (3) Major:Minor device ID. We don't have a superblock, so we - // just use the root inode device number. - sa := mroot.Inode.StableAttr - fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) - - // (4) Root: the pathname of the directory in the filesystem - // which forms the root of this mount. - // - // NOTE(b/78135857): This will always be "/" until we implement - // bind mounts. - fmt.Fprintf(buf, "/ ") - - // (5) Mount point (relative to process root). - fmt.Fprintf(buf, "%s ", mountPath) - - // (6) Mount options. - flags := mroot.Inode.MountSource.Flags - opts := "rw" - if flags.ReadOnly { - opts = "ro" - } - if flags.NoAtime { - opts += ",noatime" - } - if flags.NoExec { - opts += ",noexec" - } - fmt.Fprintf(buf, "%s ", opts) - - // (7) Optional fields: zero or more fields of the form "tag[:value]". - // (8) Separator: the end of the optional fields is marked by a single hyphen. - fmt.Fprintf(buf, "- ") - - // (9) Filesystem type. - fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType) - - // (10) Mount source: filesystem-specific information or "none". - fmt.Fprintf(buf, "none ") - - // (11) Superblock options, and final newline. - fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource)) + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() }) - return nil -} - -func superBlockOpts(mountPath string, msrc *fs.MountSource) string { - // gVisor doesn't (yet) have a concept of super block options, so we - // use the ro/rw bit from the mount flag. - opts := "rw" - if msrc.Flags.ReadOnly { - opts = "ro" + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil } - - // NOTE(b/147673608): If the mount is a cgroup, we also need to include - // the cgroup name in the options. For now we just read that from the - // path. - // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we - // should get this value from the cgroup itself, and not rely on the - // path. - if msrc.FilesystemType == "cgroup" { - splitPath := strings.Split(mountPath, "/") - cgroupType := splitPath[len(splitPath)-1] - opts += "," + cgroupType + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil } - return opts + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf) + return nil } // mountsData is used to implement /proc/[pid]/mounts. @@ -789,33 +671,24 @@ type mountsData struct { task *kernel.Task } -var _ dynamicInode = (*mountInfoData)(nil) +var _ dynamicInode = (*mountsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error { - forEachMount(i.task, func(mountPath string, m *fs.Mount) { - // Format: - // - // - // We use the filesystem name as the first field, since there - // is no real block device we can point to, and we also should - // not expose anything about the remote filesystem. - // - // Only ro/rw option is supported for now. - // - // The "needs dump"and fsck flags are always 0, which is allowed. - root := m.Root() - if root == nil { - return // No longer valid. - } - defer root.DecRef() - - flags := root.Inode.MountSource.Flags - opts := "rw" - if flags.ReadOnly { - opts = "ro" - } - fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0) + var fsctx *kernel.FSContext + i.task.WithMuLocked(func(t *kernel.Task) { + fsctx = t.FSContext() }) + if fsctx == nil { + // The task has been destroyed. Nothing to show here. + return nil + } + rootDir := fsctx.RootDirectoryVFS2() + if !rootDir.Ok() { + // Root has been destroyed. Don't try to read mounts. + return nil + } + defer rootDir.DecRef() + i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf) return nil } diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index 4b68cabda..7792eb1a0 100644 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go @@ -15,7 +15,11 @@ package vfs import ( + "bytes" + "fmt" "math" + "sort" + "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" @@ -44,7 +48,7 @@ var lastMountID uint64 // // +stateify savable type Mount struct { - // vfs, fs, and root are immutable. References are held on fs and root. + // vfs, fs, root are immutable. References are held on fs and root. // // Invariant: root belongs to fs. vfs *VirtualFilesystem @@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error { return nil } +func (mnt *Mount) readOnly() bool { + return atomic.LoadInt64(&mnt.writers) < 0 +} + // Filesystem returns the mounted Filesystem. It does not take a reference on // the returned Filesystem. func (mnt *Mount) Filesystem() *Filesystem { return mnt.fs } +// submountsLocked returns this Mount and all Mounts that are descendents of +// it. +// +// Precondition: mnt.vfs.mountMu must be held. +func (mnt *Mount) submountsLocked() []*Mount { + mounts := []*Mount{mnt} + for m := range mnt.children { + mounts = append(mounts, m.submountsLocked()...) + } + return mounts +} + // Root returns mntns' root. A reference is taken on the returned // VirtualDentry. func (mntns *MountNamespace) Root() VirtualDentry { @@ -655,3 +675,173 @@ func (mntns *MountNamespace) Root() VirtualDentry { vd.IncRef() return vd } + +// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. +// +// Preconditions: taskRootDir.Ok(). +func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. + mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.flags.NoExec { + opts += ",noexec" + } + + // Format: + // + // + // The "needs dump" and "fsck order" flags are always 0, which + // is allowed. + fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) + } +} + +// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to +// buf. +// +// Preconditions: taskRootDir.Ok(). +func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { + vfs.mountMu.Lock() + defer vfs.mountMu.Unlock() + rootMnt := taskRootDir.mount + mounts := rootMnt.submountsLocked() + sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) + for _, mnt := range mounts { + // Get the path to this mount relative to task root. + mntRootVD := VirtualDentry{ + mount: mnt, + dentry: mnt.root, + } + path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) + if err != nil { + // For some reason we didn't get a path. Log a warning + // and run with empty path. + ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err) + path = "" + } + if path == "" { + // Either an error occurred, or path is not reachable + // from root. + break + } + // Stat the mount root to get the major/minor device numbers. + pop := &PathOperation{ + Root: mntRootVD, + Start: mntRootVD, + } + statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{}) + if err != nil { + // Well that's not good. Ignore this mount. + break + } + + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) Mount ID. + fmt.Fprintf(buf, "%d ", mnt.ID) + + // (2) Parent ID (or this ID if there is no parent). + pID := mnt.ID + if p := mnt.parent(); p != nil { + pID = p.ID + } + fmt.Fprintf(buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE(b/78135857): This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(buf, "%s ", manglePath(path)) + + // (6) Mount options. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + if mnt.flags.NoExec { + opts += ",noexec" + } + // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is + // set. + fmt.Fprintf(buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(buf, "- ") + + // (9) Filesystem type. + fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) + + // (10) Mount source: filesystem-specific information or "none". + fmt.Fprintf(buf, "none ") + + // (11) Superblock options, and final newline. + fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt)) + } +} + +// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. +// See Linux fs/seq_file.c:mangle_path. +func manglePath(p string) string { + r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") + return r.Replace(p) +} + +// superBlockOpts returns the super block options string for the the mount at +// the given path. +func superBlockOpts(mountPath string, mnt *Mount) string { + // gVisor doesn't (yet) have a concept of super block options, so we + // use the ro/rw bit from the mount flag. + opts := "rw" + if mnt.readOnly() { + opts = "ro" + } + + // NOTE(b/147673608): If the mount is a cgroup, we also need to include + // the cgroup name in the options. For now we just read that from the + // path. + // TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we + // should get this value from the cgroup itself, and not rely on the + // path. + if mnt.fs.FilesystemType().Name() == "cgroup" { + splitPath := strings.Split(mountPath, "/") + cgroupType := splitPath[len(splitPath)-1] + opts += "," + cgroupType + } + return opts +} -- cgit v1.2.3 From b6639f77e59d885cb092c15a7a0c5a988e149b40 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Tue, 31 Mar 2020 15:00:30 -0700 Subject: Include original copyUp error in panic if cleanupUpper fails. When copyUp fails, we attempt to clean up the upper filesystem by removing any files that have already been copied-up. If the cleanup fails, we panic because the "overlay filesystem is in an inconsistent state". This CL adds the original copy-up error to the panic information, to hopefully make it easier to track down how the overlay filesystem got into the inconsistent state. PiperOrigin-RevId: 304053370 --- pkg/sentry/fs/copy_up.go | 28 +++++++++++++++------------- pkg/sentry/fs/inode_overlay.go | 3 ++- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'pkg/sentry') diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go index b060a12ff..ab1424c95 100644 --- a/pkg/sentry/fs/copy_up.go +++ b/pkg/sentry/fs/copy_up.go @@ -222,8 +222,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { - log.Warningf("copy up failed to lookup directory: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to lookup directory: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } defer childUpper.DecRef() @@ -242,8 +242,8 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { } childUpper, err := parentUpper.Lookup(ctx, next.name) if err != nil { - log.Warningf("copy up failed to lookup symlink: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to lookup symlink: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } defer childUpper.DecRef() @@ -256,23 +256,23 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { // Bring file attributes up to date. This does not include size, which will be // brought up to date with copyContentsLocked. if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { - log.Warningf("copy up failed to copy up attributes: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to copy up attributes: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } // Copy the entire file. if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { - log.Warningf("copy up failed to copy up contents: %v", err) - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed to copy up contents: %v", err) + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } lowerMappable := next.Inode.overlay.lower.Mappable() upperMappable := childUpperInode.Mappable() if lowerMappable != nil && upperMappable == nil { - log.Warningf("copy up failed: cannot ensure memory mapping coherence") - cleanupUpper(ctx, parentUpper, next.name) + werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence") + cleanupUpper(ctx, parentUpper, next.name, werr) return syserror.EIO } @@ -324,12 +324,14 @@ func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { return nil } -// cleanupUpper removes name from parent, and panics if it is unsuccessful. -func cleanupUpper(ctx context.Context, parent *Inode, name string) { +// cleanupUpper is called when copy-up fails. It logs the copy-up error and +// attempts to remove name from parent. If that fails, then it panics. +func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) { + log.Warningf(copyUpErr.Error()) if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil { // Unfortunately we don't have much choice. We shouldn't // willingly give the caller access to a nonsense filesystem. - panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err)) + panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err)) } } diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go index 5ada33a32..537c8d257 100644 --- a/pkg/sentry/fs/inode_overlay.go +++ b/pkg/sentry/fs/inode_overlay.go @@ -231,7 +231,8 @@ func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name st upperFile.Dirent.Inode.IncRef() entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false) if err != nil { - cleanupUpper(ctx, o.upper, name) + werr := fmt.Errorf("newOverlayEntry failed: %v", err) + cleanupUpper(ctx, o.upper, name, werr) return nil, err } -- cgit v1.2.3