From 12bde95635ac266aab8087b4705372bb177638f3 Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Fri, 17 Apr 2020 10:38:04 -0700 Subject: Get /bin/true to run on VFS2 Included: - loader_test.go RunTest and TestStartSignal VFS2 - container_test.go TestAppExitStatus on VFS2 - experimental flag added to runsc to turn on VFS2 Note: shared mounts are not yet supported. PiperOrigin-RevId: 307070753 --- runsc/boot/vfs.go | 310 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 runsc/boot/vfs.go (limited to 'runsc/boot/vfs.go') diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go new file mode 100644 index 000000000..82083c57d --- /dev/null +++ b/runsc/boot/vfs.go @@ -0,0 +1,310 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package boot + +import ( + "fmt" + "path" + "strconv" + "strings" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/devices/memdev" + "gvisor.dev/gvisor/pkg/sentry/fs" + devtmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" + goferimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer" + procimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc" + sysimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys" + tmpfsimpl "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" + "gvisor.dev/gvisor/pkg/syserror" + + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" +) + +func registerFilesystems(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error { + + vfsObj.MustRegisterFilesystemType(rootFsName, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(bind, &goferimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devpts, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + vfsObj.MustRegisterFilesystemType(devtmpfs, &devtmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(proc, &procimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(sysfs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(tmpfs, &tmpfsimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + vfsObj.MustRegisterFilesystemType(nonefs, &sysimpl.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{ + AllowUserMount: true, + AllowUserList: true, + }) + + // Setup files in devtmpfs. + if err := memdev.Register(vfsObj); err != nil { + return fmt.Errorf("registering memdev: %w", err) + } + a, err := devtmpfsimpl.NewAccessor(ctx, vfsObj, creds, devtmpfsimpl.Name) + if err != nil { + return fmt.Errorf("creating devtmpfs accessor: %w", err) + } + defer a.Release() + + if err := a.UserspaceInit(ctx); err != nil { + return fmt.Errorf("initializing userspace: %w", err) + } + if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil { + return fmt.Errorf("creating devtmpfs files: %w", err) + } + return nil +} + +func setupContainerVFS2(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error { + if err := mntr.k.VFS().Init(); err != nil { + return fmt.Errorf("failed to initialize VFS: %w", err) + } + mns, err := mntr.setupVFS2(ctx, conf, procArgs) + if err != nil { + return fmt.Errorf("failed to setupFS: %w", err) + } + procArgs.MountNamespaceVFS2 = mns + return setExecutablePathVFS2(ctx, procArgs) +} + +func setExecutablePathVFS2(ctx context.Context, procArgs *kernel.CreateProcessArgs) error { + + exe := procArgs.Argv[0] + + // Absolute paths can be used directly. + if path.IsAbs(exe) { + procArgs.Filename = exe + return nil + } + + // Paths with '/' in them should be joined to the working directory, or + // to the root if working directory is not set. + if strings.IndexByte(exe, '/') > 0 { + + if !path.IsAbs(procArgs.WorkingDirectory) { + return fmt.Errorf("working directory %q must be absolute", procArgs.WorkingDirectory) + } + + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Paths with a '/' are relative to the CWD. + if strings.IndexByte(exe, '/') > 0 { + procArgs.Filename = path.Join(procArgs.WorkingDirectory, exe) + return nil + } + + // Otherwise, We must lookup the name in the paths, starting from the + // root directory. + root := procArgs.MountNamespaceVFS2.Root() + defer root.DecRef() + + paths := fs.GetPath(procArgs.Envv) + creds := procArgs.Credentials + + for _, p := range paths { + + binPath := path.Join(p, exe) + + pop := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(binPath), + FollowFinalSymlink: true, + } + + opts := &vfs.OpenOptions{ + FileExec: true, + Flags: linux.O_RDONLY, + } + + dentry, err := root.Mount().Filesystem().VirtualFilesystem().OpenAt(ctx, creds, pop, opts) + if err == syserror.ENOENT || err == syserror.EACCES { + // Didn't find it here. + continue + } + if err != nil { + return err + } + dentry.DecRef() + + procArgs.Filename = binPath + return nil + } + + return fmt.Errorf("executable %q not found in $PATH=%q", exe, strings.Join(paths, ":")) +} + +func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) { + log.Infof("Configuring container's file system with VFS2") + + // Create context with root credentials to mount the filesystem (the current + // user may not be privileged enough). + rootProcArgs := *procArgs + rootProcArgs.WorkingDirectory = "/" + rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace) + rootProcArgs.Umask = 0022 + rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals + rootCtx := procArgs.NewContext(c.k) + + creds := procArgs.Credentials + if err := registerFilesystems(rootCtx, c.k.VFS(), creds); err != nil { + return nil, fmt.Errorf("register filesystems: %w", err) + } + + fd := c.fds.remove() + + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mountnamespace: %w", err) + } + + rootProcArgs.MountNamespaceVFS2 = mns + + // Mount submounts. + if err := c.mountSubmountsVFS2(rootCtx, conf, mns, creds); err != nil { + return nil, fmt.Errorf("mounting submounts vfs2: %w", err) + } + + return mns, nil +} + +func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + + for _, submount := range c.mounts { + log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) + if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { + return err + } + } + + // TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go. + + return c.checkDispenser() +} + +// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. +func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { + root := mns.Root() + defer root.DecRef() + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(submount.Destination), + } + + _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + if err != nil { + return fmt.Errorf("mountOptions failed: %w", err) + } + + opts := &vfs.MountOptions{ + GetFilesystemOptions: vfs.GetFilesystemOptions{ + Data: strings.Join(options, ","), + }, + InternalMount: true, + } + + // All writes go to upper, be paranoid and make lower readonly. + opts.ReadOnly = useOverlay + + if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { + return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) + } + log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts) + return nil +} + +// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values +// used for mounts. +func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m specs.Mount) (string, []string, bool, error) { + var ( + fsName string + opts []string + useOverlay bool + ) + + switch m.Type { + case devpts, devtmpfs, proc, sysfs: + fsName = m.Type + case nonefs: + fsName = sysfs + case tmpfs: + fsName = m.Type + + var err error + opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...) + if err != nil { + return "", nil, false, err + } + + case bind: + fd := c.fds.remove() + fsName = "9p" + opts = p9MountOptionsVFS2(fd, c.getMountAccessType(m)) + // If configured, add overlay to all writable mounts. + useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly + + default: + log.Warningf("ignoring unknown filesystem type %q", m.Type) + } + return fsName, opts, useOverlay, nil +} + +// p9MountOptions creates a slice of options for a p9 mount. +// TODO(gvisor.dev/issue/1200): Remove this version in favor of the one in +// fs.go when privateunixsocket lands. +func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { + opts := []string{ + "trans=fd", + "rfdno=" + strconv.Itoa(fd), + "wfdno=" + strconv.Itoa(fd), + } + if fa == FileAccessShared { + opts = append(opts, "cache=remote_revalidating") + } + return opts +} -- cgit v1.2.3 From 5042ea7e2cbdc0c04fd454583589a3b1e152f95d Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 23 Apr 2020 15:35:56 -0700 Subject: Add vfs.MkdirOptions.ForSyntheticMountpoint. PiperOrigin-RevId: 308143529 --- pkg/sentry/fsimpl/gofer/directory.go | 147 +++++++++++----- pkg/sentry/fsimpl/gofer/filesystem.go | 323 ++++++++++++++++++++++------------ pkg/sentry/fsimpl/gofer/gofer.go | 177 ++++++++++++------- pkg/sentry/fsimpl/gofer/gofer_test.go | 2 +- pkg/sentry/vfs/filesystem.go | 3 +- pkg/sentry/vfs/options.go | 19 ++ runsc/boot/vfs.go | 6 + 7 files changed, 461 insertions(+), 216 deletions(-) (limited to 'runsc/boot/vfs.go') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index d02691232..c67766ab2 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -21,8 +21,10 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isDir() bool { @@ -41,15 +43,46 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.children[name] = child } -// Preconditions: d.dirMu must be locked. d.isDir(). fs.opts.interop != -// InteropModeShared. -func (d *dentry) cacheNegativeChildLocked(name string) { +// Preconditions: d.dirMu must be locked. d.isDir(). +func (d *dentry) cacheNegativeLookupLocked(name string) { + // Don't cache negative lookups if InteropModeShared is in effect (since + // this makes remote lookup unavoidable), or if d.isSynthetic() (in which + // case the only files in the directory are those for which a dentry exists + // in d.children). Instead, just delete any previously-cached dentry. + if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { + delete(d.children, name) + return + } if d.children == nil { d.children = make(map[string]*dentry) } d.children[name] = nil } +// createSyntheticDirectory creates a synthetic directory with the given name +// in d. +// +// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain +// a child with the given name. +func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) { + d2 := &dentry{ + refs: 1, // held by d + fs: d.fs, + mode: uint32(mode) | linux.S_IFDIR, + uid: uint32(kuid), + gid: uint32(kgid), + blockSize: usermem.PageSize, // arbitrary + handle: handle{ + fd: -1, + }, + } + d2.pf.dentry = d2 + d2.vfsd.Init(d2) + + d.cacheNewChildLocked(d2, name) + d.syntheticChildren++ +} + type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl @@ -77,7 +110,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba fd.dirents = ds } - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } @@ -108,10 +141,10 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // filesystem.renameMu is needed for d.parent, and must be locked before // dentry.dirMu. d.fs.renameMu.RLock() + defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() if d.dirents != nil { - d.fs.renameMu.RUnlock() return d.dirents, nil } @@ -132,51 +165,81 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { NextOff: 2, }, } - d.fs.renameMu.RUnlock() - off := uint64(0) - const count = 64 * 1024 // for consistency with the vfs1 client - d.handleMu.RLock() - defer d.handleMu.RUnlock() - if !d.handleReadable { - // This should not be possible because a readable handle should have - // been opened when the calling directoryFD was opened. - panic("gofer.dentry.getDirents called without a readable handle") - } - for { - p9ds, err := d.handle.file.readdir(ctx, off, count) - if err != nil { - return nil, err + var realChildren map[string]struct{} + if !d.isSynthetic() { + if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { + // Record the set of children d actually has so that we don't emit + // duplicate entries for synthetic children. + realChildren = make(map[string]struct{}) } - if len(p9ds) == 0 { - // Cache dirents for future directoryFDs if permitted. - if d.fs.opts.interop != InteropModeShared { - d.dirents = dirents + off := uint64(0) + const count = 64 * 1024 // for consistency with the vfs1 client + d.handleMu.RLock() + if !d.handleReadable { + // This should not be possible because a readable handle should + // have been opened when the calling directoryFD was opened. + d.handleMu.RUnlock() + panic("gofer.dentry.getDirents called without a readable handle") + } + for { + p9ds, err := d.handle.file.readdir(ctx, off, count) + if err != nil { + d.handleMu.RUnlock() + return nil, err + } + if len(p9ds) == 0 { + d.handleMu.RUnlock() + break + } + for _, p9d := range p9ds { + if p9d.Name == "." || p9d.Name == ".." { + continue + } + dirent := vfs.Dirent{ + Name: p9d.Name, + Ino: p9d.QID.Path, + NextOff: int64(len(dirents) + 1), + } + // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or + // DMSOCKET. + switch p9d.Type { + case p9.TypeSymlink: + dirent.Type = linux.DT_LNK + case p9.TypeDir: + dirent.Type = linux.DT_DIR + default: + dirent.Type = linux.DT_REG + } + dirents = append(dirents, dirent) + if realChildren != nil { + realChildren[p9d.Name] = struct{}{} + } } - return dirents, nil + off = p9ds[len(p9ds)-1].Offset } - for _, p9d := range p9ds { - if p9d.Name == "." || p9d.Name == ".." { + } + // Emit entries for synthetic children. + if d.syntheticChildren != 0 { + for _, child := range d.children { + if child == nil || !child.isSynthetic() { continue } - dirent := vfs.Dirent{ - Name: p9d.Name, - Ino: p9d.QID.Path, - NextOff: int64(len(dirents) + 1), - } - // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or - // DMSOCKET. - switch p9d.Type { - case p9.TypeSymlink: - dirent.Type = linux.DT_LNK - case p9.TypeDir: - dirent.Type = linux.DT_DIR - default: - dirent.Type = linux.DT_REG + if _, ok := realChildren[child.name]; ok { + continue } - dirents = append(dirents, dirent) + dirents = append(dirents, vfs.Dirent{ + Name: child.name, + Type: uint8(atomic.LoadUint32(&child.mode) >> 12), + Ino: child.ino, + NextOff: int64(len(dirents) + 1), + }) } - off = p9ds[len(p9ds)-1].Offset } + // Cache dirents for future directoryFDs if permitted. + if d.cachedMetadataAuthoritative() { + d.dirents = dirents + } + return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go index eba4aabe8..98ccb42fd 100644 --- a/pkg/sentry/fsimpl/gofer/filesystem.go +++ b/pkg/sentry/fsimpl/gofer/filesystem.go @@ -29,14 +29,16 @@ import ( // Sync implements vfs.FilesystemImpl.Sync. func (fs *filesystem) Sync(ctx context.Context) error { - // Snapshot current dentries and special files. + // Snapshot current syncable dentries and special files. fs.syncMu.Lock() - ds := make([]*dentry, 0, len(fs.dentries)) - for d := range fs.dentries { + ds := make([]*dentry, 0, len(fs.syncableDentries)) + for d := range fs.syncableDentries { + d.IncRef() ds = append(ds, d) } sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs)) for sffd := range fs.specialFileFDs { + sffd.vfsfd.IncRef() sffds = append(sffds, sffd) } fs.syncMu.Unlock() @@ -47,9 +49,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync regular files. for _, d := range ds { - if !d.TryIncRef() { - continue - } err := d.syncSharedHandle(ctx) d.DecRef() if err != nil && retErr == nil { @@ -60,9 +59,6 @@ func (fs *filesystem) Sync(ctx context.Context) error { // Sync special files, which may be writable but do not use dentry shared // handles (so they won't be synced by the above). for _, sffd := range sffds { - if !sffd.vfsfd.TryIncRef() { - continue - } err := sffd.Sync(ctx) sffd.vfsfd.DecRef() if err != nil && retErr == nil { @@ -114,8 +110,8 @@ func putDentrySlice(ds *[]*dentry) { // to *ds. // // Preconditions: fs.renameMu must be locked. d.dirMu must be locked. -// !rp.Done(). If fs.opts.interop == InteropModeShared, then d's cached -// metadata must be up to date. +// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata +// must be up to date. // // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { @@ -148,7 +144,7 @@ afterSymlink: if err := rp.CheckMount(&d.parent.vfsd); err != nil { return nil, err } - if fs.opts.interop == InteropModeShared && d != d.parent { + if d != d.parent && !d.cachedMetadataAuthoritative() { _, attrMask, attr, err := d.parent.file.getAttr(ctx, dentryAttrMask()) if err != nil { return nil, err @@ -195,7 +191,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return nil, syserror.ENAMETOOLONG } child, ok := parent.children[name] - if ok && fs.opts.interop != InteropModeShared { + if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() { // Whether child is nil or not, it is cached information that is // assumed to be correct. return child, nil @@ -206,7 +202,7 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds) } -// Preconditions: As for getChildLocked. +// Preconditions: As for getChildLocked. !parent.isSynthetic(). func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) { qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil && err != syserror.ENOENT { @@ -220,24 +216,41 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir child.updateFromP9Attrs(attrMask, &attr) return child, nil } - // The file at this path has changed or no longer exists. Remove - // the stale dentry from the tree, and re-evaluate its caching - // status (i.e. if it has 0 references, drop it). + if file.isNil() && child.isSynthetic() { + // We have a synthetic file, and no remote file has arisen to + // replace it. + return child, nil + } + // The file at this path has changed or no longer exists. Mark the + // dentry invalidated, and re-evaluate its caching status (i.e. if it + // has 0 references, drop it). Wait to update parent.children until we + // know what to replace the existing dentry with (i.e. one of the + // returns below), to avoid a redundant map access. vfsObj.InvalidateDentry(&child.vfsd) + if child.isSynthetic() { + // Normally we don't mark invalidated dentries as deleted since + // they may still exist (but at a different path), and also for + // consistency with Linux. However, synthetic files are guaranteed + // to become unreachable if their dentries are invalidated, so + // treat their invalidation as deletion. + child.setDeleted() + parent.syntheticChildren-- + child.decRefLocked() + parent.dirents = nil + } *ds = appendDentry(*ds, child) } if file.isNil() { // No file exists at this path now. Cache the negative lookup if // allowed. - if fs.opts.interop != InteropModeShared { - parent.cacheNegativeChildLocked(name) - } + parent.cacheNegativeLookupLocked(name) return nil, nil } // Create a new dentry representing the file. child, err = fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) + delete(parent.children, name) return nil, err } parent.cacheNewChildLocked(child, name) @@ -252,8 +265,9 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // -// Preconditions: fs.renameMu must be locked. !rp.Done(). If fs.opts.interop == -// InteropModeShared, then d's cached metadata must be up to date. +// Preconditions: fs.renameMu must be locked. !rp.Done(). If +// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to +// date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { for !rp.Final() { d.dirMu.Lock() @@ -275,7 +289,7 @@ func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving // Preconditions: fs.renameMu must be locked. func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !d.cachedMetadataAuthoritative() { // Get updated metadata for rp.Start() as required by fs.stepLocked(). if err := d.updateFromGetattr(ctx); err != nil { return nil, err @@ -297,16 +311,17 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, } // doCreateAt checks that creating a file at rp is permitted, then invokes -// create to do so. +// createInRemoteDir (if the parent directory is a real remote directory) or +// createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: !rp.Done(). For the final path component in rp, // !rp.ShouldFollowSymlink(). -func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error { +func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -340,6 +355,20 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir defer mnt.EndWrite() parent.dirMu.Lock() defer parent.dirMu.Unlock() + if parent.isSynthetic() { + if child := parent.children[name]; child != nil { + return syserror.EEXIST + } + if createInSyntheticDir == nil { + return syserror.EPERM + } + if err := createInSyntheticDir(parent, name); err != nil { + return err + } + parent.touchCMtime() + parent.dirents = nil + return nil + } if fs.opts.interop == InteropModeShared { // The existence of a dentry at name would be inconclusive because the // file it represents may have been deleted from the remote filesystem, @@ -348,21 +377,21 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir // will fail with EEXIST like we would have. If the RPC succeeds, and a // stale dentry exists, the dentry will fail revalidation next time // it's used. - return create(parent, name) + return createInRemoteDir(parent, name) } if child := parent.children[name]; child != nil { return syserror.EEXIST } // No cached dentry exists; however, there might still be an existing file // at name. As above, we attempt the file creation RPC anyway. - if err := create(parent, name); err != nil { + if err := createInRemoteDir(parent, name); err != nil { return err } + if child, ok := parent.children[name]; ok && child == nil { + // Delete the now-stale negative dentry. + delete(parent.children, name) + } parent.touchCMtime() - // Either parent.children[name] doesn't exist (in which case this is a - // no-op) or is nil (in which case this erases the now-stale information - // that the file doesn't exist). - delete(parent.children, name) parent.dirents = nil return nil } @@ -373,7 +402,7 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -421,8 +450,10 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b // only revalidating the dentry if that fails (indicating that the existing // dentry is a mount point). if child != nil { + child.dirMu.Lock() + defer child.dirMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { - if fs.opts.interop != InteropModeShared { + if parent.cachedMetadataAuthoritative() { return err } child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds) @@ -437,13 +468,37 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b } } flags := uint32(0) + // If a dentry exists, use it for best-effort checks on its deletability. if dir { - if child != nil && !child.isDir() { - vfsObj.AbortDeleteDentry(&child.vfsd) - return syserror.ENOTDIR + if child != nil { + // child must be an empty directory. + if child.syntheticChildren != 0 { + // This is definitely not an empty directory, irrespective of + // fs.opts.interop. + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + // If InteropModeShared is in effect and the first call to + // PrepareDeleteDentry above succeeded, then child wasn't + // revalidated (so we can't expect its file type to be correct) and + // individually revalidating its children (to confirm that they + // still exist) would be a waste of time. + if child.cachedMetadataAuthoritative() { + if !child.isDir() { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTDIR + } + for _, grandchild := range child.children { + if grandchild != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + return syserror.ENOTEMPTY + } + } + } } flags = linux.AT_REMOVEDIR } else { + // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) return syserror.EISDIR @@ -455,28 +510,36 @@ func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir b return syserror.ENOTDIR } } - err = parent.file.unlinkAt(ctx, name, flags) - if err != nil { - if child != nil { - vfsObj.AbortDeleteDentry(&child.vfsd) - } - return err - } - if fs.opts.interop != InteropModeShared { - parent.touchCMtime() - if dir { - parent.decLinks() + if parent.isSynthetic() { + if child == nil { + return syserror.ENOENT } - parent.cacheNegativeChildLocked(name) - parent.dirents = nil } else { - delete(parent.children, name) + err = parent.file.unlinkAt(ctx, name, flags) + if err != nil { + if child != nil { + vfsObj.AbortDeleteDentry(&child.vfsd) + } + return err + } } if child != nil { - child.setDeleted() vfsObj.CommitDeleteDentry(&child.vfsd) + child.setDeleted() + if child.isSynthetic() { + parent.syntheticChildren-- + child.decRefLocked() + } ds = appendDentry(ds, child) } + parent.cacheNegativeLookupLocked(name) + if parent.cachedMetadataAuthoritative() { + parent.dirents = nil + parent.touchCMtime() + if dir { + parent.decLinks() + } + } return nil } @@ -554,7 +617,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by // fs.walkParentDirLocked(). if err := start.updateFromGetattr(ctx); err != nil { @@ -577,20 +640,32 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs. } // 9P2000.L supports hard links, but we don't. return syserror.EPERM - }) + }, nil) } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + creds := rp.Credentials() return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error { - creds := rp.Credentials() if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil { - return err + if !opts.ForSyntheticMountpoint || err == syserror.EEXIST { + return err + } + ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err) + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) } if fs.opts.interop != InteropModeShared { parent.incLinks() } return nil + }, func(parent *dentry, name string) error { + if !opts.ForSyntheticMountpoint { + // Can't create non-synthetic files in synthetic directories. + return syserror.EPERM + } + parent.createSyntheticDirectoryLocked(name, opts.Mode, creds.EffectiveKUID, creds.EffectiveKGID) + parent.incLinks() + return nil }) } @@ -600,7 +675,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v creds := rp.Credentials() _, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. @@ -620,7 +695,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf defer fs.renameMuRUnlockAndCheckCaching(&ds) start := rp.Start().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !start.cachedMetadataAuthoritative() { // Get updated metadata for start as required by fs.stepLocked(). if err := start.updateFromGetattr(ctx); err != nil { return nil, err @@ -643,6 +718,10 @@ afterTrailingSymlink: parent.dirMu.Lock() child, err := fs.stepLocked(ctx, rp, parent, &ds) if err == syserror.ENOENT && mayCreate { + if parent.isSynthetic() { + parent.dirMu.Unlock() + return nil, syserror.EPERM + } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts) parent.dirMu.Unlock() return fd, err @@ -702,8 +781,10 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf if opts.Flags&linux.O_DIRECT != 0 { return nil, syserror.EINVAL } - if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { - return nil, err + if !d.isSynthetic() { + if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { + return nil, err + } } fd := &directoryFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { @@ -733,6 +814,7 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf } // Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked. +// !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err @@ -811,7 +893,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving child.refs = 1 // Insert the dentry into the tree. d.cacheNewChildLocked(child, name) - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { d.touchCMtime() d.dirents = nil } @@ -888,7 +970,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) - if fs.opts.interop == InteropModeShared { + if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateFromGetattr(ctx); err != nil { return err } @@ -933,35 +1015,22 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if newParent.isDeleted() { return syserror.ENOENT } - replaced := newParent.children[newName] - // This is similar to unlinkAt, except: - // - // - If a dentry exists for the file to be replaced, we revalidate it - // unconditionally (instead of only if PrepareRenameDentry fails) for - // simplicity. - // - // - If rp.MustBeDir(), then we need a dentry representing the replaced - // file regardless to confirm that it's a directory. - if replaced != nil || rp.MustBeDir() { - replaced, err = fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) - if err != nil { - return err - } - if replaced != nil { - if replaced.isDir() { - if !renamed.isDir() { - return syserror.EISDIR - } - } else { - if rp.MustBeDir() || renamed.isDir() { - return syserror.ENOTDIR - } - } - } + replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds) + if err != nil { + return err } var replacedVFSD *vfs.Dentry if replaced != nil { replacedVFSD = &replaced.vfsd + if replaced.isDir() { + if !renamed.isDir() { + return syserror.EISDIR + } + } else { + if rp.MustBeDir() || renamed.isDir() { + return syserror.ENOTDIR + } + } } if oldParent == newParent && oldName == newName { @@ -972,27 +1041,47 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } - if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { - vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) - return err + + // Update the remote filesystem. + if !renamed.isSynthetic() { + if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } + } else if replaced != nil && !replaced.isSynthetic() { + // We are replacing an existing real file with a synthetic one, so we + // need to unlink the former. + flags := uint32(0) + if replaced.isDir() { + flags = linux.AT_REMOVEDIR + } + if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { + vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) + return err + } } - if fs.opts.interop != InteropModeShared { - oldParent.cacheNegativeChildLocked(oldName) - oldParent.dirents = nil - newParent.dirents = nil - if renamed.isDir() { - oldParent.decLinks() - newParent.incLinks() + + // Update the dentry tree. + vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) + if replaced != nil { + replaced.setDeleted() + if replaced.isSynthetic() { + newParent.syntheticChildren-- + replaced.decRefLocked() } - oldParent.touchCMtime() - newParent.touchCMtime() - renamed.touchCtime() - } else { - delete(oldParent.children, oldName) + ds = appendDentry(ds, replaced) } + oldParent.cacheNegativeLookupLocked(oldName) + // We don't use newParent.cacheNewChildLocked() since we don't want to mess + // with reference counts and queue oldParent for checkCachingLocked if the + // parent isn't actually changing. if oldParent != newParent { - appendDentry(ds, oldParent) + ds = appendDentry(ds, oldParent) newParent.IncRef() + if renamed.isSynthetic() { + oldParent.syntheticChildren-- + newParent.syntheticChildren++ + } } renamed.parent = newParent renamed.name = newName @@ -1000,11 +1089,25 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa newParent.children = make(map[string]*dentry) } newParent.children[newName] = renamed - if replaced != nil { - replaced.setDeleted() - appendDentry(ds, replaced) + + // Update metadata. + if renamed.cachedMetadataAuthoritative() { + renamed.touchCtime() + } + if oldParent.cachedMetadataAuthoritative() { + oldParent.dirents = nil + oldParent.touchCMtime() + if renamed.isDir() { + oldParent.decLinks() + } + } + if newParent.cachedMetadataAuthoritative() { + newParent.dirents = nil + newParent.touchCMtime() + if renamed.isDir() { + newParent.incLinks() + } } - vfsObj.CommitRenameReplaceDentry(&renamed.vfsd, replacedVFSD) return nil } @@ -1051,6 +1154,10 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu if err != nil { return linux.Statfs{}, err } + // If d is synthetic, invoke statfs on the first ancestor of d that isn't. + for d.isSynthetic() { + d = d.parent + } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err @@ -1080,7 +1187,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err - }) + }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go index 293df2545..8b4e91d17 100644 --- a/pkg/sentry/fsimpl/gofer/gofer.go +++ b/pkg/sentry/fsimpl/gofer/gofer.go @@ -27,8 +27,9 @@ // dentry.handleMu // dentry.dataMu // -// Locking dentry.dirMu in multiple dentries requires holding -// filesystem.renameMu for writing. +// Locking dentry.dirMu in multiple dentries requires that either ancestor +// dentries are locked before descendant dentries, or that filesystem.renameMu +// is locked for writing. package gofer import ( @@ -102,11 +103,12 @@ type filesystem struct { cachedDentries dentryList cachedDentriesLen uint64 - // dentries contains all dentries in this filesystem. specialFileFDs - // contains all open specialFileFDs. These fields are protected by syncMu. - syncMu sync.Mutex - dentries map[*dentry]struct{} - specialFileFDs map[*specialFileFD]struct{} + // syncableDentries contains all dentries in this filesystem for which + // !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs. + // These fields are protected by syncMu. + syncMu sync.Mutex + syncableDentries map[*dentry]struct{} + specialFileFDs map[*specialFileFD]struct{} } type filesystemOptions struct { @@ -187,7 +189,8 @@ const ( // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // - // - The client must verify cached filesystem state before using it. + // - The client must verify ("revalidate") cached filesystem state before + // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. @@ -376,14 +379,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt // Construct the filesystem object. fs := &filesystem{ - mfp: mfp, - opts: fsopts, - uid: creds.EffectiveKUID, - gid: creds.EffectiveKGID, - client: client, - clock: ktime.RealtimeClockFromContext(ctx), - dentries: make(map[*dentry]struct{}), - specialFileFDs: make(map[*specialFileFD]struct{}), + mfp: mfp, + opts: fsopts, + uid: creds.EffectiveKUID, + gid: creds.EffectiveKGID, + client: client, + clock: ktime.RealtimeClockFromContext(ctx), + syncableDentries: make(map[*dentry]struct{}), + specialFileFDs: make(map[*specialFileFD]struct{}), } fs.vfsfs.Init(vfsObj, &fstype, fs) @@ -409,7 +412,7 @@ func (fs *filesystem) Release() { mf := fs.mfp.MemoryFile() fs.syncMu.Lock() - for d := range fs.dentries { + for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() if d.handleWritable { @@ -444,9 +447,11 @@ type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its - // parent, even if disowned. refs is accessed using atomic memory - // operations. When refs reaches 0, the dentry may be added to the cache or - // destroyed. If refs==-1 the dentry has already been destroyed. + // parent, even if disowned. An additional reference is held on all + // synthetic dentries until they are unlinked or invalidated. When refs + // reaches 0, the dentry may be added to the cache or destroyed. If refs == + // -1, the dentry has already been destroyed. refs is accessed using atomic + // memory operations. refs int64 // fs is the owning filesystem. fs is immutable. @@ -465,6 +470,12 @@ type dentry struct { // We don't support hard links, so each dentry maps 1:1 to an inode. // file is the unopened p9.File that backs this dentry. file is immutable. + // + // If file.isNil(), this dentry represents a synthetic file, i.e. a file + // that does not exist on the remote filesystem. As of this writing, this + // is only possible for a directory created with + // MkdirOptions.ForSyntheticMountpoint == true. + // TODO(gvisor.dev/issue/1476): Support synthetic sockets (and pipes). file p9file // If deleted is non-zero, the file represented by this dentry has been @@ -484,15 +495,21 @@ type dentry struct { // - Mappings of child filenames to dentries representing those children. // // - Mappings of child filenames that are known not to exist to nil - // dentries (only if InteropModeShared is not in effect). + // dentries (only if InteropModeShared is not in effect and the directory + // is not synthetic). // // children is protected by dirMu. children map[string]*dentry - // If this dentry represents a directory, InteropModeShared is not in - // effect, and dirents is not nil, it is a cache of all entries in the - // directory, in the order they were returned by the server. dirents is - // protected by dirMu. + // If this dentry represents a directory, syntheticChildren is the number + // of child dentries for which dentry.isSynthetic() == true. + // syntheticChildren is protected by dirMu. + syntheticChildren int + + // If this dentry represents a directory, + // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it + // is a cache of all entries in the directory, in the order they were + // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent // Cached metadata; protected by metadataMu and accessed using atomic @@ -589,6 +606,8 @@ func dentryAttrMask() p9.AttrMask { // initially has no references, but is not cached; it is the caller's // responsibility to set the dentry's reference count and/or call // dentry.checkCachingLocked() as appropriate. +// +// Preconditions: !file.isNil(). func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) { if !mask.Mode { ctx.Warningf("can't create gofer.dentry without file type") @@ -612,10 +631,10 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma }, } d.pf.dentry = d - if mask.UID { + if mask.UID && attr.UID != auth.NoID { d.uid = uint32(attr.UID) } - if mask.GID { + if mask.GID && attr.GID != auth.NoID { d.gid = uint32(attr.GID) } if mask.Size { @@ -642,11 +661,19 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma d.vfsd.Init(d) fs.syncMu.Lock() - fs.dentries[d] = struct{}{} + fs.syncableDentries[d] = struct{}{} fs.syncMu.Unlock() return d, nil } +func (d *dentry) isSynthetic() bool { + return d.file.isNil() +} + +func (d *dentry) cachedMetadataAuthoritative() bool { + return d.fs.opts.interop != InteropModeShared || d.isSynthetic() +} + // updateFromP9Attrs is called to update d's metadata after an update from the // remote filesystem. func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { @@ -691,6 +718,7 @@ func (d *dentry) updateFromP9Attrs(mask p9.AttrMask, attr *p9.Attr) { d.metadataMu.Unlock() } +// Preconditions: !d.isSynthetic() func (d *dentry) updateFromGetattr(ctx context.Context) error { // Use d.handle.file, which represents a 9P fid that has been opened, in // preference to d.file, which represents a 9P fid that has not. This may @@ -758,7 +786,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin defer mnt.EndWrite() setLocalAtime := false setLocalMtime := false - if d.fs.opts.interop != InteropModeShared { + if d.cachedMetadataAuthoritative() { // Timestamp updates will be handled locally. setLocalAtime = stat.Mask&linux.STATX_ATIME != 0 setLocalMtime = stat.Mask&linux.STATX_MTIME != 0 @@ -771,35 +799,37 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin } d.metadataMu.Lock() defer d.metadataMu.Unlock() - if stat.Mask != 0 { - if err := d.file.setAttr(ctx, p9.SetAttrMask{ - Permissions: stat.Mask&linux.STATX_MODE != 0, - UID: stat.Mask&linux.STATX_UID != 0, - GID: stat.Mask&linux.STATX_GID != 0, - Size: stat.Mask&linux.STATX_SIZE != 0, - ATime: stat.Mask&linux.STATX_ATIME != 0, - MTime: stat.Mask&linux.STATX_MTIME != 0, - ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, - MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, - }, p9.SetAttr{ - Permissions: p9.FileMode(stat.Mode), - UID: p9.UID(stat.UID), - GID: p9.GID(stat.GID), - Size: stat.Size, - ATimeSeconds: uint64(stat.Atime.Sec), - ATimeNanoSeconds: uint64(stat.Atime.Nsec), - MTimeSeconds: uint64(stat.Mtime.Sec), - MTimeNanoSeconds: uint64(stat.Mtime.Nsec), - }); err != nil { - return err + if !d.isSynthetic() { + if stat.Mask != 0 { + if err := d.file.setAttr(ctx, p9.SetAttrMask{ + Permissions: stat.Mask&linux.STATX_MODE != 0, + UID: stat.Mask&linux.STATX_UID != 0, + GID: stat.Mask&linux.STATX_GID != 0, + Size: stat.Mask&linux.STATX_SIZE != 0, + ATime: stat.Mask&linux.STATX_ATIME != 0, + MTime: stat.Mask&linux.STATX_MTIME != 0, + ATimeNotSystemTime: stat.Atime.Nsec != linux.UTIME_NOW, + MTimeNotSystemTime: stat.Mtime.Nsec != linux.UTIME_NOW, + }, p9.SetAttr{ + Permissions: p9.FileMode(stat.Mode), + UID: p9.UID(stat.UID), + GID: p9.GID(stat.GID), + Size: stat.Size, + ATimeSeconds: uint64(stat.Atime.Sec), + ATimeNanoSeconds: uint64(stat.Atime.Nsec), + MTimeSeconds: uint64(stat.Mtime.Sec), + MTimeNanoSeconds: uint64(stat.Mtime.Nsec), + }); err != nil { + return err + } + } + if d.fs.opts.interop == InteropModeShared { + // There's no point to updating d's metadata in this case since + // it'll be overwritten by revalidation before the next time it's + // used anyway. (InteropModeShared inhibits client caching of + // regular file data, so there's no cache to truncate either.) + return nil } - } - if d.fs.opts.interop == InteropModeShared { - // There's no point to updating d's metadata in this case since it'll - // be overwritten by revalidation before the next time it's used - // anyway. (InteropModeShared inhibits client caching of regular file - // data, so there's no cache to truncate either.) - return nil } now := d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_MODE != 0 { @@ -897,6 +927,15 @@ func (d *dentry) DecRef() { } } +// decRefLocked decrements d's reference count without calling +// d.checkCachingLocked, even if d's reference count reaches 0; callers are +// responsible for ensuring that d.checkCachingLocked will be called later. +func (d *dentry) decRefLocked() { + if refs := atomic.AddInt64(&d.refs, -1); refs < 0 { + panic("gofer.dentry.decRefLocked() called without holding a reference") + } +} + // checkCachingLocked should be called after d's reference count becomes 0 or it // becomes disowned. // @@ -1013,11 +1052,11 @@ func (d *dentry) destroyLocked() { if !d.file.isNil() { d.file.close(ctx) d.file = p9file{} + // Remove d from the set of syncable dentries. + d.fs.syncMu.Lock() + delete(d.fs.syncableDentries, d) + d.fs.syncMu.Unlock() } - // Remove d from the set of all dentries. - d.fs.syncMu.Lock() - delete(d.fs.dentries, d) - d.fs.syncMu.Unlock() // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if d.parent != nil { @@ -1040,6 +1079,9 @@ func (d *dentry) setDeleted() { // We only support xattrs prefixed with "user." (see b/148380782). Currently, // there is no need to expose any other xattrs through a gofer. func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { + if d.file.isNil() { + return nil, nil + } xattrMap, err := d.file.listXattr(ctx, size) if err != nil { return nil, err @@ -1054,6 +1096,9 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui } func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) { + if d.file.isNil() { + return "", syserror.ENODATA + } if err := d.checkPermissions(creds, vfs.MayRead); err != nil { return "", err } @@ -1064,6 +1109,9 @@ func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error { + if d.file.isNil() { + return syserror.EPERM + } if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { return err } @@ -1074,6 +1122,9 @@ func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vf } func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error { + if d.file.isNil() { + return syserror.EPERM + } if err := d.checkPermissions(creds, vfs.MayWrite); err != nil { return err } @@ -1083,7 +1134,7 @@ func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name return d.file.removeXattr(ctx, name) } -// Preconditions: d.isRegularFile() || d.isDirectory(). +// Preconditions: !d.file.isNil(). d.isRegularFile() || d.isDirectory(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). @@ -1213,7 +1264,7 @@ func (fd *fileDescription) dentry() *dentry { func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) - if d.fs.opts.interop == InteropModeShared && opts.Mask&(validMask) != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { + if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go index 4041fb252..adff39490 100644 --- a/pkg/sentry/fsimpl/gofer/gofer_test.go +++ b/pkg/sentry/fsimpl/gofer/gofer_test.go @@ -24,7 +24,7 @@ import ( func TestDestroyIdempotent(t *testing.T) { fs := filesystem{ - dentries: make(map[*dentry]struct{}), + syncableDentries: make(map[*dentry]struct{}), opts: filesystemOptions{ // Test relies on no dentry being held in the cache. maxCachedDentries: 0, diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index 74577bc2f..20e5bb072 100644 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go @@ -443,8 +443,7 @@ type FilesystemImpl interface { // Errors: // // - If extended attributes are not supported by the filesystem, - // ListxattrAt returns nil. (See FileDescription.Listxattr for an - // explanation.) + // ListxattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned. Note that diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index 534528ce6..022bac127 100644 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go @@ -33,6 +33,25 @@ type GetDentryOptions struct { type MkdirOptions struct { // Mode is the file mode bits for the created directory. Mode linux.FileMode + + // If ForSyntheticMountpoint is true, FilesystemImpl.MkdirAt() may create + // the given directory in memory only (as opposed to persistent storage). + // The created directory should be able to support the creation of + // subdirectories with ForSyntheticMountpoint == true. It does not need to + // support the creation of subdirectories with ForSyntheticMountpoint == + // false, or files of other types. + // + // FilesystemImpls are permitted to ignore the ForSyntheticMountpoint + // option. + // + // The ForSyntheticMountpoint option exists because, unlike mount(2), the + // OCI Runtime Specification permits the specification of mount points that + // do not exist, under the expectation that container runtimes will create + // them. (More accurately, the OCI Runtime Specification completely fails + // to document this feature, but it's implemented by runc.) + // ForSyntheticMountpoint allows such mount points to be created even when + // the underlying persistent filesystem is immutable. + ForSyntheticMountpoint bool } // MknodOptions contains options to VirtualFilesystem.MknodAt() and diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index 82083c57d..bce3a3593 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -251,6 +251,12 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay + if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{ + ForSyntheticMountpoint: true, + }); err != nil && err != syserror.EEXIST { + // Log a warning, but attempt the mount anyway. + log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err) + } if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } -- cgit v1.2.3 From 15a822a1936e295cb6418df7ddf445d8500dfb2e Mon Sep 17 00:00:00 2001 From: Zach Koopmans Date: Fri, 24 Apr 2020 18:22:21 -0700 Subject: VFS2: Get HelloWorld image tests to pass with VFS2 This change includes: - Modifications to loader_test.go to get TestCreateMountNamespace to pass with VFS2. - Changes necessary to get TestHelloWorld in image tests to pass with VFS2. This means runsc can run the hello-world container with docker on VSF2. Note: Containers that use sockets will not run with these changes. See "//test/image/...". Any tests here with sockets currently fail (which is all of them but HelloWorld). PiperOrigin-RevId: 308363072 --- pkg/sentry/fsimpl/gofer/directory.go | 1 + runsc/boot/BUILD | 2 + runsc/boot/loader.go | 13 +-- runsc/boot/loader_test.go | 152 ++++++++++++++++++++++++----------- runsc/boot/vfs.go | 78 ++++++++++++++---- scripts/docker_tests.sh | 3 + 6 files changed, 183 insertions(+), 66 deletions(-) (limited to 'runsc/boot/vfs.go') diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go index c67766ab2..55f9ed911 100644 --- a/pkg/sentry/fsimpl/gofer/directory.go +++ b/pkg/sentry/fsimpl/gofer/directory.go @@ -75,6 +75,7 @@ func (d *dentry) createSyntheticDirectoryLocked(name string, mode linux.FileMode handle: handle{ fd: -1, }, + nlink: uint32(2), } d2.pf.dentry = d2 d2.vfsd.Init(d2) diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index 69dcc74f2..ed3c8f546 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -119,11 +119,13 @@ go_test( library = ":boot", deps = [ "//pkg/control/server", + "//pkg/fspath", "//pkg/log", "//pkg/p9", "//pkg/sentry/contexttest", "//pkg/sentry/fs", "//pkg/sentry/kernel", + "//pkg/sentry/vfs", "//pkg/sync", "//pkg/unet", "//runsc/fsgofer", diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 3f41d8357..f6ea4c102 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -625,11 +625,14 @@ func (l *Loader) run() error { // l.stdioFDs are derived from dup() in boot.New() and they are now dup()ed again // either in createFDTable() during initial start or in descriptor.initAfterLoad() - // during restore, we can release l.stdioFDs now. - for _, fd := range l.stdioFDs { - err := syscall.Close(fd) - if err != nil { - return fmt.Errorf("close dup()ed stdioFDs: %v", err) + // during restore, we can release l.stdioFDs now. VFS2 takes ownership of the + // passed FDs, so only close for VFS1. + if !kernel.VFS2Enabled { + for _, fd := range l.stdioFDs { + err := syscall.Close(fd) + if err != nil { + return fmt.Errorf("close dup()ed stdioFDs: %v", err) + } } } diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index e7c71734f..55d27a632 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -26,11 +26,13 @@ import ( specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/control/server" + "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/contexttest" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" "gvisor.dev/gvisor/runsc/fsgofer" @@ -107,14 +109,12 @@ func startGofer(root string) (int, func(), error) { return sandboxEnd, cleanup, nil } -func createLoader(vfsEnabled bool) (*Loader, func(), error) { +func createLoader(vfsEnabled bool, spec *specs.Spec) (*Loader, func(), error) { fd, err := server.CreateSocket(ControlSocketAddr(fmt.Sprintf("%010d", rand.Int())[:10])) if err != nil { return nil, nil, err } conf := testConfig() - spec := testSpec() - conf.VFS2 = vfsEnabled sandEnd, cleanup, err := startGofer(spec.Root.Path) @@ -161,7 +161,7 @@ func TestRunVFS2(t *testing.T) { } func doRun(t *testing.T, vfsEnabled bool) { - l, cleanup, err := createLoader(vfsEnabled) + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } @@ -210,7 +210,7 @@ func TestStartSignalVFS2(t *testing.T) { } func doStartSignal(t *testing.T, vfsEnabled bool) { - l, cleanup, err := createLoader(vfsEnabled) + l, cleanup, err := createLoader(vfsEnabled, testSpec()) if err != nil { t.Fatalf("error creating loader: %v", err) } @@ -258,18 +258,19 @@ func doStartSignal(t *testing.T, vfsEnabled bool) { } -// Test that MountNamespace can be created with various specs. -func TestCreateMountNamespace(t *testing.T) { - testCases := []struct { - name string - // Spec that will be used to create the mount manager. Note - // that we can't mount procfs without a kernel, so each spec - // MUST contain something other than procfs mounted at /proc. - spec specs.Spec - // Paths that are expected to exist in the resulting fs. - expectedPaths []string - }{ - { +type CreateMountTestcase struct { + name string + // Spec that will be used to create the mount manager. Note + // that we can't mount procfs without a kernel, so each spec + // MUST contain something other than procfs mounted at /proc. + spec specs.Spec + // Paths that are expected to exist in the resulting fs. + expectedPaths []string +} + +func createMountTestcases(vfs2 bool) []*CreateMountTestcase { + testCases := []*CreateMountTestcase{ + &CreateMountTestcase{ // Only proc. name: "only proc mount", spec: specs.Spec{ @@ -311,7 +312,7 @@ func TestCreateMountNamespace(t *testing.T) { // /dev, and /sys. expectedPaths: []string{"/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ // Mounts are nested inside each other. name: "nested mounts", spec: specs.Spec{ @@ -355,7 +356,7 @@ func TestCreateMountNamespace(t *testing.T) { expectedPaths: []string{"/foo", "/foo/bar", "/foo/bar/baz", "/foo/qux", "/foo/qux-quz", "/foo/some/very/very/deep/path", "/proc", "/dev", "/sys"}, }, - { + &CreateMountTestcase{ name: "mount inside /dev", spec: specs.Spec{ Root: &specs.Root{ @@ -398,40 +399,47 @@ func TestCreateMountNamespace(t *testing.T) { }, expectedPaths: []string{"/proc", "/dev", "/dev/fd-foo", "/dev/foo", "/dev/bar", "/sys"}, }, - { - name: "mounts inside mandatory mounts", - spec: specs.Spec{ - Root: &specs.Root{ - Path: os.TempDir(), - Readonly: true, + } + + vfsCase := &CreateMountTestcase{ + name: "mounts inside mandatory mounts", + spec: specs.Spec{ + Root: &specs.Root{ + Path: os.TempDir(), + Readonly: true, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "tmpfs", }, - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "tmpfs", - }, - // We don't include /sys, and /tmp in - // the spec, since they will be added - // automatically. - // - // Instead, add submounts inside these - // directories and make sure they are - // visible under the mandatory mounts. - { - Destination: "/sys/bar", - Type: "tmpfs", - }, - { - Destination: "/tmp/baz", - Type: "tmpfs", - }, + // TODO (gvisor.dev/issue/1487): Re-add this case when sysfs supports + // MkDirAt in VFS2 (and remove the reduntant append). + // { + // Destination: "/sys/bar", + // Type: "tmpfs", + // }, + // + { + Destination: "/tmp/baz", + Type: "tmpfs", }, }, - expectedPaths: []string{"/proc", "/sys", "/sys/bar", "/tmp", "/tmp/baz"}, }, + expectedPaths: []string{"/proc", "/sys" /* "/sys/bar" ,*/, "/tmp", "/tmp/baz"}, } - for _, tc := range testCases { + if !vfs2 { + vfsCase.spec.Mounts = append(vfsCase.spec.Mounts, specs.Mount{Destination: "/sys/bar", Type: "tmpfs"}) + vfsCase.expectedPaths = append(vfsCase.expectedPaths, "/sys/bar") + } + return append(testCases, vfsCase) +} + +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespace(t *testing.T) { + + for _, tc := range createMountTestcases(false /* vfs2 */) { t.Run(tc.name, func(t *testing.T) { conf := testConfig() ctx := contexttest.Context(t) @@ -466,6 +474,56 @@ func TestCreateMountNamespace(t *testing.T) { } } +// Test that MountNamespace can be created with various specs. +func TestCreateMountNamespaceVFS2(t *testing.T) { + + for _, tc := range createMountTestcases(true /* vfs2 */) { + t.Run(tc.name, func(t *testing.T) { + defer resetSyscallTable() + + spec := testSpec() + spec.Mounts = tc.spec.Mounts + spec.Root = tc.spec.Root + + l, loaderCleanup, err := createLoader(true /* VFS2 Enabled */, spec) + if err != nil { + t.Fatalf("failed to create loader: %v", err) + } + defer l.Destroy() + defer loaderCleanup() + + mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints) + if err := mntr.processHints(l.conf); err != nil { + t.Fatalf("failed process hints: %v", err) + } + + ctx := l.rootProcArgs.NewContext(l.k) + mns, err := mntr.setupVFS2(ctx, l.conf, &l.rootProcArgs) + if err != nil { + t.Fatalf("failed to setupVFS2: %v", err) + } + + root := mns.Root() + defer root.DecRef() + for _, p := range tc.expectedPaths { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(p), + } + + if d, err := l.k.VFS().GetDentryAt(ctx, l.rootProcArgs.Credentials, target, &vfs.GetDentryOptions{}); err != nil { + t.Errorf("expected path %v to exist with spec %v, but got error %v", p, tc.spec, err) + } else { + d.DecRef() + } + + } + }) + } +} + // TestRestoreEnvironment tests that the correct mounts are collected from the spec and config // in order to build the environment for restoring. func TestRestoreEnvironment(t *testing.T) { diff --git a/runsc/boot/vfs.go b/runsc/boot/vfs.go index bce3a3593..0b9b0b436 100644 --- a/runsc/boot/vfs.go +++ b/runsc/boot/vfs.go @@ -17,6 +17,7 @@ package boot import ( "fmt" "path" + "sort" "strconv" "strings" @@ -192,14 +193,9 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs return nil, fmt.Errorf("register filesystems: %w", err) } - fd := c.fds.remove() - - opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") - - log.Infof("Mounting root over 9P, ioFD: %d", fd) - mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + mns, err := c.createMountNamespaceVFS2(ctx, conf, creds) if err != nil { - return nil, fmt.Errorf("setting up mountnamespace: %w", err) + return nil, fmt.Errorf("creating mount namespace: %w", err) } rootProcArgs.MountNamespaceVFS2 = mns @@ -212,8 +208,23 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs return mns, nil } +func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) { + + fd := c.fds.remove() + opts := strings.Join(p9MountOptionsVFS2(fd, conf.FileAccess), ",") + + log.Infof("Mounting root over 9P, ioFD: %d", fd) + mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", rootFsName, &vfs.GetFilesystemOptions{Data: opts}) + if err != nil { + return nil, fmt.Errorf("setting up mount namespace: %w", err) + } + return mns, nil +} + func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials) error { + c.prepareMountsVFS2() + for _, submount := range c.mounts { log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options) if err := c.mountSubmountVFS2(ctx, conf, mns, creds, &submount); err != nil { @@ -226,6 +237,11 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config, return c.checkDispenser() } +func (c *containerMounter) prepareMountsVFS2() { + // Sort the mounts so that we don't place children before parents. + sort.Slice(c.mounts, func(i, j int) bool { return len(c.mounts[i].Destination) < len(c.mounts[j].Destination) }) +} + // TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1 version. func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *specs.Mount) error { root := mns.Root() @@ -236,11 +252,21 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, Path: fspath.Parse(submount.Destination), } - _, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) + fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, *submount) if err != nil { return fmt.Errorf("mountOptions failed: %w", err) } + if fsName == "" { + // Filesystem is not supported (e.g. cgroup), just skip it. + return nil + } + + if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil { + return err + } + log.Debugf("directory exists or made directory for submount: %s", submount.Destination) + opts := &vfs.MountOptions{ GetFilesystemOptions: vfs.GetFilesystemOptions{ Data: strings.Join(options, ","), @@ -251,12 +277,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, // All writes go to upper, be paranoid and make lower readonly. opts.ReadOnly = useOverlay - if err := c.k.VFS().MkdirAt(ctx, creds, target, &vfs.MkdirOptions{ - ForSyntheticMountpoint: true, - }); err != nil && err != syserror.EEXIST { - // Log a warning, but attempt the mount anyway. - log.Warningf("Failed to create mount point at %q: %v", submount.Destination, err) - } if err := c.k.VFS().MountAt(ctx, creds, "", target, submount.Type, opts); err != nil { return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts) } @@ -314,3 +334,33 @@ func p9MountOptionsVFS2(fd int, fa FileAccessType) []string { } return opts } + +func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error { + + target := &vfs.PathOperation{ + Root: root, + Start: root, + Path: fspath.Parse(currentPath), + } + + _, err := c.k.VFS().StatAt(ctx, creds, target, &vfs.StatOptions{}) + switch { + + case err == syserror.ENOENT: + if err := c.makeSyntheticMount(ctx, path.Dir(currentPath), root, creds); err != nil { + return err + } + + mkdirOpts := &vfs.MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} + if err := c.k.VFS().MkdirAt(ctx, creds, target, mkdirOpts); err != nil { + return fmt.Errorf("failed to makedir for mount %+v: %w", target, err) + } + return nil + + case err != nil: + return fmt.Errorf("stat failed for mount %+v: %w", target, err) + + default: + return nil + } +} diff --git a/scripts/docker_tests.sh b/scripts/docker_tests.sh index 931ce1aa4..dce0a4085 100755 --- a/scripts/docker_tests.sh +++ b/scripts/docker_tests.sh @@ -20,3 +20,6 @@ make load-all-images install_runsc_for_test docker test_runsc //test/image:image_test //test/e2e:integration_test + +install_runsc_for_test docker --vfs2 +test_runsc //test/image:image_test --test_filter=.*TestHelloWorld -- cgit v1.2.3