diff options
Diffstat (limited to 'pkg/sentry/fs/gofer/inode.go')
-rw-r--r-- | pkg/sentry/fs/gofer/inode.go | 606 |
1 files changed, 606 insertions, 0 deletions
diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go new file mode 100644 index 000000000..dcb3b2880 --- /dev/null +++ b/pkg/sentry/fs/gofer/inode.go @@ -0,0 +1,606 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "errors" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations. +// +// +stateify savable +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + + // fileState implements fsutil.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachingInodeOps implements memmap.Mappable for inodeOperations. 
+ cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects readdirCache and concurrent Readdirs. + readdirMu sync.Mutex `state:"nosave"` + + // readdirCache is a cache of readdir results in the form of + // a fs.SortedDentryMap. + // + // Starts out as nil, and is initialized under readdirMu lazily; + // invalidating the cache means setting it to nil. + readdirCache *fs.SortedDentryMap `state:"nosave"` +} + +// inodeFileState implements fsutil.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fsutil.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +// +// +stateify savable +type inodeFileState struct { + // s is common file system state for Gofers. + s *session `state:"wait"` + + // MultiDeviceKey consists of: + // + // * Device: file system device from a specific gofer. + // * SecondaryDevice: unique identifier of the attach point. + // * Inode: the inode of this resource, unique per Device. + // + // These fields combined enable consistent hashing of virtual inodes + // on goferDevice. + key device.MultiDeviceKey `state:"nosave"` + + // file is the p9 file that contains a single unopened fid. + file contextFile `state:"nosave"` + + // sattr caches the stable attributes. + sattr fs.StableAttr `state:"wait"` + + // handlesMu protects the below fields. + handlesMu sync.RWMutex `state:"nosave"` + + // If readHandles is non-nil, it holds handles that are either read-only or + // read/write. If writeHandles is non-nil, it holds write-only handles if + // writeHandlesRW is false, and read/write handles if writeHandlesRW is + // true. 
+ // + // Once readHandles becomes non-nil, it can't be changed until + // inodeFileState.Release(), because of a defect in the + // fsutil.CachedFileObject interface: there's no way for the caller of + // fsutil.CachedFileObject.FD() to keep the returned FD open, so if we + // racily replace readHandles after inodeFileState.FD() has returned + // readHandles.Host.FD(), fsutil.CachingInodeOperations may use a closed + // FD. writeHandles can be changed if writeHandlesRW is false, since + // inodeFileState.FD() can't return a write-only FD, but can't be changed + // if writeHandlesRW is true for the same reason. + readHandles *handles `state:"nosave"` + writeHandles *handles `state:"nosave"` + writeHandlesRW bool `state:"nosave"` + + // loading is acquired when the inodeFileState begins an asynchronous + // load. It releases when the load is complete. Callers that require all + // state to be available should call waitForLoad() to ensure that. + loading sync.Mutex `state:".(struct{})"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr + + // hostMappable is created when using 'cacheRemoteRevalidating' to map pages + // directly from host. + hostMappable *fsutil.HostMappable +} + +// Release releases file handles. 
+func (i *inodeFileState) Release(ctx context.Context) { + i.file.close(ctx) + if i.readHandles != nil { + i.readHandles.DecRef() + } + if i.writeHandles != nil { + i.writeHandles.DecRef() + } +} + +func (i *inodeFileState) canShareHandles() bool { + // Only share handles for regular files, since for other file types, + // distinct handles may have special semantics even if they represent the + // same file. Disable handle sharing for cache policy cacheNone, since this + // is legacy behavior. + return fs.IsFile(i.sattr) && i.s.cachePolicy != cacheNone +} + +// Preconditions: i.handlesMu must be locked for writing. +func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles) { + if flags.Read && i.readHandles == nil { + h.IncRef() + i.readHandles = h + } + if flags.Write { + if i.writeHandles == nil { + h.IncRef() + i.writeHandles = h + i.writeHandlesRW = flags.Read + } else if !i.writeHandlesRW && flags.Read { + // Upgrade i.writeHandles. + i.writeHandles.DecRef() + h.IncRef() + i.writeHandles = h + i.writeHandlesRW = flags.Read + } + } +} + +// getHandles returns a set of handles for a new file using i opened with the +// given flags. +func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags) (*handles, error) { + if !i.canShareHandles() { + return newHandles(ctx, i.file, flags) + } + i.handlesMu.Lock() + defer i.handlesMu.Unlock() + // Do we already have usable shared handles? + if flags.Write { + if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) { + i.writeHandles.IncRef() + return i.writeHandles, nil + } + } else if i.readHandles != nil { + i.readHandles.IncRef() + return i.readHandles, nil + } + // No; get new handles and cache them for future sharing. + h, err := newHandles(ctx, i.file, flags) + if err != nil { + return nil, err + } + i.setSharedHandlesLocked(flags, h) + return h, nil +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. 
+func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. +func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if i.skipSetAttr(mask) { + return nil + } + as, ans := attr.AccessTime.Unix() + ms, mns := attr.ModificationTime.Unix() + // An update of status change time is implied by mask.AccessTime + // or mask.ModificationTime. Updating status change time to a + // time earlier than the system time is not possible. + return i.file.setAttr( + ctx, + p9.SetAttrMask{ + Permissions: mask.Perms, + Size: mask.Size, + UID: mask.UID, + GID: mask.GID, + ATime: mask.AccessTime, + ATimeNotSystemTime: true, + MTime: mask.ModificationTime, + MTimeNotSystemTime: true, + }, p9.SetAttr{ + Permissions: p9.FileMode(attr.Perms.LinuxMode()), + UID: p9.UID(attr.Owner.UID), + GID: p9.GID(attr.Owner.GID), + Size: uint64(attr.Size), + ATimeSeconds: uint64(as), + ATimeNanoSeconds: uint64(ans), + MTimeSeconds: uint64(ms), + MTimeNanoSeconds: uint64(mns), + }) +} + +// skipSetAttr checks if attribute change can be skipped. 
It can be skipped +// when: +// - Mask is empty +// - Mask contains only attributes that cannot be set in the gofer +// - Mask contains only atime and/or mtime, and host FD exists +// +// Updates to atime and mtime can be skipped because cached value will be +// "close enough" to host value, given that operation went directly to host FD. +// Skipping atime updates is particularly important to reduce the number of +// operations sent to the Gofer for readonly files. +func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool { + // First remove attributes that cannot be updated. + cpy := mask + cpy.Type = false + cpy.DeviceID = false + cpy.InodeID = false + cpy.BlockSize = false + cpy.Usage = false + cpy.Links = false + if cpy.Empty() { + return true + } + + // Then check if more than just atime and mtime is being set. + cpy.AccessTime = false + cpy.ModificationTime = false + if !cpy.Empty() { + return false + } + + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return (i.readHandles != nil && i.readHandles.Host != nil) || + (i.writeHandles != nil && i.writeHandles.Host != nil) +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + if i.writeHandles == nil { + return nil + } + return i.writeHandles.File.fsync(ctx) +} + +// FD implements fsutil.CachedFileObject.FD. +func (i *inodeFileState) FD() int { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + if i.writeHandlesRW && i.writeHandles != nil && i.writeHandles.Host != nil { + return int(i.writeHandles.Host.FD()) + } + if i.readHandles != nil && i.readHandles.Host != nil { + return int(i.readHandles.Host.FD()) + } + return -1 +} + +// waitForLoad makes sure any restore-issued loading is done. +func (i *inodeFileState) waitForLoad() { + // This is not a no-op. The loading mutex is hold upon restore until + // all loading actions are done. 
+ i.loading.Lock() + i.loading.Unlock() +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + _, valid, pattr, err := getattr(ctx, i.file) + if err != nil { + return fs.UnstableAttr{}, err + } + return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil +} + +func (i *inodeFileState) Allocate(ctx context.Context, offset, length int64) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.writeHandles.File.allocate(ctx, mode, uint64(offset), uint64(length)) +} + +// session extracts the gofer's session from the MountSource. +func (i *inodeOperations) session() *session { + return i.fileState.s +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(ctx context.Context) { + i.cachingInodeOps.Release() + + // Releasing the fileState may make RPCs to the gofer. There is + // no need to wait for those to return, so we can do this + // asynchronously. + // + // We use AsyncWithContext to avoid needing to allocate an extra + // anonymous function on the heap. + fs.AsyncWithContext(ctx, i.fileState.Release) +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if i.session().cachePolicy.useCachingInodeOps(inode) { + return i.cachingInodeOps + } + // This check is necessary because it's returning an interface type. + if i.fileState.hostMappable != nil { + return i.fileState.hostMappable + } + return nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + if i.session().cachePolicy.cacheUAttrs(inode) { + return i.cachingInodeOps.UnstableAttr(ctx, inode) + } + return i.fileState.unstableAttr(ctx) +} + +// Check implements fs.InodeOperations.Check. 
+func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + switch d.Inode.StableAttr.Type { + case fs.Socket: + return i.getFileSocket(ctx, d, flags) + case fs.Pipe: + return i.getFilePipe(ctx, d, flags) + default: + return i.getFileDefault(ctx, d, flags) + } +} + +func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + f, err := i.fileState.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, syscall.EIO + } + fsf, err := host.NewSocketWithDirent(ctx, d, f, flags) + if err != nil { + f.Close() + return nil, err + } + return fsf, nil +} + +func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // Try to open as a host pipe; if that doesn't work, handle it normally. + pipeOps, err := fdpipe.Open(ctx, i, flags) + if err == errNotHostFile { + return i.getFileDefault(ctx, d, flags) + } + if err != nil { + return nil, err + } + return fs.NewFile(ctx, d, flags, pipeOps), nil +} + +// errNotHostFile indicates that the file is not a host file. +var errNotHostFile = errors.New("not a host file") + +// NonBlockingOpen implements fdpipe.NonBlockingOpener for opening host named pipes. +func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*fd.FD, error) { + i.fileState.waitForLoad() + + // Get a cloned fid which we will open. 
+ _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + log.Warningf("Open Walk failed: %v", err) + return nil, err + } + defer newFile.close(ctx) + + flags, err := openFlagsFromPerms(p) + if err != nil { + log.Warningf("Open flags %s parsing failed: %v", p, err) + return nil, err + } + hostFile, _, _, err := newFile.open(ctx, flags) + // If the host file returned is nil and the error is nil, + // then this was never a host file to begin with, and should + // be treated like a remote file. + if hostFile == nil && err == nil { + return nil, errNotHostFile + } + return hostFile, err +} + +func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + h, err := i.fileState.getHandles(ctx, flags) + if err != nil { + return nil, err + } + return NewFile(ctx, d, d.BaseName(), flags, i, h), nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + if i.session().cachePolicy.cacheUAttrs(inode) { + return i.cachingInodeOps.SetPermissions(ctx, inode, p) + } + + mask := p9.SetAttrMask{Permissions: true} + pattr := p9.SetAttr{Permissions: p9.FileMode(p.LinuxMode())} + // Execute the chmod. + return i.fileState.file.setAttr(ctx, mask, pattr) == nil +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + // Save the roundtrip. 
+ if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + if i.session().cachePolicy.cacheUAttrs(inode) { + return i.cachingInodeOps.SetOwner(ctx, inode, owner) + } + + var mask p9.SetAttrMask + var attr p9.SetAttr + if owner.UID.Ok() { + mask.UID = true + attr.UID = p9.UID(owner.UID) + } + if owner.GID.Ok() { + mask.GID = true + attr.GID = p9.GID(owner.GID) + } + return i.fileState.file.setAttr(ctx, mask, attr) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if i.session().cachePolicy.cacheUAttrs(inode) { + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) + } + + return utimes(ctx, i.fileState.file, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { + // This can only be called for files anyway. + if i.session().cachePolicy.useCachingInodeOps(inode) { + return i.cachingInodeOps.Truncate(ctx, inode, length) + } + if i.session().cachePolicy == cacheRemoteRevalidating { + return i.fileState.hostMappable.Truncate(ctx, length) + } + + return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) +} + +// Allocate implements fs.InodeOperations.Allocate. +func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { + // This can only be called for files anyway. + if i.session().cachePolicy.useCachingInodeOps(inode) { + return i.cachingInodeOps.Allocate(ctx, offset, length) + } + if i.session().cachePolicy == cacheRemoteRevalidating { + return i.fileState.hostMappable.Allocate(ctx, offset, length) + } + + // No options are supported for now. + mode := p9.AllocateMode{} + return i.fileState.file.allocate(ctx, mode, uint64(offset), uint64(length)) +} + +// WriteOut implements fs.InodeOperations.WriteOut. 
+func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + if !i.session().cachePolicy.cacheUAttrs(inode) { + return nil + } + + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !fs.IsSymlink(inode.StableAttr) { + return "", syscall.ENOLINK + } + return i.fileState.file.readlink(ctx) +} + +// Getlink implementfs fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS makes a StatFS request. +func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + fsstat, err := i.fileState.file.statFS(ctx) + if err != nil { + return fs.Info{}, err + } + + info := fs.Info{ + // This is primarily for distinguishing a gofer file system in + // tests. Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + TotalBlocks: fsstat.Blocks, + FreeBlocks: fsstat.BlocksFree, + TotalFiles: fsstat.Files, + FreeFiles: fsstat.FilesFree, + } + + // If blocks available is non-zero, prefer that. + if fsstat.BlocksAvailable != 0 { + info.FreeBlocks = fsstat.BlocksAvailable + } + + return info, nil +} + +func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) error { + if i.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) { + return fsutil.GenericConfigureMMap(file, i.cachingInodeOps, opts) + } + if i.fileState.hostMappable != nil { + return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts) + } + return syserror.ENODEV +} + +func init() { + syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) { + if _, ok := err.(p9.ErrSocket); ok { + // Treat as an I/O error. 
+ return syscall.EIO, true + } + return 0, false + }) +} + +// AddLink implements InodeOperations.AddLink, but is currently a noop. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (*inodeOperations) AddLink() {} + +// DropLink implements InodeOperations.DropLink, but is currently a noop. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (*inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME(b/63117438): Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} |