diff options
Diffstat (limited to 'pkg/sentry/fs/inode.go')
-rw-r--r-- | pkg/sentry/fs/inode.go | 455 |
1 files changed, 455 insertions, 0 deletions
diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go new file mode 100644 index 000000000..b624f4182 --- /dev/null +++ b/pkg/sentry/fs/inode.go @@ -0,0 +1,455 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Inode is a file system object that can be simulatenously referenced by different +// components of the VFS (Dirent, fs.File, etc). +type Inode struct { + // AtomicRefCount is our reference count. + refs.AtomicRefCount + + // InodeOperations is the file system specific behavior of the Inode. + InodeOperations InodeOperations + + // StableAttr are stable cached attributes of the Inode. + StableAttr StableAttr + + // LockCtx is the file lock context. It manages its own sychronization and tracks + // regions of the Inode that have locks held. + LockCtx LockCtx + + // Watches is the set of inotify watches for this inode. + Watches *Watches + + // MountSource is the mount source this Inode is a part of. + MountSource *MountSource + + // overlay is the overlay entry for this Inode. + overlay *overlayEntry +} + +// LockCtx is an Inode's lock context and contains different personalities of locks; both +// Posix and BSD style locks are supported. +// +// Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and +// deadlock conditions make merging them prohibitive. We do the same and keep them oblivious +// to each other but provide a "context" as a convenient container. +type LockCtx struct { + // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). + Posix lock.Locks + + // BSD is a set of BSD-style advisory file wide locks, see flock(2). + BSD lock.Locks +} + +// NewInode constructs an Inode from InodeOperations, a MountSource, and stable attributes. +// +// NewInode takes a reference on msrc. +func NewInode(iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { + msrc.IncRef() + return &Inode{ + InodeOperations: iops, + StableAttr: sattr, + Watches: newWatches(), + MountSource: msrc, + } +} + +// DecRef drops a reference on the Inode. +func (i *Inode) DecRef() { + i.DecRefWithDestructor(i.destroy) +} + +// destroy releases the Inode and releases the msrc reference taken. +func (i *Inode) destroy() { + // FIXME: Context is not plumbed here. + ctx := context.Background() + if err := i.WriteOut(ctx); err != nil { + // FIXME: Mark as warning again once noatime is + // properly supported. + log.Debugf("Inode %+v, failed to sync all metadata: %v", i.StableAttr, err) + } + + // If this inode is being destroyed because it was unlinked, queue a + // deletion event. This may not be the case for inodes being revalidated. + if i.Watches.unlinked { + i.Watches.Notify("", linux.IN_DELETE_SELF, 0) + } + + // Remove references from the watch owners to the watches on this inode, + // since the watches are about to be GCed. Note that we don't need to worry + // about the watch pins since if there were any active pins, this inode + // wouldn't be in the destructor. + i.Watches.targetDestroyed() + + // Overlay resources should be released synchronously, since they may + // trigger more Inode.destroy calls which must themselves be handled + // synchronously, like the WriteOut call above. + if i.overlay != nil { + i.overlay.release() + i.MountSource.DecRef() + return + } + + // Regular (non-overlay) resources may be released asynchronously. + Async(func() { + i.InodeOperations.Release(ctx) + i.MountSource.DecRef() + }) +} + +// Mappable calls i.InodeOperations.Mappable. +func (i *Inode) Mappable() memmap.Mappable { + if i.overlay != nil { + // In an overlay, Mappable is always implemented by + // the overlayEntry metadata to synchronize memory + // access of files with copy up. But first check if + // the Inodes involved would be mappable in the first + // place. + i.overlay.copyMu.RLock() + ok := i.overlay.isMappableLocked() + i.overlay.copyMu.RUnlock() + if !ok { + return nil + } + return i.overlay + } + return i.InodeOperations.Mappable(i) +} + +// WriteOut calls i.InodeOperations.WriteOut with i as the Inode. +func (i *Inode) WriteOut(ctx context.Context) error { + if i.overlay != nil { + return overlayWriteOut(ctx, i.overlay) + } + return i.InodeOperations.WriteOut(ctx, i) +} + +// Lookup calls i.InodeOperations.Lookup with i as the directory. +func (i *Inode) Lookup(ctx context.Context, name string) (*Dirent, error) { + if i.overlay != nil { + return overlayLookup(ctx, i.overlay, i, name) + } + return i.InodeOperations.Lookup(ctx, i, name) +} + +// Create calls i.InodeOperations.Create with i as the directory. +func (i *Inode) Create(ctx context.Context, d *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + if i.overlay != nil { + return overlayCreate(ctx, i.overlay, d, name, flags, perm) + } + return i.InodeOperations.Create(ctx, i, name, flags, perm) +} + +// CreateDirectory calls i.InodeOperations.CreateDirectory with i as the directory. +func (i *Inode) CreateDirectory(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateDirectory(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateDirectory(ctx, i, name, perm) +} + +// CreateLink calls i.InodeOperations.CreateLink with i as the directory. +func (i *Inode) CreateLink(ctx context.Context, d *Dirent, oldname string, newname string) error { + if i.overlay != nil { + return overlayCreateLink(ctx, i.overlay, d, oldname, newname) + } + return i.InodeOperations.CreateLink(ctx, i, oldname, newname) +} + +// CreateHardLink calls i.InodeOperations.CreateHardLink with i as the directory. +func (i *Inode) CreateHardLink(ctx context.Context, d *Dirent, target *Dirent, name string) error { + if i.overlay != nil { + return overlayCreateHardLink(ctx, i.overlay, d, target, name) + } + return i.InodeOperations.CreateHardLink(ctx, i, target.Inode, name) +} + +// CreateFifo calls i.InodeOperations.CreateFifo with i as the directory. +func (i *Inode) CreateFifo(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateFifo(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateFifo(ctx, i, name, perm) +} + +// Remove calls i.InodeOperations.Remove/RemoveDirectory with i as the directory. +func (i *Inode) Remove(ctx context.Context, d *Dirent, remove *Dirent) error { + if i.overlay != nil { + return overlayRemove(ctx, i.overlay, d, remove) + } + switch remove.Inode.StableAttr.Type { + case Directory, SpecialDirectory: + return i.InodeOperations.RemoveDirectory(ctx, i, remove.name) + default: + return i.InodeOperations.Remove(ctx, i, remove.name) + } +} + +// Rename calls i.InodeOperations.Rename with the given arguments. +func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { + if i.overlay != nil { + return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName) + } + return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName) +} + +// Bind calls i.InodeOperations.Bind with i as the directory. +func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) error { + if i.overlay != nil { + return overlayBind(ctx, i.overlay, name, data, perm) + } + return i.InodeOperations.Bind(ctx, i, name, data, perm) +} + +// BoundEndpoint calls i.InodeOperations.BoundEndpoint with i as the Inode. +func (i *Inode) BoundEndpoint(path string) unix.BoundEndpoint { + if i.overlay != nil { + return overlayBoundEndpoint(i.overlay, path) + } + return i.InodeOperations.BoundEndpoint(i, path) +} + +// GetFile calls i.InodeOperations.GetFile with the given arguments. +func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) { + if i.overlay != nil { + return overlayGetFile(ctx, i.overlay, d, flags) + } + return i.InodeOperations.GetFile(ctx, d, flags) +} + +// UnstableAttr calls i.InodeOperations.UnstableAttr with i as the Inode. +func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) { + if i.overlay != nil { + return overlayUnstableAttr(ctx, i.overlay) + } + return i.InodeOperations.UnstableAttr(ctx, i) +} + +// Getxattr calls i.InodeOperations.Getxattr with i as the Inode. +func (i *Inode) Getxattr(name string) ([]byte, error) { + if i.overlay != nil { + return overlayGetxattr(i.overlay, name) + } + return i.InodeOperations.Getxattr(i, name) +} + +// Listxattr calls i.InodeOperations.Listxattr with i as the Inode. +func (i *Inode) Listxattr() (map[string]struct{}, error) { + if i.overlay != nil { + return overlayListxattr(i.overlay) + } + return i.InodeOperations.Listxattr(i) +} + +// CheckPermission will check if the caller may access this file in the +// requested way for reading, writing, or executing. +// +// CheckPermission is like Linux's fs/namei.c:inode_permission. It +// - checks file system mount flags, +// - and utilizes InodeOperations.Check to check capabilities and modes. +func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { + // First check the outer-most mounted filesystem. + if p.Write && i.MountSource.Flags.ReadOnly { + return syserror.EROFS + } + + if i.overlay != nil { + // CheckPermission requires some special handling for + // an overlay. + // + // Writes will always be redirected to an upper filesystem, + // so ignore all lower layers being read-only. + // + // But still honor the upper-most filesystem's mount flags; + // we should not attempt to modify the writable layer if it + // is mounted read-only. + if p.Write && overlayUpperMountSource(i.MountSource).Flags.ReadOnly { + return syserror.EROFS + } + } + + return i.check(ctx, p) +} + +func (i *Inode) check(ctx context.Context, p PermMask) error { + if i.overlay != nil { + return overlayCheck(ctx, i.overlay, p) + } + if !i.InodeOperations.Check(ctx, i, p) { + return syserror.EACCES + } + return nil +} + +// SetPermissions calls i.InodeOperations.SetPermissions with i as the Inode. +func (i *Inode) SetPermissions(ctx context.Context, d *Dirent, f FilePermissions) bool { + if i.overlay != nil { + return overlaySetPermissions(ctx, i.overlay, d, f) + } + return i.InodeOperations.SetPermissions(ctx, i, f) +} + +// SetOwner calls i.InodeOperations.SetOwner with i as the Inode. +func (i *Inode) SetOwner(ctx context.Context, d *Dirent, o FileOwner) error { + if i.overlay != nil { + return overlaySetOwner(ctx, i.overlay, d, o) + } + return i.InodeOperations.SetOwner(ctx, i, o) +} + +// SetTimestamps calls i.InodeOperations.SetTimestamps with i as the Inode. +func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error { + if i.overlay != nil { + return overlaySetTimestamps(ctx, i.overlay, d, ts) + } + return i.InodeOperations.SetTimestamps(ctx, i, ts) +} + +// Truncate calls i.InodeOperations.Truncate with i as the Inode. +func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { + if i.overlay != nil { + return overlayTruncate(ctx, i.overlay, d, size) + } + return i.InodeOperations.Truncate(ctx, i, size) +} + +// Readlink calls i.InodeOperations.Readlnk with i as the Inode. +func (i *Inode) Readlink(ctx context.Context) (string, error) { + if i.overlay != nil { + return overlayReadlink(ctx, i.overlay) + } + return i.InodeOperations.Readlink(ctx, i) +} + +// Getlink calls i.InodeOperations.Getlink. +func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { + if i.overlay != nil { + return overlayGetlink(ctx, i.overlay) + } + return i.InodeOperations.Getlink(ctx, i) +} + +// AddLink calls i.InodeOperations.AddLink. +func (i *Inode) AddLink() { + if i.overlay != nil { + // FIXME: Remove this from InodeOperations altogether. + // + // This interface (including DropLink and NotifyStatusChange) + // is only used by ramfs to update metadata of children. These + // filesystems should _never_ have overlay Inodes cached as + // children. So explicitly disallow this scenario and avoid plumbing + // Dirents through to do copy up. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.AddLink() +} + +// DropLink calls i.InodeOperations.DropLink. +func (i *Inode) DropLink() { + if i.overlay != nil { + // Same as AddLink. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.DropLink() +} + +// NotifyStatusChange calls i.InodeOperations.NotifyStatusChange. +func (i *Inode) NotifyStatusChange(ctx context.Context) { + if i.overlay != nil { + // Same as AddLink. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.NotifyStatusChange(ctx) +} + +// IsVirtual calls i.InodeOperations.IsVirtual. +func (i *Inode) IsVirtual() bool { + if i.overlay != nil { + // An overlay configuration does not support virtual files. + return false + } + return i.InodeOperations.IsVirtual() +} + +// StatFS calls i.InodeOperations.StatFS. +func (i *Inode) StatFS(ctx context.Context) (Info, error) { + if i.overlay != nil { + return overlayStatFS(ctx, i.overlay) + } + return i.InodeOperations.StatFS(ctx) +} + +// HandleOps extracts HandleOperations from i. +func (i *Inode) HandleOps() HandleOperations { + if i.overlay != nil { + return overlayHandleOps(i.overlay) + } + if h, ok := i.InodeOperations.(HandleOperations); ok { + return h + } + return nil +} + +// CheckOwnership checks whether `ctx` owns this Inode or may act as its owner. +// Compare Linux's fs/inode.c:inode_owner_or_capable(). +func (i *Inode) CheckOwnership(ctx context.Context) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if uattr.Owner.UID == creds.EffectiveKUID { + return true + } + if creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return true + } + return false +} + +// CheckCapability checks whether `ctx` has capability `cp` with respect to +// operations on this Inode. +// +// Compare Linux's kernel/capability.c:capable_wrt_inode_uidgid(). Note that +// this function didn't exist in Linux 3.11.10, but was added by upstream +// 23adbe12ef7d "fs,userns: Change inode_capable to capable_wrt_inode_uidgid" +// to fix local privilege escalation CVE-2014-4014. +func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if !creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return false + } + if !creds.UserNamespace.MapFromKGID(uattr.Owner.GID).Ok() { + return false + } + return creds.HasCapability(cp) +} |