diff options
Diffstat (limited to 'pkg/sentry')
36 files changed, 457 insertions, 208 deletions
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD index 8cf5b35d3..1e11b0428 100644 --- a/pkg/sentry/fsimpl/overlay/BUILD +++ b/pkg/sentry/fsimpl/overlay/BUILD @@ -21,14 +21,16 @@ go_library( "directory.go", "filesystem.go", "fstree.go", - "non_directory.go", "overlay.go", + "regular_file.go", ], visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", "//pkg/context", "//pkg/fspath", + "//pkg/log", + "//pkg/sentry/arch", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", "//pkg/sentry/memmap", @@ -37,5 +39,6 @@ go_library( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go index 73b126669..4506642ca 100644 --- a/pkg/sentry/fsimpl/overlay/copy_up.go +++ b/pkg/sentry/fsimpl/overlay/copy_up.go @@ -75,8 +75,21 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { return syserror.ENOENT } - // Perform copy-up. + // Obtain settable timestamps from the lower layer. vfsObj := d.fs.vfsfs.VirtualFilesystem() + oldpop := vfs.PathOperation{ + Root: d.lowerVDs[0], + Start: d.lowerVDs[0], + } + const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME + oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{ + Mask: timestampsMask, + }) + if err != nil { + return err + } + + // Perform copy-up. newpop := vfs.PathOperation{ Root: d.parent.upperVD, Start: d.parent.upperVD, @@ -101,10 +114,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } switch ftype { case linux.S_IFREG: - oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }, &vfs.OpenOptions{ + oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{ Flags: linux.O_RDONLY, }) if err != nil { @@ -160,9 +170,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if err := newFD.SetStat(ctx, vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -179,9 +191,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -195,10 +209,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { d.upperVD = upperVD case linux.S_IFLNK: - target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }) + target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop) if err != nil { return err } @@ -207,10 +218,12 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID, - Mode: uint16(d.mode), - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + Mode: uint16(d.mode), + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() @@ -224,25 +237,20 @@ func (d *dentry) copyUpLocked(ctx context.Context) error { d.upperVD = upperVD case linux.S_IFBLK, linux.S_IFCHR: - lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{ - Root: d.lowerVDs[0], - Start: d.lowerVDs[0], - }, &vfs.StatOptions{}) - if err != nil { - return err - } if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{ Mode: linux.FileMode(d.mode), - DevMajor: lowerStat.RdevMajor, - DevMinor: lowerStat.RdevMinor, + DevMajor: oldStat.RdevMajor, + DevMinor: oldStat.RdevMinor, }); err != nil { return err } if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{ Stat: linux.Statx{ - Mask: linux.STATX_UID | linux.STATX_GID, - UID: d.uid, - GID: d.gid, + Mask: linux.STATX_UID | linux.STATX_GID | oldStat.Mask×tampsMask, + UID: d.uid, + GID: d.gid, + Atime: oldStat.Atime, + Mtime: oldStat.Mtime, }, }); err != nil { cleanupUndoCopyUp() diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go index bd11372d5..78a01bbb7 100644 --- a/pkg/sentry/fsimpl/overlay/filesystem.go +++ b/pkg/sentry/fsimpl/overlay/filesystem.go @@ -765,7 +765,7 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if mustCreate { return nil, syserror.EEXIST } - if mayWrite { + if start.isRegularFile() && mayWrite { if err := start.copyUpLocked(ctx); err != nil { return nil, err } @@ -819,7 +819,7 @@ afterTrailingSymlink: if rp.MustBeDir() && !child.isDir() { return nil, syserror.ENOTDIR } - if mayWrite { + if child.isRegularFile() && mayWrite { if err := child.copyUpLocked(ctx); err != nil { return nil, err } @@ -872,8 +872,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts * if err != nil { return nil, err } + if ftype != linux.S_IFREG { + return layerFD, nil + } layerFlags := layerFD.StatusFlags() - fd := &nonDirectoryFD{ + fd := ®ularFileFD{ copiedUp: isUpper, cachedFD: layerFD, cachedFlags: layerFlags, @@ -969,7 +972,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving } // Finally construct the overlay FD. upperFlags := upperFD.StatusFlags() - fd := &nonDirectoryFD{ + fd := ®ularFileFD{ copiedUp: true, cachedFD: upperFD, cachedFlags: upperFlags, @@ -1293,6 +1296,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error if !child.isDir() { return syserror.ENOTDIR } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } child.dirMu.Lock() defer child.dirMu.Unlock() whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx) @@ -1528,12 +1534,38 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error return err } + parentMode := atomic.LoadUint32(&parent.mode) child := parent.children[name] var childLayer lookupLayer + if child == nil { + if parentMode&linux.S_ISVTX != 0 { + // If the parent's sticky bit is set, we need a child dentry to get + // its owner. + child, err = fs.getChildLocked(ctx, parent, name, &ds) + if err != nil { + return err + } + } else { + // Determine if the file being unlinked actually exists. Holding + // parent.dirMu prevents a dentry from being instantiated for the file, + // which in turn prevents it from being copied-up, so this result is + // stable. + childLayer, err = fs.lookupLayerLocked(ctx, parent, name) + if err != nil { + return err + } + if !childLayer.existsInOverlay() { + return syserror.ENOENT + } + } + } if child != nil { if child.isDir() { return syserror.EISDIR } + if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil { + return err + } if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } @@ -1546,18 +1578,6 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error } else { childLayer = lookupLayerLower } - } else { - // Determine if the file being unlinked actually exists. Holding - // parent.dirMu prevents a dentry from being instantiated for the file, - // which in turn prevents it from being copied-up, so this result is - // stable. - childLayer, err = fs.lookupLayerLocked(ctx, parent, name) - if err != nil { - return err - } - if !childLayer.existsInOverlay() { - return syserror.ENOENT - } } pop := vfs.PathOperation{ diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go index e5f506d2e..4c5de8d32 100644 --- a/pkg/sentry/fsimpl/overlay/overlay.go +++ b/pkg/sentry/fsimpl/overlay/overlay.go @@ -18,7 +18,7 @@ // // Lock order: // -// directoryFD.mu / nonDirectoryFD.mu +// directoryFD.mu / regularFileFD.mu // filesystem.renameMu // dentry.dirMu // dentry.copyMu @@ -453,7 +453,7 @@ type dentry struct { // - If this dentry is copied-up, then wrappedMappable is the Mappable // obtained from a call to the current top layer's // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil - // (from a call to nonDirectoryFD.ensureMappable()), it cannot become nil. + // (from a call to regularFileFD.ensureMappable()), it cannot become nil. // wrappedMappable is protected by mapsMu and dataMu. // // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/regular_file.go index 853aee951..2b89a7a6d 100644 --- a/pkg/sentry/fsimpl/overlay/non_directory.go +++ b/pkg/sentry/fsimpl/overlay/regular_file.go @@ -19,14 +19,21 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" ) +func (d *dentry) isRegularFile() bool { + return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG +} + func (d *dentry) isSymlink() bool { return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK } @@ -40,7 +47,7 @@ func (d *dentry) readlink(ctx context.Context) (string, error) { } // +stateify savable -type nonDirectoryFD struct { +type regularFileFD struct { fileDescription // If copiedUp is false, cachedFD represents @@ -52,9 +59,13 @@ type nonDirectoryFD struct { copiedUp bool cachedFD *vfs.FileDescription cachedFlags uint32 + + // If copiedUp is false, lowerWaiters contains all waiter.Entries + // registered with cachedFD. lowerWaiters is protected by mu. + lowerWaiters map[*waiter.Entry]waiter.EventMask } -func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { +func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) { fd.mu.Lock() defer fd.mu.Unlock() wrappedFD, err := fd.currentFDLocked(ctx) @@ -65,7 +76,7 @@ func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescriptio return wrappedFD, nil } -func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { +func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) { d := fd.dentry() statusFlags := fd.vfsfd.StatusFlags() if !fd.copiedUp && d.isCopiedUp() { @@ -87,10 +98,21 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip return nil, err } } + if len(fd.lowerWaiters) != 0 { + ready := upperFD.Readiness(^waiter.EventMask(0)) + for e, mask := range fd.lowerWaiters { + fd.cachedFD.EventUnregister(e) + upperFD.EventRegister(e, mask) + if ready&mask != 0 { + e.Callback.Callback(e) + } + } + } fd.cachedFD.DecRef(ctx) fd.copiedUp = true fd.cachedFD = upperFD fd.cachedFlags = statusFlags + fd.lowerWaiters = nil } else if fd.cachedFlags != statusFlags { if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil { return nil, err @@ -101,13 +123,13 @@ func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescrip } // Release implements vfs.FileDescriptionImpl.Release. -func (fd *nonDirectoryFD) Release(ctx context.Context) { +func (fd *regularFileFD) Release(ctx context.Context) { fd.cachedFD.DecRef(ctx) fd.cachedFD = nil } // OnClose implements vfs.FileDescriptionImpl.OnClose. -func (fd *nonDirectoryFD) OnClose(ctx context.Context) error { +func (fd *regularFileFD) OnClose(ctx context.Context) error { // Linux doesn't define ovl_file_operations.flush at all (i.e. its // equivalent to OnClose is a no-op). We pass through to // fd.cachedFD.OnClose() without upgrading if fd.dentry() has been @@ -128,7 +150,7 @@ func (fd *nonDirectoryFD) OnClose(ctx context.Context) error { } // Stat implements vfs.FileDescriptionImpl.Stat. -func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { +func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 { wrappedFD, err := fd.getCurrentFD(ctx) @@ -149,7 +171,7 @@ func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux } // Allocate implements vfs.FileDescriptionImpl.Allocate. -func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { +func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return err @@ -159,7 +181,7 @@ func (fd *nonDirectoryFD) Allocate(ctx context.Context, mode, offset, length uin } // SetStat implements vfs.FileDescriptionImpl.SetStat. -func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { +func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { d := fd.dentry() mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { @@ -191,12 +213,61 @@ func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) } // StatFS implements vfs.FileDescriptionImpl.StatFS. -func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) { +func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) { return fd.filesystem().statFS(ctx) } +// Readiness implements waiter.Waitable.Readiness. +func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask { + ctx := context.Background() + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since Readiness can't return + // an error. This is obviously wrong, but at least consistent with + // VFS1. + log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err) + fd.mu.Lock() + wrappedFD = fd.cachedFD + wrappedFD.IncRef() + fd.mu.Unlock() + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + fd.mu.Lock() + defer fd.mu.Unlock() + wrappedFD, err := fd.currentFDLocked(context.Background()) + if err != nil { + // TODO(b/171089913): Just use fd.cachedFD since EventRegister can't + // return an error. This is obviously wrong, but at least consistent + // with VFS1. + log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err) + wrappedFD = fd.cachedFD + } + wrappedFD.EventRegister(e, mask) + if !fd.copiedUp { + if fd.lowerWaiters == nil { + fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask) + } + fd.lowerWaiters[e] = mask + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (fd *regularFileFD) EventUnregister(e *waiter.Entry) { + fd.mu.Lock() + defer fd.mu.Unlock() + fd.cachedFD.EventUnregister(e) + if !fd.copiedUp { + delete(fd.lowerWaiters, e) + } +} + // PRead implements vfs.FileDescriptionImpl.PRead. -func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return 0, err @@ -206,7 +277,7 @@ func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, off } // Read implements vfs.FileDescriptionImpl.Read. -func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Hold fd.mu during the read to serialize the file offset. fd.mu.Lock() defer fd.mu.Unlock() @@ -218,7 +289,7 @@ func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts } // PWrite implements vfs.FileDescriptionImpl.PWrite. -func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { wrappedFD, err := fd.getCurrentFD(ctx) if err != nil { return 0, err @@ -228,7 +299,7 @@ func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, of } // Write implements vfs.FileDescriptionImpl.Write. -func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Hold fd.mu during the write to serialize the file offset. fd.mu.Lock() defer fd.mu.Unlock() @@ -240,7 +311,7 @@ func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opt } // Seek implements vfs.FileDescriptionImpl.Seek. -func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Hold fd.mu during the seek to serialize the file offset. fd.mu.Lock() defer fd.mu.Unlock() @@ -252,7 +323,7 @@ func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) } // Sync implements vfs.FileDescriptionImpl.Sync. -func (fd *nonDirectoryFD) Sync(ctx context.Context) error { +func (fd *regularFileFD) Sync(ctx context.Context) error { fd.mu.Lock() if !fd.dentry().isCopiedUp() { fd.mu.Unlock() @@ -269,8 +340,18 @@ func (fd *nonDirectoryFD) Sync(ctx context.Context) error { return wrappedFD.Sync(ctx) } +// Ioctl implements vfs.FileDescriptionImpl.Ioctl. +func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + wrappedFD, err := fd.getCurrentFD(ctx) + if err != nil { + return 0, err + } + defer wrappedFD.DecRef(ctx) + return wrappedFD.Ioctl(ctx, uio, args) +} + // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. -func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { +func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { if err := fd.ensureMappable(ctx, opts); err != nil { return err } @@ -278,7 +359,7 @@ func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOp } // ensureMappable ensures that fd.dentry().wrappedMappable is not nil. -func (fd *nonDirectoryFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { +func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Fast path if we already have a Mappable for the current top layer. diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go index 1813269e0..738c0c9cc 100644 --- a/pkg/sentry/fsimpl/testutil/kernel.go +++ b/pkg/sentry/fsimpl/testutil/kernel.go @@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns FSContext: kernel.NewFSContextVFS2(root, cwd, 0022), FDTable: k.NewFDTable(), } - return k.TaskSet().NewTask(config) + t, err := k.TaskSet().NewTask(ctx, config) + if err != nil { + config.ThreadGroup.Release(ctx) + return nil, err + } + return t, nil } func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) { diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go index 3b3c8725f..03da505e1 100644 --- a/pkg/sentry/fsimpl/verity/filesystem.go +++ b/pkg/sentry/fsimpl/verity/filesystem.go @@ -377,12 +377,12 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s // enabled, we should verify the child hash here because it may // be cached before enabled. if fs.allowRuntimeEnable { - if isEnabled(parent) { + if parent.verityEnabled() { if _, err := fs.verifyChild(ctx, parent, child); err != nil { return nil, err } } - if isEnabled(child) { + if child.verityEnabled() { vfsObj := fs.vfsfs.VirtualFilesystem() mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID) stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{ @@ -553,13 +553,13 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, // Verify child hash. This should always be performed unless in // allowRuntimeEnable mode and the parent directory hasn't been enabled // yet. - if isEnabled(parent) { + if parent.verityEnabled() { if _, err := fs.verifyChild(ctx, parent, child); err != nil { child.destroyLocked(ctx) return nil, err } } - if isEnabled(child) { + if child.verityEnabled() { if err := fs.verifyStat(ctx, child, stat); err != nil { child.destroyLocked(ctx) return nil, err @@ -915,7 +915,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf if err != nil { return linux.Statx{}, err } - if isEnabled(d) { + if d.verityEnabled() { if err := fs.verifyStat(ctx, d, stat); err != nil { return linux.Statx{}, err } diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go index 70034280b..8dc9e26bc 100644 --- a/pkg/sentry/fsimpl/verity/verity.go +++ b/pkg/sentry/fsimpl/verity/verity.go @@ -148,14 +148,6 @@ func (FilesystemType) Name() string { return Name } -// isEnabled checks whether the target is enabled with verity features. It -// should always be true if runtime enable is not allowed. In runtime enable -// mode, it returns true if the target has been enabled with -// ioctl(FS_IOC_ENABLE_VERITY). -func isEnabled(d *dentry) bool { - return !d.fs.allowRuntimeEnable || len(d.hash) != 0 -} - // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} @@ -448,6 +440,14 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } +// verityEnabled checks whether the file is enabled with verity features. It +// should always be true if runtime enable is not allowed. In runtime enable +// mode, it returns true if the target has been enabled with +// ioctl(FS_IOC_ENABLE_VERITY). +func (d *dentry) verityEnabled() bool { + return !d.fs.allowRuntimeEnable || len(d.hash) != 0 +} + func (d *dentry) readlink(ctx context.Context) (string, error) { return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{ Root: d.lowerVD, @@ -510,7 +510,7 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu if err != nil { return linux.Statx{}, err } - if isEnabled(fd.d) { + if fd.d.verityEnabled() { if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil { return linux.Statx{}, err } @@ -726,7 +726,7 @@ func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch. func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // No need to verify if the file is not enabled yet in // allowRuntimeEnable mode. - if !isEnabled(fd.d) { + if !fd.d.verityEnabled() { return fd.lowerFD.PRead(ctx, dst, offset, opts) } diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 5de70aecb..c0de72eef 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -97,6 +97,17 @@ go_template_instance( ) go_template_instance( + name = "ipc_namespace_refs", + out = "ipc_namespace_refs.go", + package = "kernel", + prefix = "IPCNamespace", + template = "//pkg/refs_vfs2:refs_template", + types = { + "T": "IPCNamespace", + }, +) + +go_template_instance( name = "process_group_refs", out = "process_group_refs.go", package = "kernel", @@ -137,6 +148,7 @@ go_library( "fs_context.go", "fs_context_refs.go", "ipc_namespace.go", + "ipc_namespace_refs.go", "kcov.go", "kcov_unsafe.go", "kernel.go", @@ -206,6 +218,7 @@ go_library( "//pkg/amutex", "//pkg/bits", "//pkg/bpf", + "//pkg/cleanup", "//pkg/context", "//pkg/coverage", "//pkg/cpuid", diff --git a/pkg/sentry/kernel/context.go b/pkg/sentry/kernel/context.go index dd5f0f5fa..bb94769c4 100644 --- a/pkg/sentry/kernel/context.go +++ b/pkg/sentry/kernel/context.go @@ -81,7 +81,8 @@ func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace { } // IPCNamespaceFromContext returns the IPC namespace in which ctx is executing, -// or nil if there is no such IPC namespace. +// or nil if there is no such IPC namespace. It takes a reference on the +// namespace. func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace { if v := ctx.Value(CtxIPCNamespace); v != nil { return v.(*IPCNamespace) diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go index 80a070d7e..3f34ee0db 100644 --- a/pkg/sentry/kernel/ipc_namespace.go +++ b/pkg/sentry/kernel/ipc_namespace.go @@ -15,6 +15,7 @@ package kernel import ( + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/semaphore" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" @@ -24,6 +25,8 @@ import ( // // +stateify savable type IPCNamespace struct { + IPCNamespaceRefs + // User namespace which owns this IPC namespace. Immutable. userNS *auth.UserNamespace @@ -33,11 +36,13 @@ type IPCNamespace struct { // NewIPCNamespace creates a new IPC namespace. func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace { - return &IPCNamespace{ + ns := &IPCNamespace{ userNS: userNS, semaphores: semaphore.NewRegistry(userNS), shms: shm.NewRegistry(userNS), } + ns.EnableLeakCheck() + return ns } // SemaphoreRegistry returns the semaphore set registry for this namespace. @@ -50,6 +55,13 @@ func (i *IPCNamespace) ShmRegistry() *shm.Registry { return i.shms } +// DecRef implements refs_vfs2.RefCounter.DecRef. +func (i *IPCNamespace) DecRef(ctx context.Context) { + i.IPCNamespaceRefs.DecRef(func() { + i.shms.Release(ctx) + }) +} + // IPCNamespace returns the task's IPC namespace. func (t *Task) IPCNamespace() *IPCNamespace { t.mu.Lock() diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 675506269..0eb2bf7bd 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -39,6 +39,7 @@ import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" @@ -340,7 +341,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { return fmt.Errorf("Timekeeper is nil") } if args.Timekeeper.clocks == nil { - return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()") + return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("RootUserNamespace is nil") @@ -365,7 +366,7 @@ func (k *Kernel) Init(args InitKernelArgs) error { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { - return fmt.Errorf("Failed to get maximum CPU number: %v", err) + return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { @@ -828,7 +829,9 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: - return ctx.args.IPCNamespace + ipcns := ctx.args.IPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: @@ -964,6 +967,10 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) + cu := cleanup.Make(func() { + tg.Release(ctx) + }) + defer cu.Clean() // Check which file to start from. switch { @@ -1023,13 +1030,14 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, MountNamespaceVFS2: mntnsVFS2, ContainerID: args.ContainerID, } - t, err := k.tasks.NewTask(config) + t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(tc) // Simulate exec for tracing. // Success. + cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg @@ -1374,8 +1382,9 @@ func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } -// RootIPCNamespace returns the root IPCNamespace. +// RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { + k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } @@ -1636,7 +1645,9 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: - return ctx.k.rootIPCNamespace + ipcns := ctx.k.rootIPCNamespace + ipcns.IncRef() + return ipcns case auth.CtxCredentials: // The supervisor context is global root. return auth.NewRootCredentials(ctx.k.rootUserNamespace) diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go index f61039f5b..1a152142b 100644 --- a/pkg/sentry/kernel/pipe/vfs.go +++ b/pkg/sentry/kernel/pipe/vfs.go @@ -237,8 +237,7 @@ func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.Syscal // PipeSize implements fcntl(F_GETPIPE_SZ). func (fd *VFSPipeFD) PipeSize() int64 { - // Inline Pipe.FifoSize() rather than calling it with nil Context and - // fs.File and ignoring the returned error (which is always nil). + // Inline Pipe.FifoSize() since we don't have a fs.File. fd.pipe.mu.Lock() defer fd.pipe.mu.Unlock() return fd.pipe.max diff --git a/pkg/sentry/kernel/shm/BUILD b/pkg/sentry/kernel/shm/BUILD index b7e4b480d..f8a382fd8 100644 --- a/pkg/sentry/kernel/shm/BUILD +++ b/pkg/sentry/kernel/shm/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/context", "//pkg/log", "//pkg/refs", + "//pkg/refs_vfs2", "//pkg/sentry/device", "//pkg/sentry/fs", "//pkg/sentry/kernel/auth", diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go index 00c03585e..ebbebf46b 100644 --- a/pkg/sentry/kernel/shm/shm.go +++ b/pkg/sentry/kernel/shm/shm.go @@ -321,9 +321,32 @@ func (r *Registry) remove(s *Shm) { r.totalPages -= s.effectiveSize / usermem.PageSize } +// Release drops the self-reference of each active shm segment in the registry. +// It is called when the kernel.IPCNamespace containing r is being destroyed. +func (r *Registry) Release(ctx context.Context) { + // Because Shm.DecRef() may acquire the same locks, collect the segments to + // release first. Note that this should not race with any updates to r, since + // the IPC namespace containing it has no more references. + toRelease := make([]*Shm, 0) + r.mu.Lock() + for _, s := range r.keysToShms { + s.mu.Lock() + if !s.pendingDestruction { + toRelease = append(toRelease, s) + } + s.mu.Unlock() + } + r.mu.Unlock() + + for _, s := range toRelease { + r.dissociateKey(s) + s.DecRef(ctx) + } +} + // Shm represents a single shared memory segment. // -// Shm segment are backed directly by an allocation from platform memory. +// Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented; which requires special care when unmapping a @@ -652,17 +675,20 @@ func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() - defer s.mu.Unlock() - if !s.pendingDestruction { - s.pendingDestruction = true - // Drop the self-reference so destruction occurs when all - // external references are gone. - // - // N.B. This cannot be the final DecRef, as the caller also - // holds a reference. - s.DecRef(ctx) + if s.pendingDestruction { + s.mu.Unlock() return } + s.pendingDestruction = true + s.mu.Unlock() + + // Drop the self-reference so destruction occurs when all + // external references are gone. + // + // N.B. This cannot be the final DecRef, as the caller also + // holds a reference. + s.DecRef(ctx) + return } // checkOwnership verifies whether a segment may be accessed by ctx as an diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e90a19cfb..037971393 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -656,7 +656,9 @@ func (t *Task) Value(key interface{}) interface{} { case CtxUTSNamespace: return t.utsns case CtxIPCNamespace: - return t.ipcns + ipcns := t.IPCNamespace() + ipcns.IncRef() + return ipcns case CtxTask: return t case auth.CtxCredentials: diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index fce1064a7..682080c14 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" + "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" @@ -203,7 +204,13 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) + } else { + ipcns.IncRef() } + cu := cleanup.Make(func() { + ipcns.DecRef(t) + }) + defer cu.Clean() netns := t.NetworkNamespace() if opts.NewNetworkNamespace { @@ -214,12 +221,18 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { mntnsVFS2.IncRef() + cu.Add(func() { + mntnsVFS2.DecRef(t) + }) } tc, err := t.tc.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } + cu.Add(func() { + tc.release() + }) // clone() returns 0 in the child. tc.Arch.SetReturn(0) if opts.Stack != 0 { @@ -295,11 +308,11 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } else { cfg.InheritParent = t } - nt, err := t.tg.pidns.owner.NewTask(cfg) + nt, err := t.tg.pidns.owner.NewTask(t, cfg) + // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does + // the cleanup for us. + cu.Release() if err != nil { - if opts.NewThreadGroup { - tg.release(t) - } return 0, nil, err } @@ -509,6 +522,7 @@ func (t *Task) Unshare(opts *SharingOptions) error { } // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" + t.ipcns.DecRef(t) t.ipcns = NewIPCNamespace(creds.UserNamespace) } var oldFDTable *FDTable diff --git a/pkg/sentry/kernel/task_exit.go b/pkg/sentry/kernel/task_exit.go index b400a8b41..ce7b9641d 100644 --- a/pkg/sentry/kernel/task_exit.go +++ b/pkg/sentry/kernel/task_exit.go @@ -280,12 +280,13 @@ func (*runExitMain) execute(t *Task) taskRunState { t.mountNamespaceVFS2.DecRef(t) t.mountNamespaceVFS2 = nil } + t.ipcns.DecRef(t) t.mu.Unlock() // If this is the last task to exit from the thread group, release the // thread group's resources. if lastExiter { - t.tg.release(t) + t.tg.Release(t) } // Detach tracees. diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go index 64c1e120a..8e28230cc 100644 --- a/pkg/sentry/kernel/task_start.go +++ b/pkg/sentry/kernel/task_start.go @@ -16,6 +16,7 @@ package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -98,14 +99,18 @@ type TaskConfig struct { // NewTask creates a new task defined by cfg. // // NewTask does not start the returned task; the caller must call Task.Start. -func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) { +// +// If successful, NewTask transfers references held by cfg to the new task. +// Otherwise, NewTask releases them. +func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) { t, err := ts.newTask(cfg) if err != nil { cfg.TaskContext.release() - cfg.FSContext.DecRef(t) - cfg.FDTable.DecRef(t) + cfg.FSContext.DecRef(ctx) + cfg.FDTable.DecRef(ctx) + cfg.IPCNamespace.DecRef(ctx) if cfg.MountNamespaceVFS2 != nil { - cfg.MountNamespaceVFS2.DecRef(t) + cfg.MountNamespaceVFS2.DecRef(ctx) } return nil, err } diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 0b34c0099..a183b28c1 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -18,6 +18,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" @@ -307,8 +308,8 @@ func (tg *ThreadGroup) Limits() *limits.LimitSet { return tg.limits } -// release releases the thread group's resources. -func (tg *ThreadGroup) release(t *Task) { +// Release releases the thread group's resources. +func (tg *ThreadGroup) Release(ctx context.Context) { // Timers must be destroyed without holding the TaskSet or signal mutexes // since timers send signals with Timer.mu locked. tg.itimerRealTimer.Destroy() @@ -325,7 +326,7 @@ func (tg *ThreadGroup) release(t *Task) { it.DestroyTimer() } if tg.mounts != nil { - tg.mounts.DecRef(t) + tg.mounts.DecRef(ctx) } } diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 7a3311a70..5b09b9feb 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -83,6 +83,7 @@ go_library( ], visibility = ["//pkg/sentry:internal"], deps = [ + "//pkg/abi/linux", "//pkg/context", "//pkg/log", "//pkg/memutil", diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go index 626d1eaa4..7c297fb9e 100644 --- a/pkg/sentry/pgalloc/pgalloc.go +++ b/pkg/sentry/pgalloc/pgalloc.go @@ -29,6 +29,7 @@ import ( "syscall" "time" + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/safemem" @@ -224,6 +225,18 @@ type usageInfo struct { refs uint64 } +// canCommit returns true if the tracked region can be committed. +func (u *usageInfo) canCommit() bool { + // refs must be greater than 0 because we assume that reclaimable pages + // (that aren't already known to be committed) are not committed. This + // isn't necessarily true, even after the reclaimer does Decommit(), + // because the kernel may subsequently back the hugepage-sized region + // containing the decommitted page with a hugepage. However, it's + // consistent with our treatment of unallocated pages, which have the same + // property. + return !u.knownCommitted && u.refs != 0 +} + // An EvictableMemoryUser represents a user of MemoryFile-allocated memory that // may be asked to deallocate that memory in the presence of memory pressure. type EvictableMemoryUser interface { @@ -828,6 +841,11 @@ func (f *MemoryFile) UpdateUsage() error { log.Debugf("UpdateUsage: skipped with usageSwapped!=0.") return nil } + // Linux updates usage values at CONFIG_HZ. + if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC { + log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter) + return nil + } f.usageLast = time.Now() err = f.updateUsageLocked(currentUsage, mincore) @@ -841,7 +859,7 @@ func (f *MemoryFile) UpdateUsage() error { // pages by invoking checkCommitted, which is a function that, for each page i // in bs, sets committed[i] to 1 if the page is committed and 0 otherwise. // -// Precondition: f.mu must be held. +// Precondition: f.mu must be held; it may be unlocked and reacquired. func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error { // Track if anything changed to elide the merge. In the common case, we // expect all segments to be committed and no merge to occur. @@ -868,7 +886,7 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } else if f.usageSwapped != 0 { // We have more usage accounted for than the file itself. // That's fine, we probably caught a race where pages were - // being committed while the above loop was running. Just + // being committed while the below loop was running. Just // report the higher number that we found and ignore swap. usage.MemoryAccounting.Dec(f.usageSwapped, usage.System) f.usageSwapped = 0 @@ -880,21 +898,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( // Iterate over all usage data. There will only be usage segments // present when there is an associated reference. - for seg := f.usage.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { - val := seg.Value() - - // Already known to be committed; ignore. - if val.knownCommitted { - continue - } - - // Assume that reclaimable pages (that aren't already known to be - // committed) are not committed. This isn't necessarily true, even - // after the reclaimer does Decommit(), because the kernel may - // subsequently back the hugepage-sized region containing the - // decommitted page with a hugepage. However, it's consistent with our - // treatment of unallocated pages, which have the same property. - if val.refs == 0 { + for seg := f.usage.FirstSegment(); seg.Ok(); { + if !seg.ValuePtr().canCommit() { + seg = seg.NextSegment() continue } @@ -917,56 +923,53 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( } // Query for new pages in core. - if err := checkCommitted(s, buf); err != nil { + // NOTE(b/165896008): mincore (which is passed as checkCommitted) + // by f.UpdateUsage() might take a really long time. So unlock f.mu + // while checkCommitted runs. + f.mu.Unlock() + err := checkCommitted(s, buf) + f.mu.Lock() + if err != nil { checkErr = err return } // Scan each page and switch out segments. - populatedRun := false - populatedRunStart := 0 - for i := 0; i <= bufLen; i++ { - // We run past the end of the slice here to - // simplify the logic and only set populated if - // we're still looking at elements. - populated := false - if i < bufLen { - populated = buf[i]&0x1 != 0 - } - - switch { - case populated == populatedRun: - // Keep the run going. - continue - case populated && !populatedRun: - // Begin the run. - populatedRun = true - populatedRunStart = i - // Keep going. + seg := f.usage.LowerBoundSegment(r.Start) + for i := 0; i < bufLen; { + if buf[i]&0x1 == 0 { + i++ continue - case !populated && populatedRun: - // Finish the run by changing this segment. - runRange := memmap.FileRange{ - Start: r.Start + uint64(populatedRunStart*usermem.PageSize), - End: r.Start + uint64(i*usermem.PageSize), + } + // Scan to the end of this committed range. + j := i + 1 + for ; j < bufLen; j++ { + if buf[j]&0x1 == 0 { + break } - seg = f.usage.Isolate(seg, runRange) - seg.ValuePtr().knownCommitted = true - // Advance the segment only if we still - // have work to do in the context of - // the original segment from the for - // loop. Otherwise, the for loop itself - // will advance the segment - // appropriately. - if runRange.End != r.End { - seg = seg.NextSegment() + } + committedFR := memmap.FileRange{ + Start: r.Start + uint64(i*usermem.PageSize), + End: r.Start + uint64(j*usermem.PageSize), + } + // Advance seg to committedFR.Start. + for seg.Ok() && seg.End() < committedFR.Start { + seg = seg.NextSegment() + } + // Mark pages overlapping committedFR as committed. + for seg.Ok() && seg.Start() < committedFR.End { + if seg.ValuePtr().canCommit() { + seg = f.usage.Isolate(seg, committedFR) + seg.ValuePtr().knownCommitted = true + amount := seg.Range().Length() + usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind) + f.usageExpected += amount + changedAny = true } - amount := runRange.Length() - usage.MemoryAccounting.Inc(amount, val.kind) - f.usageExpected += amount - changedAny = true - populatedRun = false + seg = seg.NextSegment() } + // Continue scanning for committed pages. + i = j + 1 } // Advance r.Start. @@ -978,6 +981,9 @@ func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func( if err != nil { return err } + + // Continue with the first segment after r.End. + seg = f.usage.LowerBoundSegment(r.End) } return nil diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go index ed5ae03d3..58f3d6fdd 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.go +++ b/pkg/sentry/platform/kvm/bluepill_arm64.go @@ -39,6 +39,16 @@ var ( } ) +// getTLS returns the value of TPIDR_EL0 register. +// +//go:nosplit +func getTLS() (value uint64) + +// setTLS writes the TPIDR_EL0 value. +// +//go:nosplit +func setTLS(value uint64) + // bluepillArchEnter is called during bluepillEnter. // //go:nosplit @@ -51,6 +61,8 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) { regs.Pstate = context.Pstate regs.Pstate &^= uint64(ring0.PsrFlagsClear) regs.Pstate |= ring0.KernelFlagsSet + regs.TPIDR_EL0 = getTLS() + return } @@ -65,6 +77,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) { context.Pstate = regs.Pstate context.Pstate &^= uint64(ring0.PsrFlagsClear) context.Pstate |= ring0.UserFlagsSet + setTLS(regs.TPIDR_EL0) lazyVfp := c.GetLazyVFP() if lazyVfp != 0 { diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.s b/pkg/sentry/platform/kvm/bluepill_arm64.s index 04efa0147..09c7e88e5 100644 --- a/pkg/sentry/platform/kvm/bluepill_arm64.s +++ b/pkg/sentry/platform/kvm/bluepill_arm64.s @@ -32,6 +32,18 @@ #define CONTEXT_PC 0x1B8 #define CONTEXT_R0 0xB8 +// getTLS returns the value of TPIDR_EL0 register. +TEXT ·getTLS(SB),NOSPLIT,$0-8 + MRS TPIDR_EL0, R1 + MOVD R1, ret+0(FP) + RET + +// setTLS writes the TPIDR_EL0 value. +TEXT ·setTLS(SB),NOSPLIT,$0-8 + MOVD addr+0(FP), R1 + MSR R1, TPIDR_EL0 + RET + // See bluepill.go. TEXT ·bluepill(SB),NOSPLIT,$0 begin: diff --git a/pkg/sentry/platform/ring0/lib_arm64.go b/pkg/sentry/platform/ring0/lib_arm64.go index 2f1abcb0f..d91a09de1 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.go +++ b/pkg/sentry/platform/ring0/lib_arm64.go @@ -53,12 +53,6 @@ func LoadFloatingPoint(*byte) // SaveFloatingPoint saves floating point state. func SaveFloatingPoint(*byte) -// GetTLS returns the value of TPIDR_EL0 register. -func GetTLS() (value uint64) - -// SetTLS writes the TPIDR_EL0 value. -func SetTLS(value uint64) - // Init sets function pointers based on architectural features. // // This must be called prior to using ring0. diff --git a/pkg/sentry/platform/ring0/lib_arm64.s b/pkg/sentry/platform/ring0/lib_arm64.s index 8aabf7d0e..da9d3cf55 100644 --- a/pkg/sentry/platform/ring0/lib_arm64.s +++ b/pkg/sentry/platform/ring0/lib_arm64.s @@ -29,16 +29,6 @@ TEXT ·FlushTlbAll(SB),NOSPLIT,$0 ISB $15 RET -TEXT ·GetTLS(SB),NOSPLIT,$0-8 - MRS TPIDR_EL0, R1 - MOVD R1, ret+0(FP) - RET - -TEXT ·SetTLS(SB),NOSPLIT,$0-8 - MOVD addr+0(FP), R1 - MSR R1, TPIDR_EL0 - RET - TEXT ·CPACREL1(SB),NOSPLIT,$0-8 WORD $0xd5381041 // MRS CPACR_EL1, R1 MOVD R1, ret+0(FP) diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go index 5ddcd4be5..3baad098b 100644 --- a/pkg/sentry/socket/netlink/socket.go +++ b/pkg/sentry/socket/netlink/socket.go @@ -16,6 +16,7 @@ package netlink import ( + "io" "math" "gvisor.dev/gvisor/pkg/abi/linux" @@ -748,6 +749,12 @@ func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, t buf := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, buf) + // io.EOF can be only returned if src is a file, this means that + // sendMsg is called from splice and the error has to be ignored in + // this case. + if err == io.EOF { + err = nil + } if err != nil { // Don't partially consume messages. return 0, syserr.FromError(err) diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go index 87e30d742..211f07947 100644 --- a/pkg/sentry/socket/netstack/netstack.go +++ b/pkg/sentry/socket/netstack/netstack.go @@ -587,6 +587,11 @@ func (i *ioSequencePayload) Payload(size int) ([]byte, *tcpip.Error) { } v := buffer.NewView(size) if _, err := i.src.CopyIn(i.ctx, v); err != nil { + // EOF can be returned only if src is a file and this means it + // is in a splice syscall and the error has to be ignored. + if err == io.EOF { + return v, nil + } return nil, tcpip.ErrBadAddress } return v, nil diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go index f80011ce4..a4a76d0a3 100644 --- a/pkg/sentry/socket/unix/unix.go +++ b/pkg/sentry/socket/unix/unix.go @@ -573,13 +573,17 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go index 3345124cc..678355fb9 100644 --- a/pkg/sentry/socket/unix/unix_vfs2.go +++ b/pkg/sentry/socket/unix/unix_vfs2.go @@ -267,13 +267,17 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs. if dst.NumBytes() == 0 { return 0, nil } - return dst.CopyOutFrom(ctx, &EndpointReader{ + r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, - }) + } + n, err := dst.CopyOutFrom(ctx, r) + // Drop control messages. + r.Control.Release(ctx) + return n, err } // PWrite implements vfs.FileDescriptionImpl. diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go index 9feaca0da..9cd052c3d 100644 --- a/pkg/sentry/syscalls/linux/sys_socket.go +++ b/pkg/sentry/syscalls/linux/sys_socket.go @@ -1052,7 +1052,9 @@ func sendSingleMsg(t *kernel.Task, s socket.Socket, file *fs.File, msgPtr userme // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = handleIOError(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) - if err != nil { + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err diff --git a/pkg/sentry/syscalls/linux/sys_sysinfo.go b/pkg/sentry/syscalls/linux/sys_sysinfo.go index 6320593f0..db3d924d9 100644 --- a/pkg/sentry/syscalls/linux/sys_sysinfo.go +++ b/pkg/sentry/syscalls/linux/sys_sysinfo.go @@ -21,7 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/usage" ) -// Sysinfo implements the sysinfo syscall as described in man 2 sysinfo. +// Sysinfo implements Linux syscall sysinfo(2). func Sysinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go index d8b8d9783..36e89700e 100644 --- a/pkg/sentry/syscalls/linux/vfs2/fd.go +++ b/pkg/sentry/syscalls/linux/vfs2/fd.go @@ -145,16 +145,6 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) - case linux.F_SETPIPE_SZ: - pipefile, ok := file.Impl().(*pipe.VFSPipeFD) - if !ok { - return 0, nil, syserror.EBADF - } - n, err := pipefile.SetPipeSize(int64(args[2].Int())) - if err != nil { - return 0, nil, err - } - return uintptr(n), nil, nil case linux.F_GETOWN: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { @@ -190,6 +180,16 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall return 0, nil, err } return 0, nil, setAsyncOwner(t, file, owner.Type, owner.PID) + case linux.F_SETPIPE_SZ: + pipefile, ok := file.Impl().(*pipe.VFSPipeFD) + if !ok { + return 0, nil, syserror.EBADF + } + n, err := pipefile.SetPipeSize(int64(args[2].Int())) + if err != nil { + return 0, nil, err + } + return uintptr(n), nil, nil case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { diff --git a/pkg/sentry/syscalls/linux/vfs2/socket.go b/pkg/sentry/syscalls/linux/vfs2/socket.go index bfae6b7e9..7b33b3f59 100644 --- a/pkg/sentry/syscalls/linux/vfs2/socket.go +++ b/pkg/sentry/syscalls/linux/vfs2/socket.go @@ -1055,7 +1055,9 @@ func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescriptio // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) - if err != nil { + // Control messages should be released on error as well as for zero-length + // messages, which are discarded by the receiver. + if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go index bf5c1171f..035e2a6b0 100644 --- a/pkg/sentry/syscalls/linux/vfs2/splice.go +++ b/pkg/sentry/syscalls/linux/vfs2/splice.go @@ -45,6 +45,9 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { @@ -192,6 +195,9 @@ func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } + if count < 0 { + return 0, nil, syserror.EINVAL + } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { diff --git a/pkg/sentry/usage/memory.go b/pkg/sentry/usage/memory.go index ab1d140d2..5ed6726ab 100644 --- a/pkg/sentry/usage/memory.go +++ b/pkg/sentry/usage/memory.go @@ -278,7 +278,7 @@ func TotalMemory(memSize, used uint64) uint64 { } if memSize < used { memSize = used - // Bump totalSize to the next largest power of 2, if one exists, so + // Bump memSize to the next largest power of 2, if one exists, so // that MemFree isn't 0. if msb := bits.MostSignificantOne64(memSize); msb < 63 { memSize = uint64(1) << (uint(msb) + 1) |