summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/control/pprof.go6
-rw-r--r--pkg/sentry/devices/tundev/tundev.go4
-rw-r--r--pkg/sentry/fs/proc/sys_net.go95
-rw-r--r--pkg/sentry/fsimpl/gofer/BUILD1
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go12
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go44
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go471
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer_test.go6
-rw-r--r--pkg/sentry/fsimpl/gofer/handle.go19
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go118
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go11
-rw-r--r--pkg/sentry/fsimpl/gofer/time.go3
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go49
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go4
-rw-r--r--pkg/sentry/inet/inet.go17
-rw-r--r--pkg/sentry/inet/test_stack.go12
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.go4
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go2
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go6
-rw-r--r--pkg/sentry/platform/ring0/aarch64.go35
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s8
-rw-r--r--pkg/sentry/platform/ring0/kernel_arm64.go2
-rw-r--r--pkg/sentry/socket/hostinet/stack.go11
-rw-r--r--pkg/sentry/socket/netstack/stack.go14
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/memfd.go1
-rw-r--r--pkg/sentry/vfs/file_description.go5
26 files changed, 668 insertions, 292 deletions
diff --git a/pkg/sentry/control/pprof.go b/pkg/sentry/control/pprof.go
index 663e51989..2bf3c45e1 100644
--- a/pkg/sentry/control/pprof.go
+++ b/pkg/sentry/control/pprof.go
@@ -49,6 +49,9 @@ type ProfileOpts struct {
// - dump out the stack trace of current go routines.
// sentryctl -pid <pid> pprof-goroutine
type Profile struct {
+ // Kernel is the kernel under profile. It's immutable.
+ Kernel *kernel.Kernel
+
// mu protects the fields below.
mu sync.Mutex
@@ -57,9 +60,6 @@ type Profile struct {
// traceFile is the current execution trace output file.
traceFile *fd.FD
-
- // Kernel is the kernel under profile.
- Kernel *kernel.Kernel
}
// StartCPUProfile is an RPC stub which starts recording the CPU profile in a
diff --git a/pkg/sentry/devices/tundev/tundev.go b/pkg/sentry/devices/tundev/tundev.go
index 852ec3c5c..a40625e19 100644
--- a/pkg/sentry/devices/tundev/tundev.go
+++ b/pkg/sentry/devices/tundev/tundev.go
@@ -160,8 +160,8 @@ func (fd *tunFD) EventUnregister(e *waiter.Entry) {
fd.device.EventUnregister(e)
}
-// isNetTunSupported returns whether /dev/net/tun device is supported for s.
-func isNetTunSupported(s inet.Stack) bool {
+// IsNetTunSupported returns whether /dev/net/tun device is supported for s.
+func IsNetTunSupported(s inet.Stack) bool {
_, ok := s.(*netstack.Stack)
return ok
}
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index 702fdd392..8615b60f0 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -272,6 +272,96 @@ func (f *tcpSackFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSeque
return n, f.tcpSack.stack.SetTCPSACKEnabled(*f.tcpSack.enabled)
}
+// +stateify savable
+type tcpRecovery struct {
+ fsutil.SimpleFileInode
+
+ stack inet.Stack `state:"wait"`
+ recovery inet.TCPLossRecovery
+}
+
+func newTCPRecoveryInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
+ ts := &tcpRecovery{
+ SimpleFileInode: *fsutil.NewSimpleFileInode(ctx, fs.RootOwner, fs.FilePermsFromMode(0644), linux.PROC_SUPER_MAGIC),
+ stack: s,
+ }
+ sattr := fs.StableAttr{
+ DeviceID: device.ProcDevice.DeviceID(),
+ InodeID: device.ProcDevice.NextIno(),
+ BlockSize: usermem.PageSize,
+ Type: fs.SpecialFile,
+ }
+ return fs.NewInode(ctx, ts, msrc, sattr)
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*tcpRecovery) Truncate(context.Context, *fs.Inode, int64) error {
+ return nil
+}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (r *tcpRecovery) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+ flags.Pread = true
+ flags.Pwrite = true
+ return fs.NewFile(ctx, dirent, flags, &tcpRecoveryFile{
+ tcpRecovery: r,
+ stack: r.stack,
+ }), nil
+}
+
+// +stateify savable
+type tcpRecoveryFile struct {
+ fsutil.FileGenericSeek `state:"nosave"`
+ fsutil.FileNoIoctl `state:"nosave"`
+ fsutil.FileNoMMap `state:"nosave"`
+ fsutil.FileNoSplice `state:"nosave"`
+ fsutil.FileNoopRelease `state:"nosave"`
+ fsutil.FileNoopFlush `state:"nosave"`
+ fsutil.FileNoopFsync `state:"nosave"`
+ fsutil.FileNotDirReaddir `state:"nosave"`
+ fsutil.FileUseInodeUnstableAttr `state:"nosave"`
+ waiter.AlwaysReady `state:"nosave"`
+
+ tcpRecovery *tcpRecovery
+
+ stack inet.Stack `state:"wait"`
+}
+
+// Read implements fs.FileOperations.Read.
+func (f *tcpRecoveryFile) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ return 0, io.EOF
+ }
+
+ recovery, err := f.stack.TCPRecovery()
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = recovery
+ s := fmt.Sprintf("%d\n", f.tcpRecovery.recovery)
+ n, err := dst.CopyOut(ctx, []byte(s))
+ return int64(n), err
+}
+
+// Write implements fs.FileOperations.Write.
+func (f *tcpRecoveryFile) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ f.tcpRecovery.recovery = inet.TCPLossRecovery(v)
+ if err := f.tcpRecovery.stack.SetTCPRecovery(f.tcpRecovery.recovery); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
+
func (p *proc) newSysNetCore(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode {
// The following files are simple stubs until they are implemented in
// netstack, most of these files are configuration related. We use the
@@ -351,6 +441,11 @@ func (p *proc) newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s ine
contents["tcp_wmem"] = newTCPMemInode(ctx, msrc, s, tcpWMem)
}
+ // Add tcp_recovery.
+ if _, err := s.TCPRecovery(); err == nil {
+ contents["tcp_recovery"] = newTCPRecoveryInode(ctx, msrc, s)
+ }
+
d := ramfs.NewDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555))
return newProcInode(ctx, d, msrc, fs.SpecialDirectory, nil)
}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 4a800dcf9..16787116f 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -85,5 +85,6 @@ go_test(
deps = [
"//pkg/p9",
"//pkg/sentry/contexttest",
+ "//pkg/sentry/pgalloc",
],
)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 1679066ba..2a8011eb4 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -90,10 +90,8 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
uid: uint32(opts.kuid),
gid: uint32(opts.kgid),
blockSize: usermem.PageSize, // arbitrary
- handle: handle{
- fd: -1,
- },
- nlink: uint32(2),
+ hostFD: -1,
+ nlink: uint32(2),
}
switch opts.mode.FileType() {
case linux.S_IFDIR:
@@ -205,14 +203,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
off := uint64(0)
const count = 64 * 1024 // for consistency with the vfs1 client
d.handleMu.RLock()
- if !d.handleReadable {
+ if d.readFile.isNil() {
// This should not be possible because a readable handle should
// have been opened when the calling directoryFD was opened.
d.handleMu.RUnlock()
panic("gofer.dentry.getDirents called without a readable handle")
}
for {
- p9ds, err := d.handle.file.readdir(ctx, off, count)
+ p9ds, err := d.readFile.readdir(ctx, off, count)
if err != nil {
d.handleMu.RUnlock()
return nil, err
@@ -304,5 +302,5 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *directoryFD) Sync(ctx context.Context) error {
- return fd.dentry().handle.sync(ctx)
+ return fd.dentry().syncRemoteFile(ctx)
}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index e6af37d0d..eaef2594d 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -15,6 +15,7 @@
package gofer
import (
+ "math"
"sync"
"sync/atomic"
@@ -54,7 +55,7 @@ func (fs *filesystem) Sync(ctx context.Context) error {
// Sync regular files.
for _, d := range ds {
- err := d.syncSharedHandle(ctx)
+ err := d.syncCachedFile(ctx)
d.DecRef(ctx)
if err != nil && retErr == nil {
retErr = err
@@ -724,8 +725,29 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
- // 9P2000.L supports hard links, but we don't.
- return syserror.EPERM
+ d := vd.Dentry().Impl().(*dentry)
+ if d.isDir() {
+ return syserror.EPERM
+ }
+ gid := auth.KGID(atomic.LoadUint32(&d.gid))
+ uid := auth.KUID(atomic.LoadUint32(&d.uid))
+ mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+ if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
+ return err
+ }
+ if d.nlink == 0 {
+ return syserror.ENOENT
+ }
+ if d.nlink == math.MaxUint32 {
+ return syserror.EMLINK
+ }
+ if err := parent.file.link(ctx, d.file, childName); err != nil {
+ return err
+ }
+
+ // Success!
+ atomic.AddUint32(&d.nlink, 1)
+ return nil
}, nil)
}
@@ -1085,12 +1107,18 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
child.handleMu.Lock()
- child.handle.file = openFile
- if fdobj != nil {
- child.handle.fd = int32(fdobj.Release())
+ if vfs.MayReadFileWithOpenFlags(opts.Flags) {
+ child.readFile = openFile
+ if fdobj != nil {
+ child.hostFD = int32(fdobj.Release())
+ }
+ } else if fdobj != nil {
+ // Can't use fdobj if it's not readable.
+ fdobj.Close()
+ }
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
+ child.writeFile = openFile
}
- child.handleReadable = vfs.MayReadFileWithOpenFlags(opts.Flags)
- child.handleWritable = vfs.MayWriteFileWithOpenFlags(opts.Flags)
child.handleMu.Unlock()
}
// Insert the dentry into the tree.
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 2e5575d8d..63e589859 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -192,10 +192,14 @@ const (
//
// - File timestamps are based on client clocks. This ensures that users of
// the client observe timestamps that are coherent with their own clocks
- // and consistent with Linux's semantics. However, since it is not always
- // possible for clients to set arbitrary atimes and mtimes, and never
- // possible for clients to set arbitrary ctimes, file timestamp changes are
- // stored in the client only and never sent to the remote filesystem.
+ // and consistent with Linux's semantics (in particular, it is not always
+ // possible for clients to set arbitrary atimes and mtimes depending on the
+ // remote filesystem implementation, and never possible for clients to set
+ // arbitrary ctimes.) If a dentry containing a client-defined atime or
+ // mtime is evicted from cache, client timestamps will be sent to the
+ // remote filesystem on a best-effort basis to attempt to ensure that
+ // timestamps will be preserved when another dentry representing the same
+ // file is instantiated.
InteropModeExclusive InteropMode = iota
// InteropModeWritethrough is appropriate when there are read-only users of
@@ -502,9 +506,9 @@ func (fs *filesystem) Release(ctx context.Context) {
for d := range fs.syncableDentries {
d.handleMu.Lock()
d.dataMu.Lock()
- if d.handleWritable {
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty cached data to the remote file.
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
}
// TODO(jamieliu): Do we need to flushf/fsync d?
@@ -514,9 +518,9 @@ func (fs *filesystem) Release(ctx context.Context) {
d.dirty.RemoveAll()
d.dataMu.Unlock()
// Close the host fd if one exists.
- if d.handle.fd >= 0 {
- syscall.Close(int(d.handle.fd))
- d.handle.fd = -1
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
}
@@ -558,8 +562,6 @@ type dentry struct {
// filesystem.renameMu.
name string
- // We don't support hard links, so each dentry maps 1:1 to an inode.
-
// file is the unopened p9.File that backs this dentry. file is immutable.
//
// If file.isNil(), this dentry represents a synthetic file, i.e. a file
@@ -620,9 +622,20 @@ type dentry struct {
mtime int64
ctime int64
btime int64
- // File size, protected by both metadataMu and dataMu (i.e. both must be
+ // File size, which differs from other metadata in two ways:
+ //
+ // - We make a best-effort attempt to keep it up to date even if
+ // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+ //
+ // - size is protected by both metadataMu and dataMu (i.e. both must be
// locked to mutate it; locking either is sufficient to access it).
size uint64
+ // If this dentry does not represent a synthetic file, deleted is 0, and
+ // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+ // remote file's timestamps, which should be updated when this dentry is
+ // evicted.
+ atimeDirty uint32
+ mtimeDirty uint32
// nlink counts the number of hard links to this dentry. It's updated and
// accessed using atomic operations. It's not protected by metadataMu like the
@@ -635,30 +648,28 @@ type dentry struct {
// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
mappings memmap.MappingSet
- // If this dentry represents a regular file or directory:
- //
- // - handle is the I/O handle used by all regularFileFDs/directoryFDs
- // representing this dentry.
- //
- // - handleReadable is true if handle is readable.
+ // - If this dentry represents a regular file or directory, readFile is the
+ // p9.File used for reads by all regularFileFDs/directoryFDs representing
+ // this dentry.
//
- // - handleWritable is true if handle is writable.
+ // - If this dentry represents a regular file, writeFile is the p9.File
+ // used for writes by all regularFileFDs representing this dentry.
//
- // Invariants:
- //
- // - If handleReadable == handleWritable == false, then handle.file == nil
- // (i.e. there is no open handle). Conversely, if handleReadable ||
- // handleWritable == true, then handle.file != nil (i.e. there is an open
- // handle).
- //
- // - handleReadable and handleWritable cannot transition from true to false
- // (i.e. handles may not be downgraded).
+ // - If this dentry represents a regular file, hostFD is the host FD used
+ // for memory mappings and I/O (when applicable) in preference to readFile
+ // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
+ // also be writable. If hostFD is -1, no such host FD is available.
//
// These fields are protected by handleMu.
- handleMu sync.RWMutex
- handle handle
- handleReadable bool
- handleWritable bool
+ //
+ // readFile and writeFile may or may not represent the same p9.File. Once
+ // either p9.File transitions from closed (isNil() == true) to open
+ // (isNil() == false), it may be mutated with handleMu locked, but cannot
+ // be closed until the dentry is destroyed.
+ handleMu sync.RWMutex
+ readFile p9file
+ writeFile p9file
+ hostFD int32
dataMu sync.RWMutex
@@ -672,7 +683,7 @@ type dentry struct {
// tracks dirty segments in cache. dirty is protected by dataMu.
dirty fsutil.DirtySet
- // pf implements platform.File for mappings of handle.fd.
+ // pf implements platform.File for mappings of hostFD.
pf dentryPlatformFile
// If this dentry represents a symbolic link, InteropModeShared is not in
@@ -734,9 +745,7 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
uid: uint32(fs.opts.dfltuid),
gid: uint32(fs.opts.dfltgid),
blockSize: usermem.PageSize,
- handle: handle{
- fd: -1,
- },
+ hostFD: -1,
}
d.pf.dentry = d
if mask.UID {
@@ -803,10 +812,12 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
if attr.BlockSize != 0 {
atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
}
- if mask.ATime {
+ // Don't override newer client-defined timestamps with old server-defined
+ // ones.
+ if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
}
- if mask.MTime {
+ if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
}
if mask.CTime {
@@ -825,9 +836,13 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
// Preconditions: !d.isSynthetic()
func (d *dentry) updateFromGetattr(ctx context.Context) error {
- // Use d.handle.file, which represents a 9P fid that has been opened, in
- // preference to d.file, which represents a 9P fid that has not. This may
- // be significantly more efficient in some implementations.
+ // Use d.readFile or d.writeFile, which represent 9P fids that have been
+ // opened, in preference to d.file, which represents a 9P fid that has not.
+ // This may be significantly more efficient in some implementations. Prefer
+ // d.writeFile over d.readFile since some filesystem implementations may
+ // update a writable handle's metadata after writes to that handle, without
+ // making metadata updates immediately visible to read-only handles
+ // representing the same file.
var (
file p9file
handleMuRLocked bool
@@ -837,8 +852,11 @@ func (d *dentry) updateFromGetattr(ctx context.Context) error {
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
d.handleMu.RLock()
- if !d.handle.file.isNil() {
- file = d.handle.file
+ if !d.writeFile.isNil() {
+ file = d.writeFile
+ handleMuRLocked = true
+ } else if !d.readFile.isNil() {
+ file = d.readFile
handleMuRLocked = true
} else {
file = d.file
@@ -903,51 +921,44 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return err
}
defer mnt.EndWrite()
- setLocalAtime := false
- setLocalMtime := false
+
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // Reject attempts to truncate files other than regular files, since
+ // filesystem implementations may return the wrong errno.
+ switch mode.FileType() {
+ case linux.S_IFREG:
+ // ok
+ case linux.S_IFDIR:
+ return syserror.EISDIR
+ default:
+ return syserror.EINVAL
+ }
+ }
+
+ var now int64
if d.cachedMetadataAuthoritative() {
- // Timestamp updates will be handled locally.
- setLocalAtime = stat.Mask&linux.STATX_ATIME != 0
- setLocalMtime = stat.Mask&linux.STATX_MTIME != 0
- stat.Mask &^= linux.STATX_ATIME | linux.STATX_MTIME
-
- // Prepare for truncate.
- if stat.Mask&linux.STATX_SIZE != 0 {
- switch mode.FileType() {
- case linux.ModeRegular:
- if !setLocalMtime {
- // Truncate updates mtime.
- setLocalMtime = true
- stat.Mtime.Nsec = linux.UTIME_NOW
- }
- case linux.ModeDirectory:
- return syserror.EISDIR
- default:
- return syserror.EINVAL
+ // Truncate updates mtime.
+ if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
+ stat.Mask |= linux.STATX_MTIME
+ stat.Mtime = linux.StatxTimestamp{
+ Nsec: linux.UTIME_NOW,
}
}
+
+ // Use client clocks for timestamps.
+ now = d.fs.clock.Now().Nanoseconds()
+ if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
+ stat.Atime = statxTimestampFromDentry(now)
+ }
+ if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
+ stat.Mtime = statxTimestampFromDentry(now)
+ }
}
+
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
- if stat.Mask&linux.STATX_SIZE != 0 {
- // The size needs to be changed even when
- // !d.cachedMetadataAuthoritative() because d.mappings has to be
- // updated.
- d.updateFileSizeLocked(stat.Size)
- }
if !d.isSynthetic() {
if stat.Mask != 0 {
- if stat.Mask&linux.STATX_SIZE != 0 {
- // Check whether to allow a truncate request to be made.
- switch d.mode & linux.S_IFMT {
- case linux.S_IFREG:
- // Allow.
- case linux.S_IFDIR:
- return syserror.EISDIR
- default:
- return syserror.EINVAL
- }
- }
if err := d.file.setAttr(ctx, p9.SetAttrMask{
Permissions: stat.Mask&linux.STATX_MODE != 0,
UID: stat.Mask&linux.STATX_UID != 0,
@@ -969,6 +980,12 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
}); err != nil {
return err
}
+ if stat.Mask&linux.STATX_SIZE != 0 {
+ // d.size should be kept up to date, and privatized
+ // copy-on-write mappings of truncated pages need to be
+ // invalidated, even if InteropModeShared is in effect.
+ d.updateFileSizeLocked(stat.Size)
+ }
}
if d.fs.opts.interop == InteropModeShared {
// There's no point to updating d's metadata in this case since
@@ -978,7 +995,6 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
return nil
}
}
- now := d.fs.clock.Now().Nanoseconds()
if stat.Mask&linux.STATX_MODE != 0 {
atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
}
@@ -988,23 +1004,18 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
if stat.Mask&linux.STATX_GID != 0 {
atomic.StoreUint32(&d.gid, stat.GID)
}
- if setLocalAtime {
- if stat.Atime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.atime, now)
- } else {
- atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
- }
- // Restore mask bits that we cleared earlier.
- stat.Mask |= linux.STATX_ATIME
+ // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
+ // if d.cachedMetadataAuthoritative() then we converted stat.Atime and
+ // stat.Mtime to client-local timestamps above, and if
+ // !d.cachedMetadataAuthoritative() then we returned after calling
+ // d.file.setAttr(). For the same reason, now must have been initialized.
+ if stat.Mask&linux.STATX_ATIME != 0 {
+ atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+ atomic.StoreUint32(&d.atimeDirty, 0)
}
- if setLocalMtime {
- if stat.Mtime.Nsec == linux.UTIME_NOW {
- atomic.StoreInt64(&d.mtime, now)
- } else {
- atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
- }
- // Restore mask bits that we cleared earlier.
- stat.Mask |= linux.STATX_MTIME
+ if stat.Mask&linux.STATX_MTIME != 0 {
+ atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+ atomic.StoreUint32(&d.mtimeDirty, 0)
}
atomic.StoreInt64(&d.ctime, now)
return nil
@@ -1147,7 +1158,8 @@ func (d *dentry) OnZeroWatches(ctx context.Context) {
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
-// Preconditions: d.fs.renameMu must be locked for writing.
+// Preconditions: d.fs.renameMu must be locked for writing; it may be
+// temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context) {
// Dentries with a non-zero reference count must be retained. (The only way
// to obtain a reference on a dentry with zero references is via path
@@ -1227,11 +1239,13 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
}
}
-// destroyLocked destroys the dentry. It may flushes dirty pages from cache,
-// close p9 file and remove reference on parent dentry.
+// destroyLocked destroys the dentry.
//
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0. d is
-// not a child dentry.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * d.refs == 0.
+// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
+// from its former parent dentry.
func (d *dentry) destroyLocked(ctx context.Context) {
switch atomic.LoadInt64(&d.refs) {
case 0:
@@ -1243,26 +1257,64 @@ func (d *dentry) destroyLocked(ctx context.Context) {
panic("dentry.destroyLocked() called with references on the dentry")
}
+ // Allow the following to proceed without renameMu locked to improve
+ // scalability.
+ d.fs.renameMu.Unlock()
+
+ mf := d.fs.mfp.MemoryFile()
d.handleMu.Lock()
- if !d.handle.file.isNil() {
- mf := d.fs.mfp.MemoryFile()
- d.dataMu.Lock()
+ d.dataMu.Lock()
+ if h := d.writeHandleLocked(); h.isOpen() {
// Write dirty pages back to the remote filesystem.
- if d.handleWritable {
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
- log.Warningf("gofer.dentry.DecRef: failed to write dirty data back: %v", err)
- }
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
}
- // Discard cached data.
+ }
+ // Discard cached data.
+ if !d.cache.IsEmpty() {
+ mf.MarkAllUnevictable(d)
d.cache.DropAll(mf)
d.dirty.RemoveAll()
- d.dataMu.Unlock()
- // Clunk open fids and close open host FDs.
- d.handle.close(ctx)
+ }
+ d.dataMu.Unlock()
+ // Clunk open fids and close open host FDs.
+ if !d.readFile.isNil() {
+ d.readFile.close(ctx)
+ }
+ if !d.writeFile.isNil() && d.readFile != d.writeFile {
+ d.writeFile.close(ctx)
+ }
+ d.readFile = p9file{}
+ d.writeFile = p9file{}
+ if d.hostFD >= 0 {
+ syscall.Close(int(d.hostFD))
+ d.hostFD = -1
}
d.handleMu.Unlock()
if !d.file.isNil() {
+ if !d.isDeleted() {
+ // Write dirty timestamps back to the remote filesystem.
+ atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
+ mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
+ if atimeDirty || mtimeDirty {
+ atime := atomic.LoadInt64(&d.atime)
+ mtime := atomic.LoadInt64(&d.mtime)
+ if err := d.file.setAttr(ctx, p9.SetAttrMask{
+ ATime: atimeDirty,
+ ATimeNotSystemTime: atimeDirty,
+ MTime: mtimeDirty,
+ MTimeNotSystemTime: mtimeDirty,
+ }, p9.SetAttr{
+ ATimeSeconds: uint64(atime / 1e9),
+ ATimeNanoSeconds: uint64(atime % 1e9),
+ MTimeSeconds: uint64(mtime / 1e9),
+ MTimeNanoSeconds: uint64(mtime % 1e9),
+ }); err != nil {
+ log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
+ }
+ }
+ }
d.file.close(ctx)
d.file = p9file{}
// Remove d from the set of syncable dentries.
@@ -1270,6 +1322,9 @@ func (d *dentry) destroyLocked(ctx context.Context) {
delete(d.fs.syncableDentries, d)
d.fs.syncMu.Unlock()
}
+
+ d.fs.renameMu.Lock()
+
// Drop the reference held by d on its parent without recursively locking
// d.fs.renameMu.
if d.parent != nil {
@@ -1369,80 +1424,120 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
// O_TRUNC).
if !trunc {
d.handleMu.RLock()
- if (!read || d.handleReadable) && (!write || d.handleWritable) {
- // The current handle is sufficient.
+ if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+ // Current handles are sufficient.
d.handleMu.RUnlock()
return nil
}
d.handleMu.RUnlock()
}
- haveOldFD := false
+ fdToClose := int32(-1)
+ invalidateTranslations := false
d.handleMu.Lock()
- if (read && !d.handleReadable) || (write && !d.handleWritable) || trunc {
- // Get a new handle.
- wantReadable := d.handleReadable || read
- wantWritable := d.handleWritable || write
- h, err := openHandle(ctx, d.file, wantReadable, wantWritable, trunc)
+ if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+ // Get a new handle. If this file has been opened for both reading and
+ // writing, try to get a single handle that is usable for both:
+ //
+ // - Writable memory mappings of a host FD require that the host FD is
+ // opened for both reading and writing.
+ //
+ // - NOTE(b/141991141): Some filesystems may not ensure coherence
+ // between multiple handles for the same file.
+ openReadable := !d.readFile.isNil() || read
+ openWritable := !d.writeFile.isNil() || write
+ h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ if err == syserror.EACCES && (openReadable != read || openWritable != write) {
+ // It may not be possible to use a single handle for both
+ // reading and writing, since permissions on the file may have
+ // changed to e.g. disallow reading after previously being
+ // opened for reading. In this case, we have no choice but to
+ // use separate handles for reading and writing.
+ ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
+ openReadable = read
+ openWritable = write
+ h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+ }
if err != nil {
d.handleMu.Unlock()
return err
}
- if !d.handle.file.isNil() {
- // Check that old and new handles are compatible: If the old handle
- // includes a host file descriptor but the new one does not, or
- // vice versa, old and new memory mappings may be incoherent.
- haveOldFD = d.handle.fd >= 0
- haveNewFD := h.fd >= 0
- if haveOldFD != haveNewFD {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: can't change host FD availability from %v to %v across dentry handle upgrade", haveOldFD, haveNewFD)
- h.close(ctx)
- return syserror.EIO
- }
- if haveOldFD {
- // We may have raced with callers of d.pf.FD() that are now
- // using the old file descriptor, preventing us from safely
- // closing it. We could handle this by invalidating existing
- // memmap.Translations, but this is expensive. Instead, use
- // dup3 to make the old file descriptor refer to the new file
- // description, then close the new file descriptor (which is no
- // longer needed). Racing callers may use the old or new file
- // description, but this doesn't matter since they refer to the
- // same file (unless d.fs.opts.overlayfsStaleRead is true,
- // which we handle separately).
- if err := syscall.Dup3(int(h.fd), int(d.handle.fd), syscall.O_CLOEXEC); err != nil {
+
+ if d.hostFD < 0 && openReadable && h.fd >= 0 {
+ // We have no existing FD; use the new FD for at least reading.
+ d.hostFD = h.fd
+ } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
+ // We have an existing read-only FD, but the file has just been
+ // opened for writing, so we need to start supporting writable memory
+ // mappings. This may race with callers of d.pf.FD() using the existing
+ // FD, so in most cases we need to delay closing the old FD until after
+ // invalidating memmap.Translations that might have observed it.
+ if !openReadable || h.fd < 0 {
+ // We don't have a read/write FD, so we have no FD that can be
+ // used to create writable memory mappings. Switch to using the
+ // internal page cache.
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = -1
+ } else if d.fs.opts.overlayfsStaleRead {
+ // We do have a read/write FD, but it may not be coherent with
+ // the existing read-only FD, so we must switch to mappings of
+ // the new FD in both the application and sentry.
+ if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, d.handle.fd, err)
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
h.close(ctx)
return err
}
- syscall.Close(int(h.fd))
- h.fd = d.handle.fd
- if d.fs.opts.overlayfsStaleRead {
- // Replace sentry mappings of the old FD with mappings of
- // the new FD, since the two are not necessarily coherent.
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
- h.close(ctx)
- return err
- }
+ invalidateTranslations = true
+ fdToClose = d.hostFD
+ d.hostFD = h.fd
+ } else {
+ // We do have a read/write FD. To avoid invalidating existing
+ // memmap.Translations (which is expensive), use dup3 to make
+ // the old file descriptor refer to the new file description,
+ // then close the new file descriptor (which is no longer
+ // needed). Racing callers of d.pf.FD() may use the old or new
+ // file description, but this doesn't matter since they refer
+ // to the same file, and any racing mappings must be read-only.
+ if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
+ oldHostFD := d.hostFD
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
+ h.close(ctx)
+ return err
}
- // Clunk the old fid before making the new handle visible (by
- // unlocking d.handleMu).
- d.handle.file.close(ctx)
+ fdToClose = h.fd
}
+ } else {
+ // h.fd is not useful.
+ fdToClose = h.fd
+ }
+
+ // Switch to new fids.
+ var oldReadFile p9file
+ if openReadable {
+ oldReadFile = d.readFile
+ d.readFile = h.file
+ }
+ var oldWriteFile p9file
+ if openWritable {
+ oldWriteFile = d.writeFile
+ d.writeFile = h.file
+ }
+ // NOTE(b/141991141): Clunk old fids before making new fids visible (by
+ // unlocking d.handleMu).
+ if !oldReadFile.isNil() {
+ oldReadFile.close(ctx)
+ }
+ if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+ oldWriteFile.close(ctx)
}
- // Switch to the new handle.
- d.handle = h
- d.handleReadable = wantReadable
- d.handleWritable = wantWritable
}
d.handleMu.Unlock()
- if d.fs.opts.overlayfsStaleRead && haveOldFD {
- // Invalidate application mappings that may be using the old FD; they
+ if invalidateTranslations {
+ // Invalidate application mappings that may be using an old FD; they
// will be replaced with mappings using the new FD after future calls
// to d.Translate(). This requires holding d.mapsMu, which precedes
// d.handleMu in the lock order.
@@ -1450,7 +1545,51 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
d.mappings.InvalidateAll(memmap.InvalidateOpts{})
d.mapsMu.Unlock()
}
+ if fdToClose >= 0 {
+ syscall.Close(int(fdToClose))
+ }
+
+ return nil
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) readHandleLocked() handle {
+ return handle{
+ file: d.readFile,
+ fd: d.hostFD,
+ }
+}
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) writeHandleLocked() handle {
+ return handle{
+ file: d.writeFile,
+ fd: d.hostFD,
+ }
+}
+
+func (d *dentry) syncRemoteFile(ctx context.Context) error {
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ return d.syncRemoteFileLocked(ctx)
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if d.hostFD >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(d.hostFD))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ if !d.writeFile.isNil() {
+ return d.writeFile.fsync(ctx)
+ }
+ if !d.readFile.isNil() {
+ return d.readFile.fsync(ctx)
+ }
return nil
}
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index 56d80bcf8..bfe75dfe4 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -20,10 +20,13 @@ import (
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/pgalloc"
)
func TestDestroyIdempotent(t *testing.T) {
+ ctx := contexttest.Context(t)
fs := filesystem{
+ mfp: pgalloc.MemoryFileProviderFromContext(ctx),
syncableDentries: make(map[*dentry]struct{}),
opts: filesystemOptions{
// Test relies on no dentry being held in the cache.
@@ -31,7 +34,6 @@ func TestDestroyIdempotent(t *testing.T) {
},
}
- ctx := contexttest.Context(t)
attr := &p9.Attr{
Mode: p9.ModeRegular,
}
@@ -50,6 +52,8 @@ func TestDestroyIdempotent(t *testing.T) {
}
parent.cacheNewChildLocked(child, "child")
+ fs.renameMu.Lock()
+ defer fs.renameMu.Unlock()
child.checkCachingLocked(ctx)
if got := atomic.LoadInt64(&child.refs); got != -1 {
t.Fatalf("child.refs=%d, want: -1", got)
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 8792ca4f2..104157512 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -63,6 +63,10 @@ func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (hand
}, nil
}
+func (h *handle) isOpen() bool {
+ return !h.file.isNil()
+}
+
func (h *handle) close(ctx context.Context) {
h.file.close(ctx)
h.file = p9file{}
@@ -124,18 +128,3 @@ func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, o
}
return cp, cperr
}
-
-func (h *handle) sync(ctx context.Context) error {
- // Handle most common case first.
- if h.fd >= 0 {
- ctx.UninterruptibleSleepStart(false)
- err := syscall.Fsync(int(h.fd))
- ctx.UninterruptibleSleepFinish(false)
- return err
- }
- if h.file.isNil() {
- // File hasn't been touched, there is nothing to sync.
- return nil
- }
- return h.file.fsync(ctx)
-}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index db6bed4f6..7e1cbf065 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -64,34 +64,34 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return d.handle.file.flush(ctx)
+ if d.writeFile.isNil() {
+ return nil
+ }
+ return d.writeFile.flush(ctx)
}
// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
-
d := fd.dentry()
d.metadataMu.Lock()
defer d.metadataMu.Unlock()
- size := offset + length
-
// Allocating a smaller size is a noop.
- if size <= d.size {
+ size := offset + length
+ if d.cachedMetadataAuthoritative() && size <= d.size {
return nil
}
- d.handleMu.Lock()
- defer d.handleMu.Unlock()
-
- err := d.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ d.handleMu.RLock()
+ err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+ d.handleMu.RUnlock()
if err != nil {
return err
}
d.dataMu.Lock()
atomic.StoreUint64(&d.size, size)
d.dataMu.Unlock()
- if !d.cachedMetadataAuthoritative() {
+ if d.cachedMetadataAuthoritative() {
d.touchCMtimeLocked()
}
return nil
@@ -113,7 +113,7 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
// Check for reading at EOF before calling into MM (but not under
// InteropModeShared, which makes d.size unreliable).
d := fd.dentry()
- if d.fs.opts.interop != InteropModeShared && uint64(offset) >= atomic.LoadUint64(&d.size) {
+ if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
return 0, io.EOF
}
@@ -217,16 +217,23 @@ func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
n, err := src.CopyInTo(ctx, rw)
if err != nil {
- return n, offset, err
+ return n, offset + n, err
}
if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
- // Write dirty cached pages touched by the write back to the remote file.
+ // Note that if any of the following fail, then we can't guarantee that
+ // any data was actually written with the semantics of O_DSYNC or
+ // O_SYNC, so we return zero bytes written. Compare Linux's
+ // mm/filemap.c:generic_file_write_iter() =>
+ // include/linux/fs.h:generic_write_sync().
+ //
+ // Write dirty cached pages touched by the write back to the remote
+ // file.
if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
- return n, offset, err
+ return 0, offset, err
}
// Request the remote filesystem to sync the remote file.
- if err := d.handle.sync(ctx); err != nil {
- return n, offset, err
+ if err := d.syncRemoteFile(ctx); err != nil {
+ return 0, offset, err
}
}
return n, offset + n, nil
@@ -317,10 +324,11 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents and makes dentry.size
// unreliable), or if the file was opened O_DIRECT, read directly from
- // dentry.handle without locking dentry.dataMu.
+ // dentry.readHandleLocked() without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, dsts, rw.off)
+ h := rw.d.readHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
rw.d.handleMu.RUnlock()
rw.off += n
return n, err
@@ -388,7 +396,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
End: gapEnd,
}
optMR := gap.Range()
- err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
+ err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
seg, gap = rw.d.cache.Find(rw.off)
if !seg.Ok() {
@@ -403,7 +411,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
} else {
// Read directly from the file.
gapDsts := dsts.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+ n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
done += n
rw.off += n
dsts = dsts.DropFirst64(n)
@@ -435,11 +443,12 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// If we have a mmappable host FD (which must be used here to ensure
// coherence with memory-mapped I/O), or if InteropModeShared is in effect
// (which prevents us from caching file contents), or if the file was
- // opened with O_DIRECT, write directly to dentry.handle without locking
- // dentry.dataMu.
+ // opened with O_DIRECT, write directly to dentry.writeHandleLocked()
+ // without locking dentry.dataMu.
rw.d.handleMu.RLock()
- if (rw.d.handle.fd >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+ h := rw.d.writeHandleLocked()
+ if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
rw.off += n
rw.d.dataMu.Lock()
if rw.off > rw.d.size {
@@ -501,7 +510,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// for detecting or avoiding this.
gapMR := gap.Range().Intersect(mr)
gapSrcs := srcs.TakeFirst64(gapMR.Length())
- n, err := rw.d.handle.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+ n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
done += n
rw.off += n
srcs = srcs.DropFirst64(n)
@@ -527,7 +536,7 @@ exitLoop:
if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
Start: start,
End: rw.off,
- }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, rw.d.handle.writeFromBlocksAt); err != nil {
+ }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
// We have no idea how many bytes were actually flushed.
rw.off = start
done = 0
@@ -545,6 +554,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
}
d.handleMu.RLock()
defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
// Compute the range of valid bytes (overflow-checked).
@@ -558,7 +568,7 @@ func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
return fsutil.SyncDirty(ctx, memmap.MappableRange{
Start: uint64(offset),
End: uint64(end),
- }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+ }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
}
// Seek implements vfs.FileDescriptionImpl.Seek.
@@ -615,24 +625,23 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
- return fd.dentry().syncSharedHandle(ctx)
+ return fd.dentry().syncCachedFile(ctx)
}
-func (d *dentry) syncSharedHandle(ctx context.Context) error {
+func (d *dentry) syncCachedFile(ctx context.Context) error {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- if d.handleWritable {
+ if h := d.writeHandleLocked(); h.isOpen() {
d.dataMu.Lock()
// Write dirty cached data to the remote file.
- err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), d.handle.writeFromBlocksAt)
+ err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
d.dataMu.Unlock()
if err != nil {
return err
}
}
- // Sync the remote file.
- return d.handle.sync(ctx)
+ return d.syncRemoteFileLocked(ctx)
}
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -656,7 +665,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
return syserror.ENODEV
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
if !haveFD {
return syserror.ENODEV
@@ -677,7 +686,7 @@ func (d *dentry) mayCachePages() bool {
return true
}
d.handleMu.RLock()
- haveFD := d.handle.fd >= 0
+ haveFD := d.hostFD >= 0
d.handleMu.RUnlock()
return haveFD
}
@@ -735,7 +744,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR,
// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
d.handleMu.RLock()
- if d.handle.fd >= 0 && !d.fs.opts.forcePageCache {
+ if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
d.handleMu.RUnlock()
mr := optional
if d.fs.opts.limitHostFDTranslation {
@@ -771,7 +780,8 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
}
mf := d.fs.mfp.MemoryFile()
- cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, d.handle.readToBlocksAt)
+ h := d.readHandleLocked()
+ cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
var ts []memmap.Translation
var translatedEnd uint64
@@ -840,9 +850,12 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Write the cache's contents back to the remote file so that if we have a
// host fd after restore, the remote file's contents are coherent.
mf := d.fs.mfp.MemoryFile()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
return err
}
@@ -857,20 +870,23 @@ func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+ mr := memmap.MappableRange{er.Start, er.End}
+ mf := d.fs.mfp.MemoryFile()
d.mapsMu.Lock()
defer d.mapsMu.Unlock()
+ d.handleMu.RLock()
+ defer d.handleMu.RUnlock()
+ h := d.writeHandleLocked()
d.dataMu.Lock()
defer d.dataMu.Unlock()
- mr := memmap.MappableRange{er.Start, er.End}
- mf := d.fs.mfp.MemoryFile()
// Only allow pages that are no longer memory-mapped to be evicted.
for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
mgapMR := mgap.Range().Intersect(mr)
if mgapMR.Length() == 0 {
continue
}
- if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, d.handle.writeFromBlocksAt); err != nil {
+ if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
}
d.cache.Drop(mgapMR, mf)
@@ -882,8 +898,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
-// is available (i.e. dentry.handle.fd >= 0), and that FD is used for
-// application memory mappings (i.e. !filesystem.opts.forcePageCache).
+// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// memory mappings (i.e. !filesystem.opts.forcePageCache).
type dentryPlatformFile struct {
*dentry
@@ -891,8 +907,8 @@ type dentryPlatformFile struct {
// by dentry.dataMu.
fdRefs fsutil.FrameRefSet
- // If this dentry represents a regular file, and handle.fd >= 0,
- // hostFileMapper caches mappings of handle.fd.
+ // If this dentry represents a regular file, and dentry.hostFD >= 0,
+ // hostFileMapper caches mappings of dentry.hostFD.
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
@@ -916,15 +932,13 @@ func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
// MapInternal implements memmap.File.MapInternal.
func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
d.handleMu.RLock()
- bs, err := d.hostFileMapper.MapInternal(fr, int(d.handle.fd), at.Write)
- d.handleMu.RUnlock()
- return bs, err
+ defer d.handleMu.RUnlock()
+ return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
}
// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
d.handleMu.RLock()
- fd := d.handle.fd
- d.handleMu.RUnlock()
- return int(fd)
+ defer d.handleMu.RUnlock()
+ return int(d.hostFD)
}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index fc269ef2b..a6368fdd0 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -17,6 +17,7 @@ package gofer
import (
"sync"
"sync/atomic"
+ "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -279,5 +280,13 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *specialFileFD) Sync(ctx context.Context) error {
- return fd.dentry().syncSharedHandle(ctx)
+ // If we have a host FD, fsyncing it is likely to be faster than an fsync
+ // RPC.
+ if fd.handle.fd >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(fd.handle.fd))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
+ return fd.handle.file.fsync(ctx)
}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index 0eef4e16e..2cb8191b9 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -47,6 +47,7 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
now := d.fs.clock.Now().Nanoseconds()
d.metadataMu.Lock()
atomic.StoreInt64(&d.atime, now)
+ atomic.StoreUint32(&d.atimeDirty, 1)
d.metadataMu.Unlock()
mnt.EndWrite()
}
@@ -67,6 +68,7 @@ func (d *dentry) touchCMtime() {
d.metadataMu.Lock()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
d.metadataMu.Unlock()
}
@@ -76,4 +78,5 @@ func (d *dentry) touchCMtimeLocked() {
now := d.fs.clock.Now().Nanoseconds()
atomic.StoreInt64(&d.mtime, now)
atomic.StoreInt64(&d.ctime, now)
+ atomic.StoreUint32(&d.mtimeDirty, 1)
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6dac2afa4..b71778128 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -55,7 +55,8 @@ func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *ke
if stack := k.RootNetworkNamespace().Stack(); stack != nil {
contents = map[string]*kernfs.Dentry{
"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
- "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+ "tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
+ "tcp_sack": fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
// The following files are simple stubs until they are implemented in
// netstack, most of these files are configuration related. We use the
@@ -207,3 +208,49 @@ func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset
*d.enabled = v != 0
return n, d.stack.SetTCPSACKEnabled(*d.enabled)
}
+
+// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_recovery.
+//
+// +stateify savable
+type tcpRecoveryData struct {
+ kernfs.DynamicBytesFile
+
+ stack inet.Stack `state:"wait"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ recovery, err := d.stack.TCPRecovery()
+ if err != nil {
+ return err
+ }
+
+ buf.WriteString(fmt.Sprintf("%d\n", recovery))
+ return nil
+}
+
+func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if offset != 0 {
+ // No need to handle partial writes thus far.
+ return 0, syserror.EINVAL
+ }
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Limit the amount of memory allocated.
+ src = src.TakeFirst(usermem.PageSize - 1)
+
+ var v int32
+ n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+ if err != nil {
+ return 0, err
+ }
+ if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
+ return 0, err
+ }
+ return n, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index fb77f95cc..065812065 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -566,7 +566,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if replaced != nil {
newParentDir.removeChildLocked(replaced)
if replaced.inode.isDir() {
- newParentDir.inode.decLinksLocked(ctx) // from replaced's ".."
+ // Remove links for replaced/. and replaced/..
+ replaced.inode.decLinksLocked(ctx)
+ newParentDir.inode.decLinksLocked(ctx)
}
replaced.inode.decLinksLocked(ctx)
}
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 2916a0644..c0b4831d1 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -56,6 +56,12 @@ type Stack interface {
// settings.
SetTCPSACKEnabled(enabled bool) error
+ // TCPRecovery returns the TCP loss detection algorithm.
+ TCPRecovery() (TCPLossRecovery, error)
+
+ // SetTCPRecovery attempts to change TCP loss detection algorithm.
+ SetTCPRecovery(recovery TCPLossRecovery) error
+
// Statistics reports stack statistics.
Statistics(stat interface{}, arg string) error
@@ -189,3 +195,14 @@ type StatSNMPUDP [8]uint64
// StatSNMPUDPLite describes UdpLite line of /proc/net/snmp.
type StatSNMPUDPLite [8]uint64
+
+// TCPLossRecovery indicates TCP loss detection and recovery methods to use.
+type TCPLossRecovery int32
+
+// Loss recovery constants from include/net/tcp.h which are used to set
+// /proc/sys/net/ipv4/tcp_recovery.
+const (
+ TCP_RACK_LOSS_DETECTION TCPLossRecovery = 1 << iota
+ TCP_RACK_STATIC_REO_WND
+ TCP_RACK_NO_DUPTHRESH
+)
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index d8961fc94..9771f01fc 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -25,6 +25,7 @@ type TestStack struct {
TCPRecvBufSize TCPBufferSize
TCPSendBufSize TCPBufferSize
TCPSACKFlag bool
+ Recovery TCPLossRecovery
}
// NewTestStack returns a TestStack with no network interfaces. The value of
@@ -91,6 +92,17 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error {
return nil
}
+// TCPRecovery implements Stack.TCPRecovery.
+func (s *TestStack) TCPRecovery() (TCPLossRecovery, error) {
+ return s.Recovery, nil
+}
+
+// SetTCPRecovery implements Stack.SetTCPRecovery.
+func (s *TestStack) SetTCPRecovery(recovery TCPLossRecovery) error {
+ s.Recovery = recovery
+ return nil
+}
+
// Statistics implements inet.Stack.Statistics.
func (s *TestStack) Statistics(stat interface{}, arg string) error {
return nil
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index dba563160..ed5ae03d3 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -49,7 +49,7 @@ func bluepillArchEnter(context *arch.SignalContext64) (c *vCPU) {
regs.Sp = context.Sp
regs.Pc = context.Pc
regs.Pstate = context.Pstate
- regs.Pstate &^= uint64(ring0.KernelFlagsClear)
+ regs.Pstate &^= uint64(ring0.PsrFlagsClear)
regs.Pstate |= ring0.KernelFlagsSet
return
}
@@ -63,7 +63,7 @@ func bluepillArchExit(c *vCPU, context *arch.SignalContext64) {
context.Sp = regs.Sp
context.Pc = regs.Pc
context.Pstate = regs.Pstate
- context.Pstate &^= uint64(ring0.UserFlagsClear)
+ context.Pstate &^= uint64(ring0.PsrFlagsClear)
context.Pstate |= ring0.UserFlagsSet
lazyVfp := c.GetLazyVFP()
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index 8b64f3a1e..b35c930e2 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -41,7 +41,7 @@ func fpsimdPtr(addr *byte) *arch.FpsimdContext {
func dieArchSetup(c *vCPU, context *arch.SignalContext64, guestRegs *userRegs) {
// If the vCPU is in user mode, we set the stack to the stored stack
// value in the vCPU itself. We don't want to unwind the user stack.
- if guestRegs.Regs.Pstate&ring0.PSR_MODE_MASK == ring0.PSR_MODE_EL0t {
+ if guestRegs.Regs.Pstate&ring0.PsrModeMask == ring0.UserFlagsSet {
regs := c.CPU.Registers()
context.Regs[0] = regs.Regs[0]
context.Sp = regs.Sp
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index ff8c068c0..307a7645f 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -151,12 +151,6 @@ func (c *vCPU) initArchState() error {
// the MMIO address base.
arm64HypercallMMIOBase = toLocation
- data = ring0.PsrDefaultSet | ring0.KernelFlagsSet
- reg.id = _KVM_ARM64_REGS_PSTATE
- if err := c.setOneRegister(&reg); err != nil {
- return err
- }
-
// Initialize the PCID database.
if hasGuestPCID {
// Note that NewPCIDs may return a nil table here, in which
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 8122ac6e2..87a573cc4 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -26,30 +26,31 @@ const (
_PMD_PGT_SIZE = 0x4000
_PTE_PGT_BASE = 0x7000
_PTE_PGT_SIZE = 0x1000
-
- _PSR_D_BIT = 0x00000200
- _PSR_A_BIT = 0x00000100
- _PSR_I_BIT = 0x00000080
- _PSR_F_BIT = 0x00000040
)
const (
- // PSR bits
- PSR_MODE_EL0t = 0x00000000
- PSR_MODE_EL1t = 0x00000004
- PSR_MODE_EL1h = 0x00000005
- PSR_MODE_MASK = 0x0000000f
+ // DAIF bits:debug, sError, IRQ, FIQ.
+ _PSR_D_BIT = 0x00000200
+ _PSR_A_BIT = 0x00000100
+ _PSR_I_BIT = 0x00000080
+ _PSR_F_BIT = 0x00000040
+ _PSR_DAIF_SHIFT = 6
+ _PSR_DAIF_MASK = 0xf << _PSR_DAIF_SHIFT
- // KernelFlagsSet should always be set in the kernel.
- KernelFlagsSet = PSR_MODE_EL1h
+ // PSR bits.
+ _PSR_MODE_EL0t = 0x00000000
+ _PSR_MODE_EL1t = 0x00000004
+ _PSR_MODE_EL1h = 0x00000005
+ _PSR_MODE_MASK = 0x0000000f
- // UserFlagsSet are always set in userspace.
- UserFlagsSet = PSR_MODE_EL0t
+ PsrFlagsClear = _PSR_MODE_MASK | _PSR_DAIF_MASK
+ PsrModeMask = _PSR_MODE_MASK
- KernelFlagsClear = PSR_MODE_MASK
- UserFlagsClear = PSR_MODE_MASK
+ // KernelFlagsSet should always be set in the kernel.
+ KernelFlagsSet = _PSR_MODE_EL1h | _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
- PsrDefaultSet = _PSR_D_BIT | _PSR_A_BIT | _PSR_I_BIT | _PSR_F_BIT
+ // UserFlagsSet are always set in userspace.
+ UserFlagsSet = _PSR_MODE_EL0t
)
// Vector is an exception vector.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index 9fd02d628..d8a7bc2f9 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -312,12 +312,6 @@
ISB $15; \
DSB $15;
-#define IRQ_ENABLE \
- MSR $2, DAIFSet;
-
-#define IRQ_DISABLE \
- MSR $2, DAIFClr;
-
#define VFP_ENABLE \
MOVD $FPEN_ENABLE, R0; \
WORD $0xd5181040; \ //MSR R0, CPACR_EL1
@@ -509,8 +503,6 @@ TEXT ·kernelExitToEl1(SB),NOSPLIT,$0
// Start is the CPU entrypoint.
TEXT ·Start(SB),NOSPLIT,$0
- IRQ_DISABLE
-
// Init.
MOVD $SCTLR_EL1_DEFAULT, R1
MSR R1, SCTLR_EL1
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index d483ff03c..42009dac0 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -56,7 +56,7 @@ func (c *CPU) SwitchToUser(switchOpts SwitchOpts) (vector Vector) {
// Sanitize registers.
regs := switchOpts.Registers
- regs.Pstate &= ^uint64(UserFlagsClear)
+ regs.Pstate &= ^uint64(PsrFlagsClear)
regs.Pstate |= UserFlagsSet
LoadFloatingPoint(switchOpts.FloatingPointState)
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index a48082631..fda3dcb35 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -53,6 +53,7 @@ type Stack struct {
interfaceAddrs map[int32][]inet.InterfaceAddr
routes []inet.Route
supportsIPv6 bool
+ tcpRecovery inet.TCPLossRecovery
tcpRecvBufSize inet.TCPBufferSize
tcpSendBufSize inet.TCPBufferSize
tcpSACKEnabled bool
@@ -350,6 +351,16 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
return syserror.EACCES
}
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ return s.tcpRecovery, nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+ return syserror.EACCES
+}
+
// getLine reads one line from proc file, with specified prefix.
// The last argument, withHeader, specifies if it contains line header.
func getLine(f *os.File, prefix string, withHeader bool) string {
diff --git a/pkg/sentry/socket/netstack/stack.go b/pkg/sentry/socket/netstack/stack.go
index 67737ae87..f0fe18684 100644
--- a/pkg/sentry/socket/netstack/stack.go
+++ b/pkg/sentry/socket/netstack/stack.go
@@ -207,6 +207,20 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enabled))).ToError()
}
+// TCPRecovery implements inet.Stack.TCPRecovery.
+func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) {
+ var recovery tcp.Recovery
+ if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil {
+ return 0, syserr.TranslateNetstackError(err).ToError()
+ }
+ return inet.TCPLossRecovery(recovery), nil
+}
+
+// SetTCPRecovery implements inet.Stack.SetTCPRecovery.
+func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error {
+ return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.Recovery(recovery))).ToError()
+}
+
// Statistics implements inet.Stack.Statistics.
func (s *Stack) Statistics(stat interface{}, arg string) error {
switch stats := stat.(type) {
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
index 519583e4e..c4c0f9e0a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/memfd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -51,6 +51,7 @@ func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.S
if err != nil {
return 0, nil, err
}
+ defer file.DecRef(t)
fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
CloseOnExec: cloExec,
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 576ab3920..d3c1197e3 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -356,6 +356,8 @@ type FileDescriptionImpl interface {
// Allocate grows the file to offset + length bytes.
// Only mode == 0 is supported currently.
+ //
+ // Preconditions: The FileDescription was opened for writing.
Allocate(ctx context.Context, mode, offset, length uint64) error
// waiter.Waitable methods may be used to poll for I/O events.
@@ -565,6 +567,9 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
// Allocate grows file represented by FileDescription to offset + length bytes.
func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+ if !fd.IsWritable() {
+ return syserror.EBADF
+ }
return fd.impl.Allocate(ctx, mode, offset, length)
}