summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/fs/gofer/path.go3
-rw-r--r--pkg/sentry/fs/proc/sys_net.go4
-rw-r--r--pkg/sentry/fs/tmpfs/inode_file.go6
-rw-r--r--pkg/sentry/fs/tmpfs/tmpfs.go2
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/dev_test.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go8
-rw-r--r--pkg/sentry/fsimpl/fuse/utils_test.go1
-rw-r--r--pkg/sentry/fsimpl/gofer/directory.go4
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go26
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go214
-rw-r--r--pkg/sentry/fsimpl/gofer/regular_file.go20
-rw-r--r--pkg/sentry/fsimpl/gofer/save_restore.go4
-rw-r--r--pkg/sentry/fsimpl/host/host.go2
-rw-r--r--pkg/sentry/fsimpl/host/socket.go6
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go54
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go18
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs.go16
-rw-r--r--pkg/sentry/fsimpl/kernfs/kernfs_test.go4
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go2
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go133
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go23
-rw-r--r--pkg/sentry/fsimpl/pipefs/pipefs.go2
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go2
-rw-r--r--pkg/sentry/fsimpl/proc/task.go2
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go4
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go2
-rw-r--r--pkg/sentry/fsimpl/sys/sys.go2
-rw-r--r--pkg/sentry/fsimpl/tmpfs/named_pipe.go3
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go2
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go6
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go50
-rw-r--r--pkg/sentry/fsimpl/verity/verity_test.go869
-rw-r--r--pkg/sentry/hostfd/BUILD2
-rw-r--r--pkg/sentry/hostfd/hostfd_linux.go18
-rw-r--r--pkg/sentry/hostfd/hostfd_unsafe.go9
-rw-r--r--pkg/sentry/kernel/fd_table_unsafe.go2
-rw-r--r--pkg/sentry/kernel/fs_context.go6
-rw-r--r--pkg/sentry/kernel/ipc_namespace.go2
-rw-r--r--pkg/sentry/kernel/pipe/node_test.go8
-rw-r--r--pkg/sentry/kernel/pipe/pipe.go50
-rw-r--r--pkg/sentry/kernel/pipe/pipe_test.go11
-rw-r--r--pkg/sentry/kernel/pipe/vfs.go4
-rw-r--r--pkg/sentry/kernel/ptrace.go4
-rw-r--r--pkg/sentry/kernel/semaphore/semaphore.go46
-rw-r--r--pkg/sentry/kernel/sessions.go38
-rw-r--r--pkg/sentry/kernel/shm/shm.go2
-rw-r--r--pkg/sentry/kernel/task_clone.go2
-rw-r--r--pkg/sentry/kernel/task_usermem.go95
-rw-r--r--pkg/sentry/kernel/vdso.go2
-rw-r--r--pkg/sentry/mm/aio_context.go2
-rw-r--r--pkg/sentry/mm/special_mappable.go2
-rw-r--r--pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go12
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64.go17
-rw-r--r--pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go17
-rw-r--r--pkg/sentry/platform/kvm/bluepill_unsafe.go7
-rw-r--r--pkg/sentry/platform/kvm/kvm.go3
-rw-r--r--pkg/sentry/platform/kvm/kvm_const_arm64.go3
-rw-r--r--pkg/sentry/platform/kvm/machine.go15
-rw-r--r--pkg/sentry/platform/kvm/machine_amd64.go31
-rw-r--r--pkg/sentry/platform/kvm/machine_arm64_unsafe.go8
-rw-r--r--pkg/sentry/platform/ring0/aarch64.go67
-rw-r--r--pkg/sentry/platform/ring0/defs.go3
-rw-r--r--pkg/sentry/platform/ring0/defs_amd64.go10
-rw-r--r--pkg/sentry/platform/ring0/defs_arm64.go7
-rw-r--r--pkg/sentry/platform/ring0/entry_arm64.s31
-rw-r--r--pkg/sentry/platform/ring0/kernel.go6
-rw-r--r--pkg/sentry/platform/ring0/kernel_amd64.go5
-rw-r--r--pkg/sentry/platform/ring0/kernel_arm64.go4
-rw-r--r--pkg/sentry/platform/ring0/offsets_arm64.go45
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables.go84
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go10
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go21
-rw-r--r--pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go13
-rw-r--r--pkg/sentry/platform/ring0/pagetables/walker_arm64.go2
-rw-r--r--pkg/sentry/socket/netfilter/extensions.go63
-rw-r--r--pkg/sentry/socket/netfilter/ipv4.go4
-rw-r--r--pkg/sentry/socket/netfilter/ipv6.go4
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go52
-rw-r--r--pkg/sentry/socket/netfilter/targets.go217
-rw-r--r--pkg/sentry/socket/netlink/route/protocol.go2
-rw-r--r--pkg/sentry/socket/unix/transport/connectioned.go8
-rw-r--r--pkg/sentry/socket/unix/transport/connectionless.go2
-rw-r--r--pkg/sentry/socket/unix/unix.go2
-rw-r--r--pkg/sentry/socket/unix/unix_vfs2.go2
-rw-r--r--pkg/sentry/syscalls/linux/linux64.go4
-rw-r--r--pkg/sentry/syscalls/linux/sys_pipe.go2
-rw-r--r--pkg/sentry/syscalls/linux/sys_sem.go32
-rw-r--r--pkg/sentry/syscalls/linux/sys_splice.go30
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/splice.go23
-rw-r--r--pkg/sentry/vfs/file_description.go2
-rw-r--r--pkg/sentry/vfs/filesystem.go2
-rw-r--r--pkg/sentry/vfs/inotify.go2
-rw-r--r--pkg/sentry/vfs/mount.go13
-rw-r--r--pkg/sentry/vfs/mount_test.go35
95 files changed, 1675 insertions, 1050 deletions
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index 3c66dc3c2..6b3627813 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -25,7 +25,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
@@ -305,7 +304,7 @@ func (i *inodeOperations) createInternalFifo(ctx context.Context, dir *fs.Inode,
}
// First create a pipe.
- p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+ p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize)
// Wrap the fileOps with our Fifo.
iops := &fifo{
diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go
index e555672ad..52061175f 100644
--- a/pkg/sentry/fs/proc/sys_net.go
+++ b/pkg/sentry/fs/proc/sys_net.go
@@ -86,9 +86,9 @@ func (*tcpMemInode) Truncate(context.Context, *fs.Inode, int64) error {
}
// GetFile implements fs.InodeOperations.GetFile.
-func (m *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
+func (t *tcpMemInode) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
flags.Pread = true
- return fs.NewFile(ctx, dirent, flags, &tcpMemFile{tcpMemInode: m}), nil
+ return fs.NewFile(ctx, dirent, flags, &tcpMemFile{tcpMemInode: t}), nil
}
// +stateify savable
diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go
index fc0498f17..d6c65301c 100644
--- a/pkg/sentry/fs/tmpfs/inode_file.go
+++ b/pkg/sentry/fs/tmpfs/inode_file.go
@@ -431,9 +431,6 @@ func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// Continue.
seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
-
- default:
- break
}
}
return done, nil
@@ -532,9 +529,6 @@ func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error)
// Write to that memory as usual.
seg, gap = rw.f.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
-
- default:
- break
}
}
return done, nil
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 998b697ca..cf4ed5de0 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -336,7 +336,7 @@ type Fifo struct {
// NewFifo creates a new named pipe.
func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode {
// First create a pipe.
- p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)
+ p := pipe.NewPipe(true /* isNamed */, pipe.DefaultPipeSize)
// Build pipe InodeOperations.
iops := pipe.NewInodeOperations(ctx, perms, p)
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 346cca558..d8c237753 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -110,7 +110,7 @@ func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.Vir
}
root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- root.EnableLeakCheck()
+ root.InitRefs()
var rootD kernfs.Dentry
rootD.InitRoot(&fs.Filesystem, root)
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 5986133e9..95c475a65 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -315,7 +315,7 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F
readPayload.MarshalUnsafe(outBuf[outHdrLen:])
outIOseq := usermem.BytesIOSequence(outBuf)
- n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{})
+ _, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{})
if err != nil {
t.Fatalf("Write failed :%v", err)
}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 6de416da0..cd0eb56e5 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -219,16 +219,12 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
}
fuseFD := device.Impl().(*DeviceFD)
-
fs := &filesystem{
devMinor: devMinor,
opts: opts,
conn: conn,
}
-
- fs.VFSFilesystem().IncRef()
fuseFD.fs = fs
-
return fs, nil
}
@@ -288,7 +284,7 @@ func (fs *filesystem) newRoot(ctx context.Context, creds *auth.Credentials, mode
i := &inode{fs: fs, nodeID: 1}
i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- i.EnableLeakCheck()
+ i.InitRefs()
var d kernfs.Dentry
d.InitRoot(&fs.Filesystem, i)
@@ -301,7 +297,7 @@ func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FU
i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
atomic.StoreUint64(&i.size, attr.Size)
i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- i.EnableLeakCheck()
+ i.InitRefs()
return i
}
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
index e1d9e3365..b2f4276b8 100644
--- a/pkg/sentry/fsimpl/fuse/utils_test.go
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -72,6 +72,7 @@ func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveReque
if err != nil {
return nil, nil, err
}
+ fs.VFSFilesystem().Init(vfsObj, nil, fs)
return fs.conn, &fuseDev.vfsfd, nil
}
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index ce1b2a390..3b5927702 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -98,7 +98,9 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
uid: uint32(opts.kuid),
gid: uint32(opts.kgid),
blockSize: usermem.PageSize, // arbitrary
- hostFD: -1,
+ readFD: -1,
+ writeFD: -1,
+ mmapFD: -1,
nlink: uint32(2),
}
refsvfs2.Register(child)
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 57a2ca43c..7ab298019 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -30,7 +30,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -372,9 +371,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if len(name) > maxFilenameLen {
return syserror.ENAMETOOLONG
}
- if !dir && rp.MustBeDir() {
- return syserror.ENOENT
- }
if parent.isDeleted() {
return syserror.ENOENT
}
@@ -389,6 +385,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if child := parent.children[name]; child != nil {
return syserror.EEXIST
}
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
if createInSyntheticDir == nil {
return syserror.EPERM
}
@@ -408,6 +407,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if child := parent.children[name]; child != nil && child.isSynthetic() {
return syserror.EEXIST
}
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
// The existence of a non-synthetic dentry at name would be inconclusive
// because the file it represents may have been deleted from the remote
// filesystem, so we would need to make an RPC to revalidate the dentry.
@@ -428,6 +430,9 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if child := parent.children[name]; child != nil {
return syserror.EEXIST
}
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
// No cached dentry exists; however, there might still be an existing file
// at name. As above, we attempt the file creation RPC anyway.
if err := createInRemoteDir(parent, name, &ds); err != nil {
@@ -842,7 +847,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
mode: opts.Mode,
kuid: creds.EffectiveKUID,
kgid: creds.EffectiveKGID,
- pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+ pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize),
})
return nil
}
@@ -1162,18 +1167,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
// Incorporate the fid that was opened by lcreate.
useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
if useRegularFileFD {
+ openFD := int32(-1)
+ if fdobj != nil {
+ openFD = int32(fdobj.Release())
+ }
child.handleMu.Lock()
if vfs.MayReadFileWithOpenFlags(opts.Flags) {
child.readFile = openFile
if fdobj != nil {
- child.hostFD = int32(fdobj.Release())
+ child.readFD = openFD
+ child.mmapFD = openFD
}
- } else if fdobj != nil {
- // Can't use fdobj if it's not readable.
- fdobj.Close()
}
if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
child.writeFile = openFile
+ child.writeFD = openFD
}
child.handleMu.Unlock()
}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 6f82ce61b..53bcc9986 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -548,11 +548,16 @@ func (fs *filesystem) Release(ctx context.Context) {
d.cache.DropAll(mf)
d.dirty.RemoveAll()
d.dataMu.Unlock()
- // Close the host fd if one exists.
- if d.hostFD >= 0 {
- syscall.Close(int(d.hostFD))
- d.hostFD = -1
+ // Close host FDs if they exist.
+ if d.readFD >= 0 {
+ syscall.Close(int(d.readFD))
}
+ if d.writeFD >= 0 && d.readFD != d.writeFD {
+ syscall.Close(int(d.writeFD))
+ }
+ d.readFD = -1
+ d.writeFD = -1
+ d.mmapFD = -1
d.handleMu.Unlock()
}
// There can't be any specialFileFDs still using fs, since each such
@@ -726,15 +731,17 @@ type dentry struct {
// - If this dentry represents a regular file or directory, readFile is the
// p9.File used for reads by all regularFileFDs/directoryFDs representing
- // this dentry.
+ // this dentry, and readFD (if not -1) is a host FD equivalent to readFile
+ // used as a faster alternative.
//
// - If this dentry represents a regular file, writeFile is the p9.File
- // used for writes by all regularFileFDs representing this dentry.
+ // used for writes by all regularFileFDs representing this dentry, and
+ // writeFD (if not -1) is a host FD equivalent to writeFile used as a
+ // faster alternative.
//
- // - If this dentry represents a regular file, hostFD is the host FD used
- // for memory mappings and I/O (when applicable) in preference to readFile
- // and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
- // also be writable. If hostFD is -1, no such host FD is available.
+ // - If this dentry represents a regular file, mmapFD is the host FD used
+ // for memory mappings. If mmapFD is -1, no such FD is available, and the
+ // internal page cache implementation is used for memory mappings instead.
//
// These fields are protected by handleMu.
//
@@ -742,10 +749,17 @@ type dentry struct {
// either p9.File transitions from closed (isNil() == true) to open
// (isNil() == false), it may be mutated with handleMu locked, but cannot
// be closed until the dentry is destroyed.
+ //
+ // readFD and writeFD may or may not be the same file descriptor. mmapFD is
+ // always either -1 or equal to readFD; if !writeFile.isNil() (the file has
+ // been opened for writing), it is additionally either -1 or equal to
+ // writeFD.
handleMu sync.RWMutex `state:"nosave"`
readFile p9file `state:"nosave"`
writeFile p9file `state:"nosave"`
- hostFD int32 `state:"nosave"`
+ readFD int32 `state:"nosave"`
+ writeFD int32 `state:"nosave"`
+ mmapFD int32 `state:"nosave"`
dataMu sync.RWMutex `state:"nosave"`
@@ -829,7 +843,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
uid: uint32(fs.opts.dfltuid),
gid: uint32(fs.opts.dfltgid),
blockSize: usermem.PageSize,
- hostFD: -1,
+ readFD: -1,
+ writeFD: -1,
+ mmapFD: -1,
}
d.pf.dentry = d
if mask.UID {
@@ -1220,7 +1236,9 @@ func (d *dentry) IncRef() {
// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
// d.checkCachingLocked().
r := atomic.AddInt64(&d.refs, 1)
- refsvfs2.LogIncRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogIncRef(d, r)
+ }
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
@@ -1231,7 +1249,9 @@ func (d *dentry) TryIncRef() bool {
return false
}
if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
- refsvfs2.LogTryIncRef(d, r+1)
+ if d.LogRefs() {
+ refsvfs2.LogTryIncRef(d, r+1)
+ }
return true
}
}
@@ -1251,7 +1271,9 @@ func (d *dentry) DecRef(ctx context.Context) {
// responsible for ensuring that d.checkCachingLocked will be called later.
func (d *dentry) decRefNoCaching() int64 {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r < 0 {
panic("gofer.dentry.decRefNoCaching() called without holding a reference")
}
@@ -1469,10 +1491,15 @@ func (d *dentry) destroyLocked(ctx context.Context) {
}
d.readFile = p9file{}
d.writeFile = p9file{}
- if d.hostFD >= 0 {
- syscall.Close(int(d.hostFD))
- d.hostFD = -1
+ if d.readFD >= 0 {
+ syscall.Close(int(d.readFD))
+ }
+ if d.writeFD >= 0 && d.readFD != d.writeFD {
+ syscall.Close(int(d.writeFD))
}
+ d.readFD = -1
+ d.writeFD = -1
+ d.mmapFD = -1
d.handleMu.Unlock()
if !d.file.isNil() {
@@ -1584,7 +1611,8 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
d.handleMu.RUnlock()
}
- fdToClose := int32(-1)
+ var fdsToCloseArr [2]int32
+ fdsToClose := fdsToCloseArr[:0]
invalidateTranslations := false
d.handleMu.Lock()
if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
@@ -1615,56 +1643,88 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
return err
}
- if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
- // We have no existing FD, and the new FD meets the requirements
- // for d.hostFD, so start using it.
- d.hostFD = h.fd
- } else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
- // We have an existing read-only FD, but the file has just been
- // opened for writing, so we need to start supporting writable memory
- // mappings. This may race with callers of d.pf.FD() using the existing
- // FD, so in most cases we need to delay closing the old FD until after
- // invalidating memmap.Translations that might have observed it.
- if !openReadable || h.fd < 0 {
- // We don't have a read/write FD, so we have no FD that can be
- // used to create writable memory mappings. Switch to using the
- // internal page cache.
- invalidateTranslations = true
- fdToClose = d.hostFD
- d.hostFD = -1
- } else if d.fs.opts.overlayfsStaleRead {
- // We do have a read/write FD, but it may not be coherent with
- // the existing read-only FD, so we must switch to mappings of
- // the new FD in both the application and sentry.
- if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
- h.close(ctx)
- return err
+ // Update d.readFD and d.writeFD.
+ if h.fd >= 0 {
+ if openReadable && openWritable && (d.readFD < 0 || d.writeFD < 0 || d.readFD != d.writeFD) {
+ // Replace existing FDs with this one.
+ if d.readFD >= 0 {
+ // We already have a readable FD that may be in use by
+ // concurrent callers of d.pf.FD().
+ if d.fs.opts.overlayfsStaleRead {
+ // If overlayfsStaleRead is in effect, then the new FD
+ // may not be coherent with the existing one, so we
+ // have no choice but to switch to mappings of the new
+ // FD in both the application and sentry.
+ if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+ h.close(ctx)
+ return err
+ }
+ fdsToClose = append(fdsToClose, d.readFD)
+ invalidateTranslations = true
+ d.readFD = h.fd
+ } else {
+ // Otherwise, we want to avoid invalidating existing
+ // memmap.Translations (which is expensive); instead, use
+ // dup3 to make the old file descriptor refer to the new
+ // file description, then close the new file descriptor
+ // (which is no longer needed). Racing callers of d.pf.FD()
+ // may use the old or new file description, but this
+ // doesn't matter since they refer to the same file, and
+ // any racing mappings must be read-only.
+ if err := syscall.Dup3(int(h.fd), int(d.readFD), syscall.O_CLOEXEC); err != nil {
+ oldFD := d.readFD
+ d.handleMu.Unlock()
+ ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
+ h.close(ctx)
+ return err
+ }
+ fdsToClose = append(fdsToClose, h.fd)
+ h.fd = d.readFD
+ }
+ } else {
+ d.readFD = h.fd
}
- invalidateTranslations = true
- fdToClose = d.hostFD
- d.hostFD = h.fd
- } else {
- // We do have a read/write FD. To avoid invalidating existing
- // memmap.Translations (which is expensive), use dup3 to make
- // the old file descriptor refer to the new file description,
- // then close the new file descriptor (which is no longer
- // needed). Racing callers of d.pf.FD() may use the old or new
- // file description, but this doesn't matter since they refer
- // to the same file, and any racing mappings must be read-only.
- if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
- oldHostFD := d.hostFD
- d.handleMu.Unlock()
- ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
- h.close(ctx)
- return err
+ if d.writeFD != h.fd && d.writeFD >= 0 {
+ fdsToClose = append(fdsToClose, d.writeFD)
}
- fdToClose = h.fd
+ d.writeFD = h.fd
+ d.mmapFD = h.fd
+ } else if openReadable && d.readFD < 0 {
+ d.readFD = h.fd
+ // If the file has not been opened for writing, the new FD may
+ // be used for read-only memory mappings. If the file was
+ // previously opened for reading (without an FD), then existing
+ // translations of the file may use the internal page cache;
+ // invalidate those mappings.
+ if d.writeFile.isNil() {
+ invalidateTranslations = !d.readFile.isNil()
+ d.mmapFD = h.fd
+ }
+ } else if openWritable && d.writeFD < 0 {
+ d.writeFD = h.fd
+ if d.readFD >= 0 {
+ // We have an existing read-only FD, but the file has just
+ // been opened for writing, so we need to start supporting
+ // writable memory mappings. However, the new FD is not
+ // readable, so we have no FD that can be used to create
+ // writable memory mappings. Switch to using the internal
+ // page cache.
+ invalidateTranslations = true
+ d.mmapFD = -1
+ }
+ } else {
+ // The new FD is not useful.
+ fdsToClose = append(fdsToClose, h.fd)
}
- } else {
- // h.fd is not useful.
- fdToClose = h.fd
+ } else if openWritable && d.writeFD < 0 && d.mmapFD >= 0 {
+ // We have an existing read-only FD, but the file has just been
+ // opened for writing, so we need to start supporting writable
+ // memory mappings. However, we have no writable host FD. Switch to
+ // using the internal page cache.
+ invalidateTranslations = true
+ d.mmapFD = -1
}
// Switch to new fids.
@@ -1698,8 +1758,8 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
d.mappings.InvalidateAll(memmap.InvalidateOpts{})
d.mapsMu.Unlock()
}
- if fdToClose >= 0 {
- syscall.Close(int(fdToClose))
+ for _, fd := range fdsToClose {
+ syscall.Close(int(fd))
}
return nil
@@ -1709,7 +1769,7 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
func (d *dentry) readHandleLocked() handle {
return handle{
file: d.readFile,
- fd: d.hostFD,
+ fd: d.readFD,
}
}
@@ -1717,7 +1777,7 @@ func (d *dentry) readHandleLocked() handle {
func (d *dentry) writeHandleLocked() handle {
return handle{
file: d.writeFile,
- fd: d.hostFD,
+ fd: d.writeFD,
}
}
@@ -1730,16 +1790,24 @@ func (d *dentry) syncRemoteFile(ctx context.Context) error {
// Preconditions: d.handleMu must be locked.
func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
// If we have a host FD, fsyncing it is likely to be faster than an fsync
- // RPC.
- if d.hostFD >= 0 {
+ // RPC. Prefer syncing write handles over read handles, since some remote
+ // filesystem implementations may not sync changes made through write
+ // handles otherwise.
+ if d.writeFD >= 0 {
ctx.UninterruptibleSleepStart(false)
- err := syscall.Fsync(int(d.hostFD))
+ err := syscall.Fsync(int(d.writeFD))
ctx.UninterruptibleSleepFinish(false)
return err
}
if !d.writeFile.isNil() {
return d.writeFile.fsync(ctx)
}
+ if d.readFD >= 0 {
+ ctx.UninterruptibleSleepStart(false)
+ err := syscall.Fsync(int(d.readFD))
+ ctx.UninterruptibleSleepFinish(false)
+ return err
+ }
if !d.readFile.isNil() {
return d.readFile.fsync(ctx)
}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index dc8a890cb..652142ecc 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -326,7 +326,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
// dentry.readHandleLocked() without locking dentry.dataMu.
rw.d.handleMu.RLock()
h := rw.d.readHandleLocked()
- if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
rw.d.handleMu.RUnlock()
rw.off += n
@@ -446,7 +446,7 @@ func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, erro
// without locking dentry.dataMu.
rw.d.handleMu.RLock()
h := rw.d.writeHandleLocked()
- if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+ if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
rw.off += n
rw.d.dataMu.Lock()
@@ -648,7 +648,7 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
return syserror.ENODEV
}
d.handleMu.RLock()
- haveFD := d.hostFD >= 0
+ haveFD := d.mmapFD >= 0
d.handleMu.RUnlock()
if !haveFD {
return syserror.ENODEV
@@ -669,7 +669,7 @@ func (d *dentry) mayCachePages() bool {
return true
}
d.handleMu.RLock()
- haveFD := d.hostFD >= 0
+ haveFD := d.mmapFD >= 0
d.handleMu.RUnlock()
return haveFD
}
@@ -727,7 +727,7 @@ func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR,
// Translate implements memmap.Mappable.Translate.
func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
d.handleMu.RLock()
- if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
+ if d.mmapFD >= 0 && !d.fs.opts.forcePageCache {
d.handleMu.RUnlock()
mr := optional
if d.fs.opts.limitHostFDTranslation {
@@ -881,7 +881,7 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
-// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
//
// +stateify savable
@@ -892,8 +892,8 @@ type dentryPlatformFile struct {
// by dentry.dataMu.
fdRefs fsutil.FrameRefSet
- // If this dentry represents a regular file, and dentry.hostFD >= 0,
- // hostFileMapper caches mappings of dentry.hostFD.
+ // If this dentry represents a regular file, and dentry.mmapFD >= 0,
+ // hostFileMapper caches mappings of dentry.mmapFD.
hostFileMapper fsutil.HostFileMapper
// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
@@ -918,12 +918,12 @@ func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
+ return d.hostFileMapper.MapInternal(fr, int(d.mmapFD), at.Write)
}
// FD implements memmap.File.FD.
func (d *dentryPlatformFile) FD() int {
d.handleMu.RLock()
defer d.handleMu.RUnlock()
- return int(d.hostFD)
+ return int(d.mmapFD)
}
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
index 17849dcc0..c90071e4e 100644
--- a/pkg/sentry/fsimpl/gofer/save_restore.go
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -139,7 +139,9 @@ func (d *dentry) beforeSave() {
// afterLoad is invoked by stateify.
func (d *dentry) afterLoad() {
- d.hostFD = -1
+ d.readFD = -1
+ d.writeFD = -1
+ d.mmapFD = -1
if atomic.LoadInt64(&d.refs) != -1 {
refsvfs2.Register(d)
}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 39b902a3e..435a21d77 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -126,8 +126,8 @@ func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fil
isTTY: isTTY,
savable: savable,
}
+ i.InitRefs()
i.CachedMappable.Init(hostFD)
- i.EnableLeakCheck()
// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
// handle blocking behavior in the sentry.
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 8a447e29f..60acc367f 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -84,6 +84,8 @@ type ConnectedEndpoint struct {
// init performs initialization required for creating new ConnectedEndpoints and
// for restoring them.
func (c *ConnectedEndpoint) init() *syserr.Error {
+ c.InitRefs()
+
family, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN)
if err != nil {
return syserr.FromError(err)
@@ -132,7 +134,6 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
// ConnectedEndpointRefs start off with a single reference. We need two.
e.IncRef()
- e.EnableLeakCheck()
return &e, nil
}
@@ -376,8 +377,7 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
return nil, err
}
- // ConnectedEndpointRefs start off with a single reference. We need two.
+ // e starts off with a single reference. We need two.
e.IncRef()
- e.EnableLeakCheck()
return &e, nil
}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index f81056023..e77523f22 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -207,24 +207,23 @@ func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.Resolving
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
// * isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
- if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
- return "", err
+func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error {
+ if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
}
- pc := rp.Component()
- if pc == "." || pc == ".." {
- return "", syserror.EEXIST
+ if name == "." || name == ".." {
+ return syserror.EEXIST
}
- if len(pc) > linux.NAME_MAX {
- return "", syserror.ENAMETOOLONG
+ if len(name) > linux.NAME_MAX {
+ return syserror.ENAMETOOLONG
}
- if _, ok := parent.children[pc]; ok {
- return "", syserror.EEXIST
+ if _, ok := parent.children[name]; ok {
+ return syserror.EEXIST
}
if parent.VFSDentry().IsDead() {
- return "", syserror.ENOENT
+ return syserror.ENOENT
}
- return pc, nil
+ return nil
}
// checkDeleteLocked checks that the file represented by vfsd may be deleted.
@@ -265,7 +264,7 @@ func (fs *Filesystem) Release(ctx context.Context) {
//
// Precondition: Filesystem.mu is held.
func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) {
- if d.inode.Keep() {
+ if d.inode.Keep() && d != d.fs.root {
d.decRefLocked(ctx)
}
@@ -352,10 +351,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- pc, err := checkCreateLocked(ctx, rp, parent)
- if err != nil {
+ pc := rp.Component()
+ if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
return err
}
+ if rp.MustBeDir() {
+ return syserror.ENOENT
+ }
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
@@ -394,8 +396,8 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- pc, err := checkCreateLocked(ctx, rp, parent)
- if err != nil {
+ pc := rp.Component()
+ if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -430,10 +432,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- pc, err := checkCreateLocked(ctx, rp, parent)
- if err != nil {
+ pc := rp.Component()
+ if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
return err
}
+ if rp.MustBeDir() {
+ return syserror.ENOENT
+ }
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
@@ -657,8 +662,8 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// Can we create the dst dentry?
var dst *Dentry
- pc, err := checkCreateLocked(ctx, rp, dstDir)
- switch err {
+ pc := rp.Component()
+ switch err := checkCreateLocked(ctx, rp.Credentials(), pc, dstDir); err {
case nil:
// Ok, continue with rename as replacement.
case syserror.EEXIST:
@@ -822,10 +827,13 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
parent.dirMu.Lock()
defer parent.dirMu.Unlock()
- pc, err := checkCreateLocked(ctx, rp, parent)
- if err != nil {
+ pc := rp.Component()
+ if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil {
return err
}
+ if rp.MustBeDir() {
+ return syserror.ENOENT
+ }
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index d9d76758a..eac578f25 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -568,13 +568,6 @@ func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) e
return o.Unlink(ctx, name, child)
}
-// +stateify savable
-type renameAcrossDifferentImplementationsError struct{}
-
-func (renameAcrossDifferentImplementationsError) Error() string {
- return "rename across inodes with different implementations"
-}
-
// Rename implements Inode.Rename.
//
// Precondition: Rename may only be called across two directory inodes with
@@ -585,13 +578,18 @@ func (renameAcrossDifferentImplementationsError) Error() string {
//
// Postcondition: reference on any replaced dentry transferred to caller.
func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
+ if !o.writable {
+ return syserror.EPERM
+ }
+
dst, ok := dstDir.(interface{}).(*OrderedChildren)
if !ok {
- return renameAcrossDifferentImplementationsError{}
+ return syserror.EXDEV
}
- if !o.writable || !dst.writable {
+ if !dst.writable {
return syserror.EPERM
}
+
// Note: There's a potential deadlock below if concurrent calls to Rename
// refer to the same src and dst directories in reverse. We avoid any
// ordering issues because the caller is required to serialize concurrent
@@ -662,7 +660,7 @@ var _ Inode = (*StaticDirectory)(nil)
func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
inode := &StaticDirectory{}
inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts)
- inode.EnableLeakCheck()
+ inode.InitRefs()
inode.OrderedChildren.Init(OrderedChildrenOptions{})
links := inode.OrderedChildren.Populate(children)
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index abb477c7d..c14abcff4 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -222,7 +222,9 @@ func (d *Dentry) IncRef() {
// d.refs may be 0 if d.fs.mu is locked, which serializes against
// d.cacheLocked().
r := atomic.AddInt64(&d.refs, 1)
- refsvfs2.LogIncRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogIncRef(d, r)
+ }
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
@@ -233,7 +235,9 @@ func (d *Dentry) TryIncRef() bool {
return false
}
if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
- refsvfs2.LogTryIncRef(d, r+1)
+ if d.LogRefs() {
+ refsvfs2.LogTryIncRef(d, r+1)
+ }
return true
}
}
@@ -242,7 +246,9 @@ func (d *Dentry) TryIncRef() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *Dentry) DecRef(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.fs.mu.Lock()
d.cacheLocked(ctx)
@@ -254,7 +260,9 @@ func (d *Dentry) DecRef(ctx context.Context) {
func (d *Dentry) decRefLocked(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.cacheLocked(ctx)
} else if r < 0 {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 2418eec44..e63588e33 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -109,7 +109,7 @@ func (fs *filesystem) newReadonlyDir(ctx context.Context, creds *auth.Credential
dir := &readonlyDir{}
dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- dir.EnableLeakCheck()
+ dir.InitRefs()
dir.IncLinks(dir.OrderedChildren.Populate(contents))
return dir
}
@@ -147,7 +147,7 @@ func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode
dir.fs = fs
dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
- dir.EnableLeakCheck()
+ dir.InitRefs()
dir.IncLinks(dir.OrderedChildren.Populate(contents))
return dir
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 4506642ca..469f3a33d 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -409,7 +409,7 @@ func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) er
if dirent.Name == "." || dirent.Name == ".." {
continue
}
- child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+ child, _, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 04ca85f1a..bc07d72c0 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -121,63 +122,63 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
// * fs.renameMu must be locked.
// * d.dirMu must be locked.
// * !rp.Done().
-func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, lookupLayer, error) {
if !d.isDir() {
- return nil, syserror.ENOTDIR
+ return nil, lookupLayerNone, syserror.ENOTDIR
}
if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
- return nil, err
+ return nil, lookupLayerNone, err
}
afterSymlink:
name := rp.Component()
if name == "." {
rp.Advance()
- return d, nil
+ return d, d.topLookupLayer(), nil
}
if name == ".." {
if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
- return nil, err
+ return nil, lookupLayerNone, err
} else if isRoot || d.parent == nil {
rp.Advance()
- return d, nil
+ return d, d.topLookupLayer(), nil
}
if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
- return nil, err
+ return nil, lookupLayerNone, err
}
rp.Advance()
- return d.parent, nil
+ return d.parent, d.parent.topLookupLayer(), nil
}
- child, err := fs.getChildLocked(ctx, d, name, ds)
+ child, topLookupLayer, err := fs.getChildLocked(ctx, d, name, ds)
if err != nil {
- return nil, err
+ return nil, topLookupLayer, err
}
if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
- return nil, err
+ return nil, lookupLayerNone, err
}
if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
target, err := child.readlink(ctx)
if err != nil {
- return nil, err
+ return nil, lookupLayerNone, err
}
if err := rp.HandleSymlink(target); err != nil {
- return nil, err
+ return nil, topLookupLayer, err
}
goto afterSymlink // don't check the current directory again
}
rp.Advance()
- return child, nil
+ return child, topLookupLayer, nil
}
// Preconditions:
// * fs.renameMu must be locked.
// * d.dirMu must be locked.
-func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, lookupLayer, error) {
if child, ok := parent.children[name]; ok {
- return child, nil
+ return child, child.topLookupLayer(), nil
}
- child, err := fs.lookupLocked(ctx, parent, name)
+ child, topLookupLayer, err := fs.lookupLocked(ctx, parent, name)
if err != nil {
- return nil, err
+ return nil, topLookupLayer, err
}
if parent.children == nil {
parent.children = make(map[string]*dentry)
@@ -185,16 +186,16 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
parent.children[name] = child
// child's refcount is initially 0, so it may be dropped after traversal.
*ds = appendDentry(*ds, child)
- return child, nil
+ return child, topLookupLayer, nil
}
// Preconditions:
// * fs.renameMu must be locked.
// * parent.dirMu must be locked.
-func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, lookupLayer, error) {
childPath := fspath.Parse(name)
child := fs.newDentry()
- existsOnAnyLayer := false
+ topLookupLayer := lookupLayerNone
var lookupErr error
vfsObj := fs.vfsfs.VirtualFilesystem()
@@ -215,7 +216,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
defer childVD.DecRef(ctx)
mask := uint32(linux.STATX_TYPE)
- if !existsOnAnyLayer {
+ if topLookupLayer == lookupLayerNone {
// Mode, UID, GID, and (for non-directories) inode number come from
// the topmost layer on which the file exists.
mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -238,10 +239,13 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
if isWhiteout(&stat) {
// This is a whiteout, so it "doesn't exist" on this layer, and
// layers below this one are ignored.
+ if isUpper {
+ topLookupLayer = lookupLayerUpperWhiteout
+ }
return false
}
isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
- if existsOnAnyLayer && !isDir {
+ if topLookupLayer != lookupLayerNone && !isDir {
// Directories are not merged with non-directory files from lower
// layers; instead, layers including and below the first
// non-directory file are ignored. (This file must be a directory
@@ -258,8 +262,12 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
} else {
child.lowerVDs = append(child.lowerVDs, childVD)
}
- if !existsOnAnyLayer {
- existsOnAnyLayer = true
+ if topLookupLayer == lookupLayerNone {
+ if isUpper {
+ topLookupLayer = lookupLayerUpper
+ } else {
+ topLookupLayer = lookupLayerLower
+ }
child.mode = uint32(stat.Mode)
child.uid = stat.UID
child.gid = stat.GID
@@ -288,11 +296,11 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
if lookupErr != nil {
child.destroyLocked(ctx)
- return nil, lookupErr
+ return nil, topLookupLayer, lookupErr
}
- if !existsOnAnyLayer {
+ if !topLookupLayer.existsInOverlay() {
child.destroyLocked(ctx)
- return nil, syserror.ENOENT
+ return nil, topLookupLayer, syserror.ENOENT
}
// Device and inode numbers were copied from the topmost layer above;
@@ -306,7 +314,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
if err != nil {
ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
child.destroyLocked(ctx)
- return nil, err
+ return nil, topLookupLayer, err
}
child.devMajor = linux.UNNAMED_MAJOR
child.devMinor = childDevMinor
@@ -315,7 +323,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
parent.IncRef()
child.parent = parent
child.name = name
- return child, nil
+ return child, topLookupLayer, nil
}
// lookupLayerLocked is similar to lookupLocked, but only returns information
@@ -414,7 +422,7 @@ func (ll lookupLayer) existsInOverlay() bool {
func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
for !rp.Final() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ next, _, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -434,7 +442,7 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
d := rp.Start().Impl().(*dentry)
for !rp.Done() {
d.dirMu.Lock()
- next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+ next, _, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
d.dirMu.Unlock()
if err != nil {
return nil, err
@@ -469,9 +477,6 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
if name == "." || name == ".." {
return syserror.EEXIST
}
- if !dir && rp.MustBeDir() {
- return syserror.ENOENT
- }
if parent.vfsd.IsDead() {
return syserror.ENOENT
}
@@ -495,6 +500,10 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
return syserror.EEXIST
}
+ if !dir && rp.MustBeDir() {
+ return syserror.ENOENT
+ }
+
// Ensure that the parent directory is copied-up so that we can create the
// new file in the upper layer.
if err := parent.copyUpLocked(ctx); err != nil {
@@ -797,9 +806,9 @@ afterTrailingSymlink:
}
// Determine whether or not we need to create a file.
parent.dirMu.Lock()
- child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+ child, topLookupLayer, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
if err == syserror.ENOENT && mayCreate {
- fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+ fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds, topLookupLayer == lookupLayerUpperWhiteout)
parent.dirMu.Unlock()
return fd, err
}
@@ -899,7 +908,7 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
// Preconditions:
// * parent.dirMu must be locked.
// * parent does not already contain a child named rp.Component().
-func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry, haveUpperWhiteout bool) (*vfs.FileDescription, error) {
creds := rp.Credentials()
if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
return nil, err
@@ -924,19 +933,12 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
Start: parent.upperVD,
Path: fspath.Parse(childName),
}
- // We don't know if a whiteout exists on the upper layer; speculatively
- // unlink it.
- //
- // TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do
- // know whether a whiteout exists.
- var haveUpperWhiteout bool
- switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err {
- case nil:
- haveUpperWhiteout = true
- case syserror.ENOENT:
- haveUpperWhiteout = false
- default:
- return nil, err
+ // Unlink the whiteout if it exists.
+ if haveUpperWhiteout {
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+ log.Warningf("overlay.filesystem.createAndOpenLocked: failed to unlink whiteout: %v", err)
+ return nil, err
+ }
}
// Create the file on the upper layer, and get an FD representing it.
upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
@@ -967,7 +969,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
}
// Re-lookup to get a dentry representing the new file, which is needed for
// the returned FD.
- child, err := fs.getChildLocked(ctx, parent, childName, ds)
+ child, _, err := fs.getChildLocked(ctx, parent, childName, ds)
if err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
@@ -1047,7 +1049,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// directory, we need to check for write permission on it.
oldParent.dirMu.Lock()
defer oldParent.dirMu.Unlock()
- renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+ renamed, _, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
if err != nil {
return err
}
@@ -1079,20 +1081,17 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if newParent.vfsd.IsDead() {
return syserror.ENOENT
}
- replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
- if err != nil {
- return err
- }
var (
- replaced *dentry
- replacedVFSD *vfs.Dentry
- whiteouts map[string]bool
+ replaced *dentry
+ replacedVFSD *vfs.Dentry
+ replacedLayer lookupLayer
+ whiteouts map[string]bool
)
- if replacedLayer.existsInOverlay() {
- replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
- if err != nil {
- return err
- }
+ replaced, replacedLayer, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil && err != syserror.ENOENT {
+ return err
+ }
+ if replaced != nil {
replacedVFSD = &replaced.vfsd
if replaced.isDir() {
if !renamed.isDir() {
@@ -1296,7 +1295,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// Unlike UnlinkAt, we need a dentry representing the child directory being
// removed in order to verify that it's empty.
- child, err := fs.getChildLocked(ctx, parent, name, &ds)
+ child, _, err := fs.getChildLocked(ctx, parent, name, &ds)
if err != nil {
return err
}
@@ -1548,7 +1547,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if parentMode&linux.S_ISVTX != 0 {
// If the parent's sticky bit is set, we need a child dentry to get
// its owner.
- child, err = fs.getChildLocked(ctx, parent, name, &ds)
+ child, _, err = fs.getChildLocked(ctx, parent, name, &ds)
if err != nil {
return err
}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index f6c58f2e7..3492409b2 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -514,7 +514,9 @@ func (d *dentry) IncRef() {
// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
// d.checkDropLocked().
r := atomic.AddInt64(&d.refs, 1)
- refsvfs2.LogIncRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogIncRef(d, r)
+ }
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
@@ -525,7 +527,9 @@ func (d *dentry) TryIncRef() bool {
return false
}
if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
- refsvfs2.LogTryIncRef(d, r+1)
+ if d.LogRefs() {
+ refsvfs2.LogTryIncRef(d, r+1)
+ }
return true
}
}
@@ -534,7 +538,9 @@ func (d *dentry) TryIncRef() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.fs.renameMu.Lock()
d.checkDropLocked(ctx)
@@ -546,7 +552,9 @@ func (d *dentry) DecRef(ctx context.Context) {
func (d *dentry) decRefLocked(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.checkDropLocked(ctx)
} else if r < 0 {
@@ -696,6 +704,13 @@ func (d *dentry) topLayer() vfs.VirtualDentry {
return vd
}
+func (d *dentry) topLookupLayer() lookupLayer {
+ if d.upperVD.Ok() {
+ return lookupLayerUpper
+ }
+ return lookupLayerLower
+}
+
func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index e44b79b68..0ecb592cf 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -101,7 +101,7 @@ type inode struct {
func newInode(ctx context.Context, fs *filesystem) *inode {
creds := auth.CredentialsFromContext(ctx)
return &inode{
- pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize),
ino: fs.Filesystem.NextIno(),
uid: creds.EffectiveKUID,
gid: creds.EffectiveKGID,
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index cb3c5e0fd..e001d5032 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -60,7 +60,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
// Note: credentials are overridden by taskOwnedInode.
subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- subInode.EnableLeakCheck()
+ subInode.InitRefs()
inode := &taskOwnedInode{Inode: subInode, owner: task}
return inode
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index 19011b010..dc46a09bc 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -91,7 +91,7 @@ func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace
taskInode := &taskInode{task: task}
// Note: credentials are overridden by taskOwnedInode.
taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
- taskInode.EnableLeakCheck()
+ taskInode.InitRefs()
inode := &taskOwnedInode{Inode: taskInode, owner: task}
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index d268b44be..3ec4471f5 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -128,7 +128,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode {
},
}
inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
- inode.EnableLeakCheck()
+ inode.InitRefs()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
return inode
}
@@ -265,7 +265,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
},
}
inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
- inode.EnableLeakCheck()
+ inode.InitRefs()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
return inode
}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index b81ea14bf..151d1f10d 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -83,7 +83,7 @@ func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns
cgroupControllers: cgroupControllers,
}
inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
- inode.EnableLeakCheck()
+ inode.InitRefs()
inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
links := inode.OrderedChildren.Populate(contents)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 506a2a0f0..79bc3fe88 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -160,7 +160,7 @@ func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode
d := &dir{}
d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
- d.EnableLeakCheck()
+ d.InitRefs()
d.IncLinks(d.OrderedChildren.Populate(contents))
return d
}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index d772db9e9..57e7b57b0 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -18,7 +18,6 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
- "gvisor.dev/gvisor/pkg/usermem"
)
// +stateify savable
@@ -32,7 +31,7 @@ type namedPipe struct {
// * fs.mu must be locked.
// * rp.Mount().CheckBeginWrite() has been called successfully.
func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
- file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
+ file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize)}
file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
file.inode.nlink = 1 // Only the parent has a link.
return &file.inode
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 4ce859d57..85a3dfe20 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -402,7 +402,7 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth
i.mtime = now
// i.nlink initialized by caller
i.impl = impl
- i.refs.EnableLeakCheck()
+ i.refs.InitRefs()
}
// incLinksLocked increments i's link count.
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 2f6050cfd..4e8d63d51 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -276,9 +276,9 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
UID: parentStat.UID,
GID: parentStat.GID,
//TODO(b/156980949): Support passing other hash algorithms.
- HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ HashAlgorithms: fs.alg.toLinuxHashAlg(),
ReadOffset: int64(offset),
- ReadSize: int64(merkletree.DigestSize(linux.FS_VERITY_HASH_ALG_SHA256)),
+ ReadSize: int64(merkletree.DigestSize(fs.alg.toLinuxHashAlg())),
Expected: parent.hash,
DataAndTreeInSameFile: true,
}); err != nil && err != io.EOF {
@@ -352,7 +352,7 @@ func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Stat
UID: stat.UID,
GID: stat.GID,
//TODO(b/156980949): Support passing other hash algorithms.
- HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ HashAlgorithms: fs.alg.toLinuxHashAlg(),
ReadOffset: 0,
// Set read size to 0 so only the metadata is verified.
ReadSize: 0,
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index de92878fd..faa862c55 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -79,6 +79,27 @@ var (
verityMu sync.RWMutex
)
+// HashAlgorithm is a type specifying the algorithm used to hash the file
+// content.
+type HashAlgorithm int
+
+// Currently supported hashing algorithms include SHA256 and SHA512.
+const (
+ SHA256 HashAlgorithm = iota
+ SHA512
+)
+
+func (alg HashAlgorithm) toLinuxHashAlg() int {
+ switch alg {
+ case SHA256:
+ return linux.FS_VERITY_HASH_ALG_SHA256
+ case SHA512:
+ return linux.FS_VERITY_HASH_ALG_SHA512
+ default:
+ return 0
+ }
+}
+
// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
@@ -108,6 +129,10 @@ type filesystem struct {
// stores the root hash of the whole file system in bytes.
rootDentry *dentry
+ // alg is the algorithms used to hash the files in the verity file
+ // system.
+ alg HashAlgorithm
+
// renameMu synchronizes renaming with non-renaming operations in order
// to ensure consistent lock ordering between dentry.dirMu in different
// dentries.
@@ -136,6 +161,10 @@ type InternalFilesystemOptions struct {
// LowerName is the name of the filesystem wrapped by verity fs.
LowerName string
+ // Alg is the algorithms used to hash the files in the verity file
+ // system.
+ Alg HashAlgorithm
+
// RootHash is the root hash of the overall verity file system.
RootHash []byte
@@ -194,6 +223,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fs := &filesystem{
creds: creds.Fork(),
+ alg: iopts.Alg,
lowerMount: mnt,
allowRuntimeEnable: iopts.AllowRuntimeEnable,
}
@@ -350,7 +380,9 @@ func (fs *filesystem) newDentry() *dentry {
// IncRef implements vfs.DentryImpl.IncRef.
func (d *dentry) IncRef() {
r := atomic.AddInt64(&d.refs, 1)
- refsvfs2.LogIncRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogIncRef(d, r)
+ }
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
@@ -361,7 +393,9 @@ func (d *dentry) TryIncRef() bool {
return false
}
if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
- refsvfs2.LogTryIncRef(d, r+1)
+ if d.LogRefs() {
+ refsvfs2.LogTryIncRef(d, r+1)
+ }
return true
}
}
@@ -370,7 +404,9 @@ func (d *dentry) TryIncRef() bool {
// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.fs.renameMu.Lock()
d.checkDropLocked(ctx)
@@ -382,7 +418,9 @@ func (d *dentry) DecRef(ctx context.Context) {
func (d *dentry) decRefLocked(ctx context.Context) {
r := atomic.AddInt64(&d.refs, -1)
- refsvfs2.LogDecRef(d, r)
+ if d.LogRefs() {
+ refsvfs2.LogDecRef(d, r)
+ }
if r == 0 {
d.checkDropLocked(ctx)
} else if r < 0 {
@@ -627,7 +665,7 @@ func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64,
TreeReader: &merkleReader,
TreeWriter: &merkleWriter,
//TODO(b/156980949): Support passing other hash algorithms.
- HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
}
switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
@@ -873,7 +911,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
UID: fd.d.uid,
GID: fd.d.gid,
//TODO(b/156980949): Support passing other hash algorithms.
- HashAlgorithms: linux.FS_VERITY_HASH_ALG_SHA256,
+ HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
ReadOffset: offset,
ReadSize: dst.NumBytes(),
Expected: fd.d.hash,
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
index c647cbfd3..b2da9dd96 100644
--- a/pkg/sentry/fsimpl/verity/verity_test.go
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -43,7 +43,7 @@ const maxDataSize = 100000
// newVerityRoot creates a new verity mount, and returns the root. The
// underlying file system is tmpfs. If the error is not nil, then cleanup
// should be called when the root is no longer needed.
-func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
+func newVerityRoot(t *testing.T, hashAlg HashAlgorithm) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
k, err := testutil.Boot()
if err != nil {
t.Fatalf("testutil.Boot: %v", err)
@@ -70,6 +70,7 @@ func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *ke
InternalData: InternalFilesystemOptions{
RootMerkleFileName: rootMerkleFilename,
LowerName: "tmpfs",
+ Alg: hashAlg,
AllowRuntimeEnable: true,
NoCrashOnVerificationFailure: true,
},
@@ -161,280 +162,296 @@ func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) er
return nil
}
+var hashAlgs = []HashAlgorithm{SHA256, SHA512}
+
// TestOpen ensures that when a file is created, the corresponding Merkle tree
// file and the root Merkle tree file exist.
func TestOpen(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Ensure that the corresponding Merkle tree file is created.
- lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
- if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: lowerRoot,
- Start: lowerRoot,
- Path: fspath.Parse(merklePrefix + filename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- }); err != nil {
- t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
- }
-
- // Ensure the root merkle tree file is created.
- if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: lowerRoot,
- Start: lowerRoot,
- Path: fspath.Parse(merklePrefix + rootMerkleFilename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- }); err != nil {
- t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Ensure that the corresponding Merkle tree file is created.
+ lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
+ }
+
+ // Ensure the root merkle tree file is created.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerRoot,
+ Start: lowerRoot,
+ Path: fspath.Parse(merklePrefix + rootMerkleFilename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ }); err != nil {
+ t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+ }
}
}
// TestPReadUnmodifiedFileSucceeds ensures that pread from an untouched verity
// file succeeds after enabling verity for it.
func TestPReadUnmodifiedFileSucceeds(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file and confirm a normal read succeeds.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- buf := make([]byte, size)
- n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
- if err != nil && err != io.EOF {
- t.Fatalf("fd.PRead: %v", err)
- }
-
- if n != int64(size) {
- t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirm a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ buf := make([]byte, size)
+ n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.PRead: %v", err)
+ }
+
+ if n != int64(size) {
+ t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ }
}
}
// TestReadUnmodifiedFileSucceeds ensures that read from an untouched verity
// file succeeds after enabling verity for it.
func TestReadUnmodifiedFileSucceeds(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file and confirm a normal read succeeds.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- buf := make([]byte, size)
- n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
- if err != nil && err != io.EOF {
- t.Fatalf("fd.Read: %v", err)
- }
-
- if n != int64(size) {
- t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirm a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ buf := make([]byte, size)
+ n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+ if err != nil && err != io.EOF {
+ t.Fatalf("fd.Read: %v", err)
+ }
+
+ if n != int64(size) {
+ t.Errorf("fd.PRead got read length %d, want %d", n, size)
+ }
}
}
// TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file
// succeeds after enabling verity for it.
func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file and confirms a normal read succeeds.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Ensure reopening the verity enabled file succeeds.
- if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(filename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- Mode: linux.ModeRegular,
- }); err != nil {
- t.Errorf("reopen enabled file failed: %v", err)
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirms a normal read succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Ensure reopening the verity enabled file succeeds.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err != nil {
+ t.Errorf("reopen enabled file failed: %v", err)
+ }
}
}
// TestPReadModifiedFileFails ensures that read from a modified verity file
// fails.
func TestPReadModifiedFileFails(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Open a new lowerFD that's read/writable.
- lowerVD := fd.Impl().(*fileDescription).d.lowerVD
-
- lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: lowerVD,
- Start: lowerVD,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR,
- })
- if err != nil {
- t.Fatalf("OpenAt: %v", err)
- }
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerFD that's read/writable.
+ lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+ lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
- if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
- t.Fatalf("corruptRandomBit: %v", err)
- }
+ if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
- // Confirm that read from the modified file fails.
- buf := make([]byte, size)
- if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
- t.Fatalf("fd.PRead succeeded, expected failure")
+ // Confirm that read from the modified file fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.PRead succeeded, expected failure")
+ }
}
}
// TestReadModifiedFileFails ensures that read from a modified verity file
// fails.
func TestReadModifiedFileFails(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Open a new lowerFD that's read/writable.
- lowerVD := fd.Impl().(*fileDescription).d.lowerVD
-
- lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: lowerVD,
- Start: lowerVD,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR,
- })
- if err != nil {
- t.Fatalf("OpenAt: %v", err)
- }
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerFD that's read/writable.
+ lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+ lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerVD,
+ Start: lowerVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
- if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
- t.Fatalf("corruptRandomBit: %v", err)
- }
+ if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
- // Confirm that read from the modified file fails.
- buf := make([]byte, size)
- if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil {
- t.Fatalf("fd.Read succeeded, expected failure")
+ // Confirm that read from the modified file fails.
+ buf := make([]byte, size)
+ if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil {
+ t.Fatalf("fd.Read succeeded, expected failure")
+ }
}
}
// TestModifiedMerkleFails ensures that read from a verity file fails if the
// corresponding Merkle tree file is modified.
func TestModifiedMerkleFails(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Open a new lowerMerkleFD that's read/writable.
- lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
-
- lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: lowerMerkleVD,
- Start: lowerMerkleVD,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR,
- })
- if err != nil {
- t.Fatalf("OpenAt: %v", err)
- }
-
- // Flip a random bit in the Merkle tree file.
- stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
- if err != nil {
- t.Fatalf("stat: %v", err)
- }
- merkleSize := int(stat.Size)
- if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
- t.Fatalf("corruptRandomBit: %v", err)
- }
-
- // Confirm that read from a file with modified Merkle tree fails.
- buf := make([]byte, size)
- if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
- fmt.Println(buf)
- t.Fatalf("fd.PRead succeeded with modified Merkle file")
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerMerkleFD that's read/writable.
+ lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+ lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: lowerMerkleVD,
+ Start: lowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ // Flip a random bit in the Merkle tree file.
+ stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("stat: %v", err)
+ }
+ merkleSize := int(stat.Size)
+ if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ // Confirm that read from a file with modified Merkle tree fails.
+ buf := make([]byte, size)
+ if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+ fmt.Println(buf)
+ t.Fatalf("fd.PRead succeeded with modified Merkle file")
+ }
}
}
@@ -442,140 +459,146 @@ func TestModifiedMerkleFails(t *testing.T) {
// verity enabled directory fails if the hashes related to the target file in
// the parent Merkle tree file is modified.
func TestModifiedParentMerkleFails(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Enable verity on the parent directory.
- parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: root,
- Start: root,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- })
- if err != nil {
- t.Fatalf("OpenAt: %v", err)
- }
-
- if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
-
- // Open a new lowerMerkleFD that's read/writable.
- parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
-
- parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: parentLowerMerkleVD,
- Start: parentLowerMerkleVD,
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR,
- })
- if err != nil {
- t.Fatalf("OpenAt: %v", err)
- }
-
- // Flip a random bit in the parent Merkle tree file.
- // This parent directory contains only one child, so any random
- // modification in the parent Merkle tree should cause verification
- // failure when opening the child file.
- stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
- if err != nil {
- t.Fatalf("stat: %v", err)
- }
- parentMerkleSize := int(stat.Size)
- if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
- t.Fatalf("corruptRandomBit: %v", err)
- }
-
- parentLowerMerkleFD.DecRef(ctx)
-
- // Ensure reopening the verity enabled file fails.
- if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(filename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- Mode: linux.ModeRegular,
- }); err == nil {
- t.Errorf("OpenAt file with modified parent Merkle succeeded")
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Enable verity on the parent directory.
+ parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
+
+ // Open a new lowerMerkleFD that's read/writable.
+ parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
+
+ parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: parentLowerMerkleVD,
+ Start: parentLowerMerkleVD,
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ t.Fatalf("OpenAt: %v", err)
+ }
+
+ // Flip a random bit in the parent Merkle tree file.
+ // This parent directory contains only one child, so any random
+ // modification in the parent Merkle tree should cause verification
+ // failure when opening the child file.
+ stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Fatalf("stat: %v", err)
+ }
+ parentMerkleSize := int(stat.Size)
+ if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+ t.Fatalf("corruptRandomBit: %v", err)
+ }
+
+ parentLowerMerkleFD.DecRef(ctx)
+
+ // Ensure reopening the verity enabled file fails.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err == nil {
+ t.Errorf("OpenAt file with modified parent Merkle succeeded")
+ }
}
}
// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file
// succeeds after enabling verity for it.
func TestUnmodifiedStatSucceeds(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file and confirms stat succeeds.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("fd.Ioctl: %v", err)
- }
-
- if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
- t.Errorf("fd.Stat: %v", err)
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file and confirms stat succeeds.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("fd.Ioctl: %v", err)
+ }
+
+ if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
+ t.Errorf("fd.Stat: %v", err)
+ }
}
}
// TestModifiedStatFails checks that getting stat for a file with modified stat
// should fail.
func TestModifiedStatFails(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
-
- filename := "verity-test-file"
- fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
-
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("fd.Ioctl: %v", err)
- }
-
- lowerFD := fd.Impl().(*fileDescription).lowerFD
- // Change the stat of the underlying file, and check that stat fails.
- if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
- Stat: linux.Statx{
- Mask: uint32(linux.STATX_MODE),
- Mode: 0777,
- },
- }); err != nil {
- t.Fatalf("lowerFD.SetStat: %v", err)
- }
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
+
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
+
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("fd.Ioctl: %v", err)
+ }
+
+ lowerFD := fd.Impl().(*fileDescription).lowerFD
+ // Change the stat of the underlying file, and check that stat fails.
+ if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
+ Stat: linux.Statx{
+ Mask: uint32(linux.STATX_MODE),
+ Mode: 0777,
+ },
+ }); err != nil {
+ t.Fatalf("lowerFD.SetStat: %v", err)
+ }
- if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
- t.Errorf("fd.Stat succeeded when it should fail")
+ if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
+ t.Errorf("fd.Stat succeeded when it should fail")
+ }
}
}
@@ -616,84 +639,86 @@ func TestOpenDeletedFileFails(t *testing.T) {
}
for _, tc := range testCases {
t.Run(fmt.Sprintf("remove:%t", tc.remove), func(t *testing.T) {
- vfsObj, root, ctx, err := newVerityRoot(t)
- if err != nil {
- t.Fatalf("newVerityRoot: %v", err)
- }
+ for _, alg := range hashAlgs {
+ vfsObj, root, ctx, err := newVerityRoot(t, alg)
+ if err != nil {
+ t.Fatalf("newVerityRoot: %v", err)
+ }
- filename := "verity-test-file"
- fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
- if err != nil {
- t.Fatalf("newFileFD: %v", err)
- }
+ filename := "verity-test-file"
+ fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+ if err != nil {
+ t.Fatalf("newFileFD: %v", err)
+ }
- // Enable verity on the file.
- var args arch.SyscallArguments
- args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
- if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
- t.Fatalf("Ioctl: %v", err)
- }
+ // Enable verity on the file.
+ var args arch.SyscallArguments
+ args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+ if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+ t.Fatalf("Ioctl: %v", err)
+ }
- rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
- if tc.remove {
- if tc.changeFile {
- if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(filename),
- }); err != nil {
- t.Fatalf("UnlinkAt: %v", err)
+ rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
+ if tc.remove {
+ if tc.changeFile {
+ if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(filename),
+ }); err != nil {
+ t.Fatalf("UnlinkAt: %v", err)
+ }
}
- }
- if tc.changeMerkleFile {
- if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(merklePrefix + filename),
- }); err != nil {
- t.Fatalf("UnlinkAt: %v", err)
+ if tc.changeMerkleFile {
+ if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + filename),
+ }); err != nil {
+ t.Fatalf("UnlinkAt: %v", err)
+ }
}
- }
- } else {
- newFilename := "renamed-test-file"
- if tc.changeFile {
- if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(filename),
- }, &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(newFilename),
- }, &vfs.RenameOptions{}); err != nil {
- t.Fatalf("RenameAt: %v", err)
+ } else {
+ newFilename := "renamed-test-file"
+ if tc.changeFile {
+ if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(filename),
+ }, &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(newFilename),
+ }, &vfs.RenameOptions{}); err != nil {
+ t.Fatalf("RenameAt: %v", err)
+ }
}
- }
- if tc.changeMerkleFile {
- if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(merklePrefix + filename),
- }, &vfs.PathOperation{
- Root: rootLowerVD,
- Start: rootLowerVD,
- Path: fspath.Parse(merklePrefix + newFilename),
- }, &vfs.RenameOptions{}); err != nil {
- t.Fatalf("UnlinkAt: %v", err)
+ if tc.changeMerkleFile {
+ if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + filename),
+ }, &vfs.PathOperation{
+ Root: rootLowerVD,
+ Start: rootLowerVD,
+ Path: fspath.Parse(merklePrefix + newFilename),
+ }, &vfs.RenameOptions{}); err != nil {
+ t.Fatalf("UnlinkAt: %v", err)
+ }
}
}
- }
- // Ensure reopening the verity enabled file fails.
- if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(filename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY,
- Mode: linux.ModeRegular,
- }); err != syserror.EIO {
- t.Errorf("got OpenAt error: %v, expected EIO", err)
+ // Ensure reopening the verity enabled file fails.
+ if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY,
+ Mode: linux.ModeRegular,
+ }); err != syserror.EIO {
+ t.Errorf("got OpenAt error: %v, expected EIO", err)
+ }
}
})
}
diff --git a/pkg/sentry/hostfd/BUILD b/pkg/sentry/hostfd/BUILD
index 364a78306..db3b0d0a0 100644
--- a/pkg/sentry/hostfd/BUILD
+++ b/pkg/sentry/hostfd/BUILD
@@ -6,10 +6,12 @@ go_library(
name = "hostfd",
srcs = [
"hostfd.go",
+ "hostfd_linux.go",
"hostfd_unsafe.go",
],
visibility = ["//pkg/sentry:internal"],
deps = [
+ "//pkg/log",
"//pkg/safemem",
"//pkg/sync",
"@org_golang_x_sys//unix:go_default_library",
diff --git a/pkg/sentry/hostfd/hostfd_linux.go b/pkg/sentry/hostfd/hostfd_linux.go
new file mode 100644
index 000000000..1cabc848f
--- /dev/null
+++ b/pkg/sentry/hostfd/hostfd_linux.go
@@ -0,0 +1,18 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostfd
+
+// maxIov is the maximum permitted size of a struct iovec array.
+const maxIov = 1024 // UIO_MAXIOV
diff --git a/pkg/sentry/hostfd/hostfd_unsafe.go b/pkg/sentry/hostfd/hostfd_unsafe.go
index cd4dc67fb..694371b1c 100644
--- a/pkg/sentry/hostfd/hostfd_unsafe.go
+++ b/pkg/sentry/hostfd/hostfd_unsafe.go
@@ -20,6 +20,7 @@ import (
"unsafe"
"golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/safemem"
)
@@ -44,6 +45,10 @@ func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint6
}
} else {
iovs := safemem.IovecsFromBlockSeq(dsts)
+ if len(iovs) > maxIov {
+ log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), maxIov)
+ iovs = iovs[:maxIov]
+ }
n, _, e = syscall.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
}
if e != 0 {
@@ -76,6 +81,10 @@ func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint
}
} else {
iovs := safemem.IovecsFromBlockSeq(srcs)
+ if len(iovs) > maxIov {
+ log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), maxIov)
+ iovs = iovs[:maxIov]
+ }
n, _, e = syscall.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
}
if e != 0 {
diff --git a/pkg/sentry/kernel/fd_table_unsafe.go b/pkg/sentry/kernel/fd_table_unsafe.go
index 3476551f3..470d8bf83 100644
--- a/pkg/sentry/kernel/fd_table_unsafe.go
+++ b/pkg/sentry/kernel/fd_table_unsafe.go
@@ -43,7 +43,7 @@ func (f *FDTable) initNoLeakCheck() {
// init initializes the table with leak checking.
func (f *FDTable) init() {
f.initNoLeakCheck()
- f.EnableLeakCheck()
+ f.InitRefs()
}
// get gets a file entry.
diff --git a/pkg/sentry/kernel/fs_context.go b/pkg/sentry/kernel/fs_context.go
index 41fb2a784..dfde4deee 100644
--- a/pkg/sentry/kernel/fs_context.go
+++ b/pkg/sentry/kernel/fs_context.go
@@ -63,7 +63,7 @@ func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext {
cwd: cwd,
umask: umask,
}
- f.EnableLeakCheck()
+ f.InitRefs()
return &f
}
@@ -76,7 +76,7 @@ func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
cwdVFS2: cwd,
umask: umask,
}
- f.EnableLeakCheck()
+ f.InitRefs()
return &f
}
@@ -137,7 +137,7 @@ func (f *FSContext) Fork() *FSContext {
rootVFS2: f.rootVFS2,
umask: f.umask,
}
- ctx.EnableLeakCheck()
+ ctx.InitRefs()
return ctx
}
diff --git a/pkg/sentry/kernel/ipc_namespace.go b/pkg/sentry/kernel/ipc_namespace.go
index b87e40dd1..9545bb5ef 100644
--- a/pkg/sentry/kernel/ipc_namespace.go
+++ b/pkg/sentry/kernel/ipc_namespace.go
@@ -41,7 +41,7 @@ func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
semaphores: semaphore.NewRegistry(userNS),
shms: shm.NewRegistry(userNS),
}
- ns.EnableLeakCheck()
+ ns.InitRefs()
return ns
}
diff --git a/pkg/sentry/kernel/pipe/node_test.go b/pkg/sentry/kernel/pipe/node_test.go
index ce0db5583..d6fb0fdb8 100644
--- a/pkg/sentry/kernel/pipe/node_test.go
+++ b/pkg/sentry/kernel/pipe/node_test.go
@@ -22,7 +22,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/syserror"
- "gvisor.dev/gvisor/pkg/usermem"
)
type sleeper struct {
@@ -66,7 +65,8 @@ func testOpenOrDie(ctx context.Context, t *testing.T, n fs.InodeOperations, flag
d := fs.NewDirent(ctx, inode, "pipe")
file, err := n.GetFile(ctx, d, flags)
if err != nil {
- t.Fatalf("open with flags %+v failed: %v", flags, err)
+ t.Errorf("open with flags %+v failed: %v", flags, err)
+ return nil, err
}
if doneChan != nil {
doneChan <- struct{}{}
@@ -85,11 +85,11 @@ func testOpen(ctx context.Context, t *testing.T, n fs.InodeOperations, flags fs.
}
func newNamedPipe(t *testing.T) *Pipe {
- return NewPipe(true, DefaultPipeSize, usermem.PageSize)
+ return NewPipe(true, DefaultPipeSize)
}
func newAnonPipe(t *testing.T) *Pipe {
- return NewPipe(false, DefaultPipeSize, usermem.PageSize)
+ return NewPipe(false, DefaultPipeSize)
}
// assertRecvBlocks ensures that a recv attempt on c blocks for at least
diff --git a/pkg/sentry/kernel/pipe/pipe.go b/pkg/sentry/kernel/pipe/pipe.go
index 67beb0ad6..b989e14c7 100644
--- a/pkg/sentry/kernel/pipe/pipe.go
+++ b/pkg/sentry/kernel/pipe/pipe.go
@@ -26,18 +26,27 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
const (
// MinimumPipeSize is a hard limit of the minimum size of a pipe.
- MinimumPipeSize = 64 << 10
+ // It corresponds to fs/pipe.c:pipe_min_size.
+ MinimumPipeSize = usermem.PageSize
+
+ // MaximumPipeSize is a hard limit on the maximum size of a pipe.
+ // It corresponds to fs/pipe.c:pipe_max_size.
+ MaximumPipeSize = 1048576
// DefaultPipeSize is the system-wide default size of a pipe in bytes.
- DefaultPipeSize = MinimumPipeSize
+ // It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS.
+ DefaultPipeSize = 16 * usermem.PageSize
- // MaximumPipeSize is a hard limit on the maximum size of a pipe.
- MaximumPipeSize = 8 << 20
+ // atomicIOBytes is the maximum number of bytes that the pipe will
+ // guarantee atomic reads or writes atomically.
+ // It corresponds to limits.h:PIPE_BUF.
+ atomicIOBytes = 4096
)
// Pipe is an encapsulation of a platform-independent pipe.
@@ -53,12 +62,6 @@ type Pipe struct {
// This value is immutable.
isNamed bool
- // atomicIOBytes is the maximum number of bytes that the pipe will
- // guarantee atomic reads or writes atomically.
- //
- // This value is immutable.
- atomicIOBytes int64
-
// The number of active readers for this pipe.
//
// Access atomically.
@@ -94,47 +97,34 @@ type Pipe struct {
// NewPipe initializes and returns a pipe.
//
-// N.B. The size and atomicIOBytes will be bounded.
-func NewPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *Pipe {
+// N.B. The size will be bounded.
+func NewPipe(isNamed bool, sizeBytes int64) *Pipe {
if sizeBytes < MinimumPipeSize {
sizeBytes = MinimumPipeSize
}
if sizeBytes > MaximumPipeSize {
sizeBytes = MaximumPipeSize
}
- if atomicIOBytes <= 0 {
- atomicIOBytes = 1
- }
- if atomicIOBytes > sizeBytes {
- atomicIOBytes = sizeBytes
- }
var p Pipe
- initPipe(&p, isNamed, sizeBytes, atomicIOBytes)
+ initPipe(&p, isNamed, sizeBytes)
return &p
}
-func initPipe(pipe *Pipe, isNamed bool, sizeBytes, atomicIOBytes int64) {
+func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) {
if sizeBytes < MinimumPipeSize {
sizeBytes = MinimumPipeSize
}
if sizeBytes > MaximumPipeSize {
sizeBytes = MaximumPipeSize
}
- if atomicIOBytes <= 0 {
- atomicIOBytes = 1
- }
- if atomicIOBytes > sizeBytes {
- atomicIOBytes = sizeBytes
- }
pipe.isNamed = isNamed
pipe.max = sizeBytes
- pipe.atomicIOBytes = atomicIOBytes
}
// NewConnectedPipe initializes a pipe and returns a pair of objects
// representing the read and write ends of the pipe.
-func NewConnectedPipe(ctx context.Context, sizeBytes, atomicIOBytes int64) (*fs.File, *fs.File) {
- p := NewPipe(false /* isNamed */, sizeBytes, atomicIOBytes)
+func NewConnectedPipe(ctx context.Context, sizeBytes int64) (*fs.File, *fs.File) {
+ p := NewPipe(false /* isNamed */, sizeBytes)
// Build an fs.Dirent for the pipe which will be shared by both
// returned files.
@@ -264,7 +254,7 @@ func (p *Pipe) writeLocked(ctx context.Context, ops writeOps) (int64, error) {
wanted := ops.left()
avail := p.max - p.view.Size()
if wanted > avail {
- if wanted <= p.atomicIOBytes {
+ if wanted <= atomicIOBytes {
return 0, syserror.ErrWouldBlock
}
ops.limit(avail)
diff --git a/pkg/sentry/kernel/pipe/pipe_test.go b/pkg/sentry/kernel/pipe/pipe_test.go
index fe97e9800..3dd739080 100644
--- a/pkg/sentry/kernel/pipe/pipe_test.go
+++ b/pkg/sentry/kernel/pipe/pipe_test.go
@@ -26,7 +26,7 @@ import (
func TestPipeRW(t *testing.T) {
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, 65536, 4096)
+ r, w := NewConnectedPipe(ctx, 65536)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -46,7 +46,7 @@ func TestPipeRW(t *testing.T) {
func TestPipeReadBlock(t *testing.T) {
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, 65536, 4096)
+ r, w := NewConnectedPipe(ctx, 65536)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -61,7 +61,7 @@ func TestPipeWriteBlock(t *testing.T) {
const capacity = MinimumPipeSize
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, capacity, atomicIOBytes)
+ r, w := NewConnectedPipe(ctx, capacity)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -76,7 +76,7 @@ func TestPipeWriteUntilEnd(t *testing.T) {
const atomicIOBytes = 2
ctx := contexttest.Context(t)
- r, w := NewConnectedPipe(ctx, atomicIOBytes, atomicIOBytes)
+ r, w := NewConnectedPipe(ctx, atomicIOBytes)
defer r.DecRef(ctx)
defer w.DecRef(ctx)
@@ -116,7 +116,8 @@ func TestPipeWriteUntilEnd(t *testing.T) {
}
}
if err != nil {
- t.Fatalf("Readv: got unexpected error %v", err)
+ t.Errorf("Readv: got unexpected error %v", err)
+ return
}
}
}()
diff --git a/pkg/sentry/kernel/pipe/vfs.go b/pkg/sentry/kernel/pipe/vfs.go
index d96bf253b..7b23cbe86 100644
--- a/pkg/sentry/kernel/pipe/vfs.go
+++ b/pkg/sentry/kernel/pipe/vfs.go
@@ -54,9 +54,9 @@ type VFSPipe struct {
}
// NewVFSPipe returns an initialized VFSPipe.
-func NewVFSPipe(isNamed bool, sizeBytes, atomicIOBytes int64) *VFSPipe {
+func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe {
var vp VFSPipe
- initPipe(&vp.pipe, isNamed, sizeBytes, atomicIOBytes)
+ initPipe(&vp.pipe, isNamed, sizeBytes)
return &vp
}
diff --git a/pkg/sentry/kernel/ptrace.go b/pkg/sentry/kernel/ptrace.go
index 1145faf13..1abfe2201 100644
--- a/pkg/sentry/kernel/ptrace.go
+++ b/pkg/sentry/kernel/ptrace.go
@@ -1000,7 +1000,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
// at the address specified by the data parameter, and the return value
// is the error flag." - ptrace(2)
word := t.Arch().Native(0)
- if _, err := word.CopyIn(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
+ if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil {
return err
}
_, err := word.CopyOut(t, data)
@@ -1008,7 +1008,7 @@ func (t *Task) Ptrace(req int64, pid ThreadID, addr, data usermem.Addr) error {
case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA:
word := t.Arch().Native(uintptr(data))
- _, err := word.CopyOut(target.AsCopyContext(usermem.IOOpts{IgnorePermissions: true}), addr)
+ _, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr)
return err
case linux.PTRACE_GETREGSET:
diff --git a/pkg/sentry/kernel/semaphore/semaphore.go b/pkg/sentry/kernel/semaphore/semaphore.go
index c39ecfb8f..b99c0bffa 100644
--- a/pkg/sentry/kernel/semaphore/semaphore.go
+++ b/pkg/sentry/kernel/semaphore/semaphore.go
@@ -103,6 +103,7 @@ type waiter struct {
waiterEntry
// value represents how much resource the waiter needs to wake up.
+ // The value is either 0 or negative.
value int16
ch chan struct{}
}
@@ -423,6 +424,42 @@ func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
return sem.pid, nil
}
+func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ // The calling process must have read permission on the semaphore set.
+ if !s.checkPerms(creds, fs.PermMask{Read: true}) {
+ return 0, syserror.EACCES
+ }
+
+ sem := s.findSem(num)
+ if sem == nil {
+ return 0, syserror.ERANGE
+ }
+ var cnt uint16
+ for w := sem.waiters.Front(); w != nil; w = w.Next() {
+ if pred(w) {
+ cnt++
+ }
+ }
+ return cnt, nil
+}
+
+// CountZeroWaiters returns number of waiters waiting for the sem's value to increase.
+func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+ return s.countWaiters(num, creds, func(w *waiter) bool {
+ return w.value == 0
+ })
+}
+
+// CountNegativeWaiters returns number of waiters waiting for the sem to go to zero.
+func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) {
+ return s.countWaiters(num, creds, func(w *waiter) bool {
+ return w.value < 0
+ })
+}
+
// ExecuteOps attempts to execute a list of operations to the set. It only
// succeeds when all operations can be applied. No changes are made if it fails.
//
@@ -575,11 +612,18 @@ func (s *Set) destroy() {
}
}
+func abs(val int16) int16 {
+ if val < 0 {
+ return -val
+ }
+ return val
+}
+
// wakeWaiters goes over all waiters and checks which of them can be notified.
func (s *sem) wakeWaiters() {
// Note that this will release all waiters waiting for 0 too.
for w := s.waiters.Front(); w != nil; {
- if s.value < w.value {
+ if s.value < abs(w.value) {
// Still blocked, skip it.
w = w.Next()
continue
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index df5c8421b..0cd9e2533 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -295,7 +295,7 @@ func (tg *ThreadGroup) createSession() error {
id: SessionID(id),
leader: tg,
}
- s.EnableLeakCheck()
+ s.InitRefs()
// Create a new ProcessGroup, belonging to that Session.
// This also has a single reference (assigned below).
@@ -309,7 +309,7 @@ func (tg *ThreadGroup) createSession() error {
session: s,
ancestors: 0,
}
- pg.refs.EnableLeakCheck()
+ pg.refs.InitRefs()
// Tie them and return the result.
s.processGroups.PushBack(pg)
@@ -395,7 +395,7 @@ func (tg *ThreadGroup) CreateProcessGroup() error {
originator: tg,
session: tg.processGroup.session,
}
- pg.refs.EnableLeakCheck()
+ pg.refs.InitRefs()
if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
pg.ancestors++
@@ -477,20 +477,20 @@ func (tg *ThreadGroup) Session() *Session {
//
// If this group isn't visible in this namespace, zero will be returned. It is
// the callers responsibility to check that before using this function.
-func (pidns *PIDNamespace) IDOfSession(s *Session) SessionID {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.sids[s]
+func (ns *PIDNamespace) IDOfSession(s *Session) SessionID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.sids[s]
}
// SessionWithID returns the Session with the given ID in the PID namespace ns,
// or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the session.
-func (pidns *PIDNamespace) SessionWithID(id SessionID) *Session {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.sessions[id]
+func (ns *PIDNamespace) SessionWithID(id SessionID) *Session {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.sessions[id]
}
// ProcessGroup returns the ThreadGroup's ProcessGroup.
@@ -505,18 +505,18 @@ func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
// IDOfProcessGroup returns the process group assigned to pg in PID namespace ns.
//
// The same constraints apply as IDOfSession.
-func (pidns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.pgids[pg]
+func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.pgids[pg]
}
// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
// namespace ns, or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the process group.
-func (pidns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
- pidns.owner.mu.RLock()
- defer pidns.owner.mu.RUnlock()
- return pidns.processGroups[id]
+func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
+ ns.owner.mu.RLock()
+ defer ns.owner.mu.RUnlock()
+ return ns.processGroups[id]
}
diff --git a/pkg/sentry/kernel/shm/shm.go b/pkg/sentry/kernel/shm/shm.go
index ebbebf46b..92d60ba78 100644
--- a/pkg/sentry/kernel/shm/shm.go
+++ b/pkg/sentry/kernel/shm/shm.go
@@ -251,7 +251,7 @@ func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.Fi
creatorPID: pid,
changeTime: ktime.NowFromContext(ctx),
}
- shm.EnableLeakCheck()
+ shm.InitRefs()
// Find the next available ID.
for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go
index 682080c14..527344162 100644
--- a/pkg/sentry/kernel/task_clone.go
+++ b/pkg/sentry/kernel/task_clone.go
@@ -355,7 +355,7 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
if opts.ChildSetTID {
ctid := nt.ThreadID()
- ctid.CopyOut(nt.AsCopyContext(usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
+ ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID)
}
ntid := t.tg.pidns.IDOfTask(nt)
if opts.ParentSetTID {
diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go
index ce134bf54..94dabbcd8 100644
--- a/pkg/sentry/kernel/task_usermem.go
+++ b/pkg/sentry/kernel/task_usermem.go
@@ -18,7 +18,8 @@ import (
"math"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/marshal"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -281,29 +282,89 @@ func (t *Task) IovecsIOSequence(addr usermem.Addr, iovcnt int, opts usermem.IOOp
}, nil
}
-// copyContext implements marshal.CopyContext. It wraps a task to allow copying
-// memory to and from the task memory with custom usermem.IOOpts.
-type copyContext struct {
- *Task
+type taskCopyContext struct {
+ ctx context.Context
+ t *Task
opts usermem.IOOpts
}
-// AsCopyContext wraps the task and returns it as CopyContext.
-func (t *Task) AsCopyContext(opts usermem.IOOpts) marshal.CopyContext {
- return &copyContext{t, opts}
+// CopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts.
+func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext {
+ return &taskCopyContext{
+ ctx: ctx,
+ t: t,
+ opts: opts,
+ }
+}
+
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte {
+ if ctxTask, ok := cc.ctx.(*Task); ok {
+ return ctxTask.CopyScratchBuffer(size)
+ }
+ return make([]byte, size)
+}
+
+func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) {
+ cc.t.mu.Lock()
+ tmm := cc.t.MemoryManager()
+ cc.t.mu.Unlock()
+ if !tmm.IncUsers() {
+ return nil, syserror.EFAULT
+ }
+ return tmm, nil
+}
+
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *taskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ tmm, err := cc.getMemoryManager()
+ if err != nil {
+ return 0, err
+ }
+ defer tmm.DecUsers(cc.ctx)
+ return tmm.CopyIn(cc.ctx, addr, dst, cc.opts)
+}
+
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *taskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ tmm, err := cc.getMemoryManager()
+ if err != nil {
+ return 0, err
+ }
+ defer tmm.DecUsers(cc.ctx)
+ return tmm.CopyOut(cc.ctx, addr, src, cc.opts)
+}
+
+type ownTaskCopyContext struct {
+ t *Task
+ opts usermem.IOOpts
+}
+
+// OwnCopyContext returns a marshal.CopyContext that copies to/from t's address
+// space using opts. The returned CopyContext may only be used by t's task
+// goroutine.
+//
+// Since t already implements marshal.CopyContext, this is only needed to
+// override the usermem.IOOpts used for the copy.
+func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext {
+ return &ownTaskCopyContext{
+ t: t,
+ opts: opts,
+ }
}
-// CopyInString copies a string in from the task's memory.
-func (t *copyContext) CopyInString(addr usermem.Addr, maxLen int) (string, error) {
- return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxLen, t.opts)
+// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
+func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte {
+ return cc.t.CopyScratchBuffer(size)
}
-// CopyInBytes copies task memory into dst from an IO context.
-func (t *copyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
- return t.MemoryManager().CopyIn(t, addr, dst, t.opts)
+// CopyInBytes implements marshal.CopyContext.CopyInBytes.
+func (cc *ownTaskCopyContext) CopyInBytes(addr usermem.Addr, dst []byte) (int, error) {
+ return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts)
}
-// CopyOutBytes copies src into task memoryfrom an IO context.
-func (t *copyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
- return t.MemoryManager().CopyOut(t, addr, src, t.opts)
+// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
+func (cc *ownTaskCopyContext) CopyOutBytes(addr usermem.Addr, src []byte) (int, error) {
+ return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts)
}
diff --git a/pkg/sentry/kernel/vdso.go b/pkg/sentry/kernel/vdso.go
index 9bc452e67..9e5c2d26f 100644
--- a/pkg/sentry/kernel/vdso.go
+++ b/pkg/sentry/kernel/vdso.go
@@ -115,7 +115,7 @@ func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
}
if old != v.seq {
- return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d. Application may hang or get incorrect time from the VDSO.", old, v.seq)
+ return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq)
}
v.seq = next
diff --git a/pkg/sentry/mm/aio_context.go b/pkg/sentry/mm/aio_context.go
index 7bf48cb2c..4c8cd38ed 100644
--- a/pkg/sentry/mm/aio_context.go
+++ b/pkg/sentry/mm/aio_context.go
@@ -252,7 +252,7 @@ func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) {
return nil, err
}
m := aioMappable{mfp: mfp, fr: fr}
- m.EnableLeakCheck()
+ m.InitRefs()
return &m, nil
}
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index 2dbe5b751..48d8b6a2b 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -44,7 +44,7 @@ type SpecialMappable struct {
// Preconditions: fr.Length() != 0.
func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable {
m := SpecialMappable{mfp: mfp, fr: fr, name: name}
- m.EnableLeakCheck()
+ m.InitRefs()
return &m
}
diff --git a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
index 0a54dd30d..acad4c793 100644
--- a/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
@@ -79,6 +79,18 @@ func bluepillStopGuest(c *vCPU) {
c.runData.requestInterruptWindow = 0
}
+// bluepillSigBus is reponsible for injecting NMI to trigger sigbus.
+//
+//go:nosplit
+func bluepillSigBus(c *vCPU) {
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_NMI, 0); errno != 0 {
+ throw("NMI injection failed")
+ }
+}
+
// bluepillReadyStopGuest checks whether the current vCPU is ready for interrupt injection.
//
//go:nosplit
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64.go b/pkg/sentry/platform/kvm/bluepill_arm64.go
index 58f3d6fdd..965ad66b5 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64.go
@@ -27,15 +27,20 @@ var (
// The action for bluepillSignal is changed by sigaction().
bluepillSignal = syscall.SIGILL
- // vcpuSErr is the event of system error.
- vcpuSErr = kvmVcpuEvents{
+ // vcpuSErrBounce is the event of system error for bouncing KVM.
+ vcpuSErrBounce = kvmVcpuEvents{
exception: exception{
sErrPending: 1,
- sErrHasEsr: 0,
- pad: [6]uint8{0, 0, 0, 0, 0, 0},
- sErrEsr: 1,
},
- rsvd: [12]uint32{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+ }
+
+ // vcpuSErrNMI is the event of system error to trigger sigbus.
+ vcpuSErrNMI = kvmVcpuEvents{
+ exception: exception{
+ sErrPending: 1,
+ sErrHasEsr: 1,
+ sErrEsr: _ESR_ELx_SERR_NMI,
+ },
}
)
diff --git a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
index b35c930e2..9433d4da5 100644
--- a/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
@@ -80,11 +80,24 @@ func getHypercallID(addr uintptr) int {
//
//go:nosplit
func bluepillStopGuest(c *vCPU) {
- if _, _, errno := syscall.RawSyscall(
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
syscall.SYS_IOCTL,
uintptr(c.fd),
_KVM_SET_VCPU_EVENTS,
- uintptr(unsafe.Pointer(&vcpuSErr))); errno != 0 {
+ uintptr(unsafe.Pointer(&vcpuSErrBounce))); errno != 0 {
+ throw("sErr injection failed")
+ }
+}
+
+// bluepillSigBus is reponsible for injecting sError to trigger sigbus.
+//
+//go:nosplit
+func bluepillSigBus(c *vCPU) {
+ if _, _, errno := syscall.RawSyscall( // escapes: no.
+ syscall.SYS_IOCTL,
+ uintptr(c.fd),
+ _KVM_SET_VCPU_EVENTS,
+ uintptr(unsafe.Pointer(&vcpuSErrNMI))); errno != 0 {
throw("sErr injection failed")
}
}
diff --git a/pkg/sentry/platform/kvm/bluepill_unsafe.go b/pkg/sentry/platform/kvm/bluepill_unsafe.go
index eb05950cd..75085ac6a 100644
--- a/pkg/sentry/platform/kvm/bluepill_unsafe.go
+++ b/pkg/sentry/platform/kvm/bluepill_unsafe.go
@@ -146,12 +146,7 @@ func bluepillHandler(context unsafe.Pointer) {
// MMIO exit we receive EFAULT from the run ioctl. We
// always inject an NMI here since we may be in kernel
// mode and have interrupts disabled.
- if _, _, errno := syscall.RawSyscall( // escapes: no.
- syscall.SYS_IOCTL,
- uintptr(c.fd),
- _KVM_NMI, 0); errno != 0 {
- throw("NMI injection failed")
- }
+ bluepillSigBus(c)
continue // Rerun vCPU.
default:
throw("run failed")
diff --git a/pkg/sentry/platform/kvm/kvm.go b/pkg/sentry/platform/kvm/kvm.go
index dd45ad10b..5979aef97 100644
--- a/pkg/sentry/platform/kvm/kvm.go
+++ b/pkg/sentry/platform/kvm/kvm.go
@@ -158,8 +158,7 @@ func (*KVM) MaxUserAddress() usermem.Addr {
// NewAddressSpace returns a new pagetable root.
func (k *KVM) NewAddressSpace(_ interface{}) (platform.AddressSpace, <-chan struct{}, error) {
// Allocate page tables and install system mappings.
- pageTables := pagetables.New(newAllocator())
- k.machine.mapUpperHalf(pageTables)
+ pageTables := pagetables.NewWithUpper(newAllocator(), k.machine.upperSharedPageTables, ring0.KernelStartAddress)
// Return the new address space.
return &addressSpace{
diff --git a/pkg/sentry/platform/kvm/kvm_const_arm64.go b/pkg/sentry/platform/kvm/kvm_const_arm64.go
index 5831b9345..b060d9544 100644
--- a/pkg/sentry/platform/kvm/kvm_const_arm64.go
+++ b/pkg/sentry/platform/kvm/kvm_const_arm64.go
@@ -151,6 +151,9 @@ const (
_ESR_SEGV_PEMERR_L1 = 0xd
_ESR_SEGV_PEMERR_L2 = 0xe
_ESR_SEGV_PEMERR_L3 = 0xf
+
+ // Custom ISS field definitions for system error.
+ _ESR_ELx_SERR_NMI = 0x1
)
// Arm64: MMIO base address used to dispatch hypercalls.
diff --git a/pkg/sentry/platform/kvm/machine.go b/pkg/sentry/platform/kvm/machine.go
index f70d761fd..e2fffc99b 100644
--- a/pkg/sentry/platform/kvm/machine.go
+++ b/pkg/sentry/platform/kvm/machine.go
@@ -41,6 +41,9 @@ type machine struct {
// slots are currently being updated, and the caller should retry.
nextSlot uint32
+ // upperSharedPageTables tracks the read-only shared upper of all the pagetables.
+ upperSharedPageTables *pagetables.PageTables
+
// kernel is the set of global structures.
kernel ring0.Kernel
@@ -199,9 +202,7 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of vCPUs is %d.", m.maxVCPUs)
m.vCPUsByTID = make(map[uint64]*vCPU)
m.vCPUsByID = make([]*vCPU, m.maxVCPUs)
- m.kernel.Init(ring0.KernelOpts{
- PageTables: pagetables.New(newAllocator()),
- }, m.maxVCPUs)
+ m.kernel.Init(m.maxVCPUs)
// Pull the maximum slots.
maxSlots, _, errno := syscall.RawSyscall(syscall.SYS_IOCTL, uintptr(m.fd), _KVM_CHECK_EXTENSION, _KVM_CAP_MAX_MEMSLOTS)
@@ -213,6 +214,13 @@ func newMachine(vm int) (*machine, error) {
log.Debugf("The maximum number of slots is %d.", m.maxSlots)
m.usedSlots = make([]uintptr, m.maxSlots)
+ // Create the upper shared pagetables and kernel(sentry) pagetables.
+ m.upperSharedPageTables = pagetables.New(newAllocator())
+ m.mapUpperHalf(m.upperSharedPageTables)
+ m.upperSharedPageTables.Allocator.(*allocator).base.Drain()
+ m.upperSharedPageTables.MarkReadOnlyShared()
+ m.kernel.PageTables = pagetables.NewWithUpper(newAllocator(), m.upperSharedPageTables, ring0.KernelStartAddress)
+
// Apply the physical mappings. Note that these mappings may point to
// guest physical addresses that are not actually available. These
// physical pages are mapped on demand, see kernel_unsafe.go.
@@ -226,7 +234,6 @@ func newMachine(vm int) (*machine, error) {
return true // Keep iterating.
})
- m.mapUpperHalf(m.kernel.PageTables)
var physicalRegionsReadOnly []physicalRegion
var physicalRegionsAvailable []physicalRegion
diff --git a/pkg/sentry/platform/kvm/machine_amd64.go b/pkg/sentry/platform/kvm/machine_amd64.go
index a8b729e62..8e03c310d 100644
--- a/pkg/sentry/platform/kvm/machine_amd64.go
+++ b/pkg/sentry/platform/kvm/machine_amd64.go
@@ -432,30 +432,27 @@ func availableRegionsForSetMem() (phyRegions []physicalRegion) {
return physicalRegions
}
-var execRegions = func() (regions []region) {
+func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
+ // Map all the executible regions so that all the entry functions
+ // are mapped in the upper half.
applyVirtualRegions(func(vr virtualRegion) {
if excludeVirtualRegion(vr) || vr.filename == "[vsyscall]" {
return
}
+
if vr.accessType.Execute {
- regions = append(regions, vr.region)
+ r := vr.region
+ physical, length, ok := translateToPhysical(r.virtual)
+ if !ok || length < r.length {
+ panic("impossible translation")
+ }
+ pageTable.Map(
+ usermem.Addr(ring0.KernelStartAddress|r.virtual),
+ r.length,
+ pagetables.MapOpts{AccessType: usermem.Execute},
+ physical)
}
})
- return
-}()
-
-func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
- for _, r := range execRegions {
- physical, length, ok := translateToPhysical(r.virtual)
- if !ok || length < r.length {
- panic("impossilbe translation")
- }
- pageTable.Map(
- usermem.Addr(ring0.KernelStartAddress|r.virtual),
- r.length,
- pagetables.MapOpts{AccessType: usermem.Execute},
- physical)
- }
for start, end := range m.kernel.EntryRegions() {
regionLen := end - start
physical, length, ok := translateToPhysical(start)
diff --git a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
index 1344ed3c9..fd92c3873 100644
--- a/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
+++ b/pkg/sentry/platform/kvm/machine_arm64_unsafe.go
@@ -221,7 +221,7 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
if regs := switchOpts.Registers; !ring0.IsCanonical(regs.Pc) {
return nonCanonical(regs.Pc, int32(syscall.SIGSEGV), info)
} else if !ring0.IsCanonical(regs.Sp) {
- return nonCanonical(regs.Sp, int32(syscall.SIGBUS), info)
+ return nonCanonical(regs.Sp, int32(syscall.SIGSEGV), info)
}
// Assign PCIDs.
@@ -257,11 +257,13 @@ func (c *vCPU) SwitchToUser(switchOpts ring0.SwitchOpts, info *arch.SignalInfo)
case ring0.PageFault:
return c.fault(int32(syscall.SIGSEGV), info)
+ case ring0.El0ErrNMI:
+ return c.fault(int32(syscall.SIGBUS), info)
case ring0.Vector(bounce): // ring0.VirtualizationException
return usermem.NoAccess, platform.ErrContextInterrupt
- case ring0.El0Sync_undef:
+ case ring0.El0SyncUndef:
return c.fault(int32(syscall.SIGILL), info)
- case ring0.El1Sync_undef:
+ case ring0.El1SyncUndef:
*info = arch.SignalInfo{
Signo: int32(syscall.SIGILL),
Code: 1, // ILL_ILLOPC (illegal opcode).
diff --git a/pkg/sentry/platform/ring0/aarch64.go b/pkg/sentry/platform/ring0/aarch64.go
index 87a573cc4..327d48465 100644
--- a/pkg/sentry/platform/ring0/aarch64.go
+++ b/pkg/sentry/platform/ring0/aarch64.go
@@ -58,46 +58,55 @@ type Vector uintptr
// Exception vectors.
const (
- El1SyncInvalid = iota
- El1IrqInvalid
- El1FiqInvalid
- El1ErrorInvalid
+ El1InvSync = iota
+ El1InvIrq
+ El1InvFiq
+ El1InvError
+
El1Sync
El1Irq
El1Fiq
- El1Error
+ El1Err
+
El0Sync
El0Irq
El0Fiq
- El0Error
- El0Sync_invalid
- El0Irq_invalid
- El0Fiq_invalid
- El0Error_invalid
- El1Sync_da
- El1Sync_ia
- El1Sync_sp_pc
- El1Sync_undef
- El1Sync_dbg
- El1Sync_inv
- El0Sync_svc
- El0Sync_da
- El0Sync_ia
- El0Sync_fpsimd_acc
- El0Sync_sve_acc
- El0Sync_sys
- El0Sync_sp_pc
- El0Sync_undef
- El0Sync_dbg
- El0Sync_inv
+ El0Err
+
+ El0InvSync
+ El0InvIrq
+ El0InvFiq
+ El0InvErr
+
+ El1SyncDa
+ El1SyncIa
+ El1SyncSpPc
+ El1SyncUndef
+ El1SyncDbg
+ El1SyncInv
+
+ El0SyncSVC
+ El0SyncDa
+ El0SyncIa
+ El0SyncFpsimdAcc
+ El0SyncSveAcc
+ El0SyncSys
+ El0SyncSpPc
+ El0SyncUndef
+ El0SyncDbg
+ El0SyncInv
+
+ El0ErrNMI
+ El0ErrBounce
+
_NR_INTERRUPTS
)
// System call vectors.
const (
- Syscall Vector = El0Sync_svc
- PageFault Vector = El0Sync_da
- VirtualizationException Vector = El0Error
+ Syscall Vector = El0SyncSVC
+ PageFault Vector = El0SyncDa
+ VirtualizationException Vector = El0ErrBounce
)
// VirtualAddressBits returns the number bits available for virtual addresses.
diff --git a/pkg/sentry/platform/ring0/defs.go b/pkg/sentry/platform/ring0/defs.go
index e6daf24df..f9765771e 100644
--- a/pkg/sentry/platform/ring0/defs.go
+++ b/pkg/sentry/platform/ring0/defs.go
@@ -23,6 +23,9 @@ import (
//
// This contains global state, shared by multiple CPUs.
type Kernel struct {
+ // PageTables are the kernel pagetables; this must be provided.
+ PageTables *pagetables.PageTables
+
KernelArchState
}
diff --git a/pkg/sentry/platform/ring0/defs_amd64.go b/pkg/sentry/platform/ring0/defs_amd64.go
index 00899273e..7a2275558 100644
--- a/pkg/sentry/platform/ring0/defs_amd64.go
+++ b/pkg/sentry/platform/ring0/defs_amd64.go
@@ -66,17 +66,9 @@ var (
KernelDataSegment SegmentDescriptor
)
-// KernelOpts has initialization options for the kernel.
-type KernelOpts struct {
- // PageTables are the kernel pagetables; this must be provided.
- PageTables *pagetables.PageTables
-}
-
// KernelArchState contains architecture-specific state.
type KernelArchState struct {
- KernelOpts
-
- // cpuEntries is array of kernelEntry for all cpus
+ // cpuEntries is array of kernelEntry for all cpus.
cpuEntries []kernelEntry
// globalIDT is our set of interrupt gates.
diff --git a/pkg/sentry/platform/ring0/defs_arm64.go b/pkg/sentry/platform/ring0/defs_arm64.go
index 508236e46..a014dcbc0 100644
--- a/pkg/sentry/platform/ring0/defs_arm64.go
+++ b/pkg/sentry/platform/ring0/defs_arm64.go
@@ -32,15 +32,8 @@ var (
KernelStartAddress = ^uintptr(0) - (UserspaceSize - 1)
)
-// KernelOpts has initialization options for the kernel.
-type KernelOpts struct {
- // PageTables are the kernel pagetables; this must be provided.
- PageTables *pagetables.PageTables
-}
-
// KernelArchState contains architecture-specific state.
type KernelArchState struct {
- KernelOpts
}
// CPUArchState contains CPU-specific arch state.
diff --git a/pkg/sentry/platform/ring0/entry_arm64.s b/pkg/sentry/platform/ring0/entry_arm64.s
index f9278b653..f489ad352 100644
--- a/pkg/sentry/platform/ring0/entry_arm64.s
+++ b/pkg/sentry/platform/ring0/entry_arm64.s
@@ -288,6 +288,10 @@
#define ESR_ELx_WFx_ISS_WFE (UL(1) << 0)
#define ESR_ELx_xVC_IMM_MASK ((1UL << 16) - 1)
+/* ISS field definitions for system error */
+#define ESR_ELx_SERR_MASK (0x1)
+#define ESR_ELx_SERR_NMI (0x1)
+
// LOAD_KERNEL_ADDRESS loads a kernel address.
#define LOAD_KERNEL_ADDRESS(from, to) \
MOVD from, to; \
@@ -691,7 +695,7 @@ el0_sp_pc:
B ·Shutdown(SB)
el0_undef:
- EXCEPTION_WITH_ERROR(1, El0Sync_undef)
+ EXCEPTION_WITH_ERROR(1, El0SyncUndef)
el0_dbg:
B ·Shutdown(SB)
@@ -707,6 +711,29 @@ TEXT ·El0_fiq(SB),NOSPLIT,$0
TEXT ·El0_error(SB),NOSPLIT,$0
KERNEL_ENTRY_FROM_EL0
+ WORD $0xd5385219 // MRS ESR_EL1, R25
+ AND $ESR_ELx_SERR_MASK, R25, R24
+ CMP $ESR_ELx_SERR_NMI, R24
+ BEQ el0_nmi
+ B el0_bounce
+el0_nmi:
+ WORD $0xd538d092 //MRS TPIDR_EL1, R18
+ WORD $0xd538601a //MRS FAR_EL1, R26
+
+ MOVD R26, CPU_FAULT_ADDR(RSV_REG)
+
+ MOVD $1, R3
+ MOVD R3, CPU_ERROR_TYPE(RSV_REG) // Set error type to user.
+
+ MOVD $El0ErrNMI, R3
+ MOVD R3, CPU_VECTOR_CODE(RSV_REG)
+
+ MRS ESR_EL1, R3
+ MOVD R3, CPU_ERROR_CODE(RSV_REG)
+
+ B ·kernelExitToEl1(SB)
+
+el0_bounce:
WORD $0xd538d092 //MRS TPIDR_EL1, R18
WORD $0xd538601a //MRS FAR_EL1, R26
@@ -718,7 +745,7 @@ TEXT ·El0_error(SB),NOSPLIT,$0
MOVD $VirtualizationException, R3
MOVD R3, CPU_VECTOR_CODE(RSV_REG)
- B ·HaltAndResume(SB)
+ B ·kernelExitToEl1(SB)
TEXT ·El0_sync_invalid(SB),NOSPLIT,$0
B ·Shutdown(SB)
diff --git a/pkg/sentry/platform/ring0/kernel.go b/pkg/sentry/platform/ring0/kernel.go
index 264be23d3..292f9d0cc 100644
--- a/pkg/sentry/platform/ring0/kernel.go
+++ b/pkg/sentry/platform/ring0/kernel.go
@@ -16,11 +16,9 @@ package ring0
// Init initializes a new kernel.
//
-// N.B. that constraints on KernelOpts must be satisfied.
-//
//go:nosplit
-func (k *Kernel) Init(opts KernelOpts, maxCPUs int) {
- k.init(opts, maxCPUs)
+func (k *Kernel) Init(maxCPUs int) {
+ k.init(maxCPUs)
}
// Halt halts execution.
diff --git a/pkg/sentry/platform/ring0/kernel_amd64.go b/pkg/sentry/platform/ring0/kernel_amd64.go
index 3a9dff4cc..b55dc29b3 100644
--- a/pkg/sentry/platform/ring0/kernel_amd64.go
+++ b/pkg/sentry/platform/ring0/kernel_amd64.go
@@ -24,10 +24,7 @@ import (
)
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
- // Save the root page tables.
- k.PageTables = opts.PageTables
-
+func (k *Kernel) init(maxCPUs int) {
entrySize := reflect.TypeOf(kernelEntry{}).Size()
var (
entries []kernelEntry
diff --git a/pkg/sentry/platform/ring0/kernel_arm64.go b/pkg/sentry/platform/ring0/kernel_arm64.go
index b294ccc7c..6cbbf001f 100644
--- a/pkg/sentry/platform/ring0/kernel_arm64.go
+++ b/pkg/sentry/platform/ring0/kernel_arm64.go
@@ -25,9 +25,7 @@ func HaltAndResume()
func HaltEl1SvcAndResume()
// init initializes architecture-specific state.
-func (k *Kernel) init(opts KernelOpts, maxCPUs int) {
- // Save the root page tables.
- k.PageTables = opts.PageTables
+func (k *Kernel) init(maxCPUs int) {
}
// init initializes architecture-specific state.
diff --git a/pkg/sentry/platform/ring0/offsets_arm64.go b/pkg/sentry/platform/ring0/offsets_arm64.go
index 45eba960d..53bc3353c 100644
--- a/pkg/sentry/platform/ring0/offsets_arm64.go
+++ b/pkg/sentry/platform/ring0/offsets_arm64.go
@@ -47,43 +47,36 @@ func Emit(w io.Writer) {
fmt.Fprintf(w, "#define _KERNEL_FLAGS 0x%02x\n", KernelFlagsSet)
fmt.Fprintf(w, "\n// Vectors.\n")
- fmt.Fprintf(w, "#define El1SyncInvalid 0x%02x\n", El1SyncInvalid)
- fmt.Fprintf(w, "#define El1IrqInvalid 0x%02x\n", El1IrqInvalid)
- fmt.Fprintf(w, "#define El1FiqInvalid 0x%02x\n", El1FiqInvalid)
- fmt.Fprintf(w, "#define El1ErrorInvalid 0x%02x\n", El1ErrorInvalid)
fmt.Fprintf(w, "#define El1Sync 0x%02x\n", El1Sync)
fmt.Fprintf(w, "#define El1Irq 0x%02x\n", El1Irq)
fmt.Fprintf(w, "#define El1Fiq 0x%02x\n", El1Fiq)
- fmt.Fprintf(w, "#define El1Error 0x%02x\n", El1Error)
+ fmt.Fprintf(w, "#define El1Err 0x%02x\n", El1Err)
fmt.Fprintf(w, "#define El0Sync 0x%02x\n", El0Sync)
fmt.Fprintf(w, "#define El0Irq 0x%02x\n", El0Irq)
fmt.Fprintf(w, "#define El0Fiq 0x%02x\n", El0Fiq)
- fmt.Fprintf(w, "#define El0Error 0x%02x\n", El0Error)
+ fmt.Fprintf(w, "#define El0Err 0x%02x\n", El0Err)
- fmt.Fprintf(w, "#define El0Sync_invalid 0x%02x\n", El0Sync_invalid)
- fmt.Fprintf(w, "#define El0Irq_invalid 0x%02x\n", El0Irq_invalid)
- fmt.Fprintf(w, "#define El0Fiq_invalid 0x%02x\n", El0Fiq_invalid)
- fmt.Fprintf(w, "#define El0Error_invalid 0x%02x\n", El0Error_invalid)
+ fmt.Fprintf(w, "#define El1SyncDa 0x%02x\n", El1SyncDa)
+ fmt.Fprintf(w, "#define El1SyncIa 0x%02x\n", El1SyncIa)
+ fmt.Fprintf(w, "#define El1SyncSpPc 0x%02x\n", El1SyncSpPc)
+ fmt.Fprintf(w, "#define El1SyncUndef 0x%02x\n", El1SyncUndef)
+ fmt.Fprintf(w, "#define El1SyncDbg 0x%02x\n", El1SyncDbg)
+ fmt.Fprintf(w, "#define El1SyncInv 0x%02x\n", El1SyncInv)
- fmt.Fprintf(w, "#define El1Sync_da 0x%02x\n", El1Sync_da)
- fmt.Fprintf(w, "#define El1Sync_ia 0x%02x\n", El1Sync_ia)
- fmt.Fprintf(w, "#define El1Sync_sp_pc 0x%02x\n", El1Sync_sp_pc)
- fmt.Fprintf(w, "#define El1Sync_undef 0x%02x\n", El1Sync_undef)
- fmt.Fprintf(w, "#define El1Sync_dbg 0x%02x\n", El1Sync_dbg)
- fmt.Fprintf(w, "#define El1Sync_inv 0x%02x\n", El1Sync_inv)
+ fmt.Fprintf(w, "#define El0SyncSVC 0x%02x\n", El0SyncSVC)
+ fmt.Fprintf(w, "#define El0SyncDa 0x%02x\n", El0SyncDa)
+ fmt.Fprintf(w, "#define El0SyncIa 0x%02x\n", El0SyncIa)
+ fmt.Fprintf(w, "#define El0SyncFpsimdAcc 0x%02x\n", El0SyncFpsimdAcc)
+ fmt.Fprintf(w, "#define El0SyncSveAcc 0x%02x\n", El0SyncSveAcc)
+ fmt.Fprintf(w, "#define El0SyncSys 0x%02x\n", El0SyncSys)
+ fmt.Fprintf(w, "#define El0SyncSpPc 0x%02x\n", El0SyncSpPc)
+ fmt.Fprintf(w, "#define El0SyncUndef 0x%02x\n", El0SyncUndef)
+ fmt.Fprintf(w, "#define El0SyncDbg 0x%02x\n", El0SyncDbg)
+ fmt.Fprintf(w, "#define El0SyncInv 0x%02x\n", El0SyncInv)
- fmt.Fprintf(w, "#define El0Sync_svc 0x%02x\n", El0Sync_svc)
- fmt.Fprintf(w, "#define El0Sync_da 0x%02x\n", El0Sync_da)
- fmt.Fprintf(w, "#define El0Sync_ia 0x%02x\n", El0Sync_ia)
- fmt.Fprintf(w, "#define El0Sync_fpsimd_acc 0x%02x\n", El0Sync_fpsimd_acc)
- fmt.Fprintf(w, "#define El0Sync_sve_acc 0x%02x\n", El0Sync_sve_acc)
- fmt.Fprintf(w, "#define El0Sync_sys 0x%02x\n", El0Sync_sys)
- fmt.Fprintf(w, "#define El0Sync_sp_pc 0x%02x\n", El0Sync_sp_pc)
- fmt.Fprintf(w, "#define El0Sync_undef 0x%02x\n", El0Sync_undef)
- fmt.Fprintf(w, "#define El0Sync_dbg 0x%02x\n", El0Sync_dbg)
- fmt.Fprintf(w, "#define El0Sync_inv 0x%02x\n", El0Sync_inv)
+ fmt.Fprintf(w, "#define El0ErrNMI 0x%02x\n", El0ErrNMI)
fmt.Fprintf(w, "#define PageFault 0x%02x\n", PageFault)
fmt.Fprintf(w, "#define Syscall 0x%02x\n", Syscall)
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables.go b/pkg/sentry/platform/ring0/pagetables/pagetables.go
index 7f18ac296..bc16a1622 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables.go
@@ -30,6 +30,10 @@ type PageTables struct {
Allocator Allocator
// root is the pagetable root.
+ //
+ // For same archs such as amd64, the upper of the PTEs is cloned
+ // from and owned by upperSharedPageTables which are shared among
+ // many PageTables if upperSharedPageTables is not nil.
root *PTEs
// rootPhysical is the cached physical address of the root.
@@ -39,15 +43,52 @@ type PageTables struct {
// archPageTables includes architecture-specific features.
archPageTables
+
+ // upperSharedPageTables represents a read-only shared upper
+ // of the Pagetable. When it is not nil, the upper is not
+ // allowed to be modified.
+ upperSharedPageTables *PageTables
+
+ // upperStart is the start address of the upper portion that
+ // are shared from upperSharedPageTables
+ upperStart uintptr
+
+ // readOnlyShared indicates the Pagetables are read-only and
+ // own the ranges that are shared with other Pagetables.
+ readOnlyShared bool
}
-// New returns new PageTables.
-func New(a Allocator) *PageTables {
+// NewWithUpper returns new PageTables.
+//
+// upperSharedPageTables are used for mapping the upper of addresses,
+// starting at upperStart. These pageTables should not be touched (as
+// invalidations may be incorrect) after they are passed as an
+// upperSharedPageTables. Only when all dependent PageTables are gone
+// may they be used. The intenteded use case is for kernel page tables,
+// which are static and fixed.
+//
+// Precondition: upperStart must be between canonical ranges.
+// Precondition: upperStart must be pgdSize aligned.
+// precondition: upperSharedPageTables must be marked read-only shared.
+func NewWithUpper(a Allocator, upperSharedPageTables *PageTables, upperStart uintptr) *PageTables {
p := new(PageTables)
p.Init(a)
+ if upperSharedPageTables != nil {
+ if !upperSharedPageTables.readOnlyShared {
+ panic("Only read-only shared pagetables can be used as upper")
+ }
+ p.upperSharedPageTables = upperSharedPageTables
+ p.upperStart = upperStart
+ p.cloneUpperShared()
+ }
return p
}
+// New returns new PageTables.
+func New(a Allocator) *PageTables {
+ return NewWithUpper(a, nil, 0)
+}
+
// mapVisitor is used for map.
type mapVisitor struct {
target uintptr // Input.
@@ -90,6 +131,21 @@ func (*mapVisitor) requiresSplit() bool { return true }
//
//go:nosplit
func (p *PageTables) Map(addr usermem.Addr, length uintptr, opts MapOpts, physical uintptr) bool {
+ if p.readOnlyShared {
+ panic("Should not modify read-only shared pagetables.")
+ }
+ if uintptr(addr)+length < uintptr(addr) {
+ panic("addr & length overflow")
+ }
+ if p.upperSharedPageTables != nil {
+ // ignore change to the read-only upper shared portion.
+ if uintptr(addr) >= p.upperStart {
+ return false
+ }
+ if uintptr(addr)+length > p.upperStart {
+ length = p.upperStart - uintptr(addr)
+ }
+ }
if !opts.AccessType.Any() {
return p.Unmap(addr, length)
}
@@ -128,12 +184,27 @@ func (v *unmapVisitor) visit(start uintptr, pte *PTE, align uintptr) {
//
// True is returned iff there was a previous mapping in the range.
//
-// Precondition: addr & length must be page-aligned.
+// Precondition: addr & length must be page-aligned, their sum must not overflow.
//
// +checkescape:hard,stack
//
//go:nosplit
func (p *PageTables) Unmap(addr usermem.Addr, length uintptr) bool {
+ if p.readOnlyShared {
+ panic("Should not modify read-only shared pagetables.")
+ }
+ if uintptr(addr)+length < uintptr(addr) {
+ panic("addr & length overflow")
+ }
+ if p.upperSharedPageTables != nil {
+ // ignore change to the read-only upper shared portion.
+ if uintptr(addr) >= p.upperStart {
+ return false
+ }
+ if uintptr(addr)+length > p.upperStart {
+ length = p.upperStart - uintptr(addr)
+ }
+ }
w := unmapWalker{
pageTables: p,
visitor: unmapVisitor{
@@ -218,3 +289,10 @@ func (p *PageTables) Lookup(addr usermem.Addr) (physical uintptr, opts MapOpts)
w.iterateRange(uintptr(addr), uintptr(addr)+1)
return w.visitor.physical + offset, w.visitor.opts
}
+
+// MarkReadOnlyShared marks the pagetables read-only and can be shared.
+//
+// It is usually used on the pagetables that are used as the upper
+func (p *PageTables) MarkReadOnlyShared() {
+ p.readOnlyShared = true
+}
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
index 520161755..a4e416af7 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_aarch64.go
@@ -24,14 +24,6 @@ import (
// archPageTables is architecture-specific data.
type archPageTables struct {
- // root is the pagetable root for kernel space.
- root *PTEs
-
- // rootPhysical is the cached physical address of the root.
- //
- // This is saved only to prevent constant translation.
- rootPhysical uintptr
-
asid uint16
}
@@ -46,7 +38,7 @@ func (p *PageTables) TTBR0_EL1(noFlush bool, asid uint16) uint64 {
//
//go:nosplit
func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
- return uint64(p.archPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
+ return uint64(p.upperSharedPageTables.rootPhysical) | (uint64(asid)&ttbrASIDMask)<<ttbrASIDOffset
}
// Bits in page table entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
index 0c153cf8c..e7ab887e5 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_amd64.go
@@ -50,5 +50,26 @@ func (p *PageTables) Init(allocator Allocator) {
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
}
+func pgdIndex(upperStart uintptr) uintptr {
+ if upperStart&(pgdSize-1) != 0 {
+ panic("upperStart should be pgd size aligned")
+ }
+ if upperStart >= upperBottom {
+ return entriesPerPage/2 + (upperStart-upperBottom)/pgdSize
+ }
+ if upperStart < lowerTop {
+ return upperStart / pgdSize
+ }
+ panic("upperStart should be in canonical range")
+}
+
+// cloneUpperShared clone the upper from the upper shared page tables.
+//
+//go:nosplit
+func (p *PageTables) cloneUpperShared() {
+ start := pgdIndex(p.upperStart)
+ copy(p.root[start:entriesPerPage], p.upperSharedPageTables.root[start:entriesPerPage])
+}
+
// PTEs is a collection of entries.
type PTEs [entriesPerPage]PTE
diff --git a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
index 5ddd10256..5392bf27a 100644
--- a/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/pagetables_arm64.go
@@ -49,8 +49,17 @@ func (p *PageTables) Init(allocator Allocator) {
p.Allocator = allocator
p.root = p.Allocator.NewPTEs()
p.rootPhysical = p.Allocator.PhysicalFor(p.root)
- p.archPageTables.root = p.Allocator.NewPTEs()
- p.archPageTables.rootPhysical = p.Allocator.PhysicalFor(p.archPageTables.root)
+}
+
+// cloneUpperShared clone the upper from the upper shared page tables.
+//
+//go:nosplit
+func (p *PageTables) cloneUpperShared() {
+ if p.upperStart != upperBottom {
+ panic("upperStart should be the same as upperBottom")
+ }
+
+ // nothing to do for arm.
}
// PTEs is a collection of entries.
diff --git a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
index c261d393a..157c9a7cc 100644
--- a/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
+++ b/pkg/sentry/platform/ring0/pagetables/walker_arm64.go
@@ -116,7 +116,7 @@ func next(start uintptr, size uintptr) uintptr {
func (w *Walker) iterateRangeCanonical(start, end uintptr) {
pgdEntryIndex := w.pageTables.root
if start >= upperBottom {
- pgdEntryIndex = w.pageTables.archPageTables.root
+ pgdEntryIndex = w.pageTables.upperSharedPageTables.root
}
for pgdIndex := (uint16((start & pgdMask) >> pgdShift)); start < end && pgdIndex < entriesPerPage; pgdIndex++ {
diff --git a/pkg/sentry/socket/netfilter/extensions.go b/pkg/sentry/socket/netfilter/extensions.go
index 549787955..e0976fed0 100644
--- a/pkg/sentry/socket/netfilter/extensions.go
+++ b/pkg/sentry/socket/netfilter/extensions.go
@@ -100,24 +100,43 @@ func unmarshalMatcher(match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf
// marshalTarget and unmarshalTarget can be used.
type targetMaker interface {
// id uniquely identifies the target.
- id() stack.TargetID
+ id() targetID
- // marshal converts from a stack.Target to an ABI struct.
- marshal(target stack.Target) []byte
+ // marshal converts from a target to an ABI struct.
+ marshal(target target) []byte
- // unmarshal converts from the ABI matcher struct to a stack.Target.
- unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error)
+ // unmarshal converts from the ABI matcher struct to a target.
+ unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error)
}
-// targetMakers maps the TargetID of supported targets to the targetMaker that
+// A targetID uniquely identifies a target.
+type targetID struct {
+ // name is the target name as stored in the xt_entry_target struct.
+ name string
+
+ // networkProtocol is the protocol to which the target applies.
+ networkProtocol tcpip.NetworkProtocolNumber
+
+ // revision is the version of the target.
+ revision uint8
+}
+
+// target extends a stack.Target, allowing it to be used with the extension
+// system. The sentry only uses targets, never stack.Targets directly.
+type target interface {
+ stack.Target
+ id() targetID
+}
+
+// targetMakers maps the targetID of supported targets to the targetMaker that
// marshals and unmarshals it. It is immutable after package initialization.
-var targetMakers = map[stack.TargetID]targetMaker{}
+var targetMakers = map[targetID]targetMaker{}
func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) {
- tid := stack.TargetID{
- Name: name,
- NetworkProtocol: netProto,
- Revision: rev,
+ tid := targetID{
+ name: name,
+ networkProtocol: netProto,
+ revision: rev,
}
if _, ok := targetMakers[tid]; !ok {
return 0, false
@@ -126,8 +145,8 @@ func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8
// Return the highest supported revision unless rev is higher.
for _, other := range targetMakers {
otherID := other.id()
- if name == otherID.Name && netProto == otherID.NetworkProtocol && otherID.Revision > rev {
- rev = uint8(otherID.Revision)
+ if name == otherID.name && netProto == otherID.networkProtocol && otherID.revision > rev {
+ rev = uint8(otherID.revision)
}
}
return rev, true
@@ -142,19 +161,21 @@ func registerTargetMaker(tm targetMaker) {
targetMakers[tm.id()] = tm
}
-func marshalTarget(target stack.Target) []byte {
- targetMaker, ok := targetMakers[target.ID()]
+func marshalTarget(tgt stack.Target) []byte {
+ // The sentry only uses targets, never stack.Targets directly.
+ target := tgt.(target)
+ targetMaker, ok := targetMakers[target.id()]
if !ok {
- panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.ID()))
+ panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.id()))
}
return targetMaker.marshal(target)
}
-func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (stack.Target, *syserr.Error) {
- tid := stack.TargetID{
- Name: target.Name.String(),
- NetworkProtocol: filter.NetworkProtocol(),
- Revision: target.Revision,
+func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (target, *syserr.Error) {
+ tid := targetID{
+ name: target.Name.String(),
+ networkProtocol: filter.NetworkProtocol(),
+ revision: target.Revision,
}
targetMaker, ok := targetMakers[tid]
if !ok {
diff --git a/pkg/sentry/socket/netfilter/ipv4.go b/pkg/sentry/socket/netfilter/ipv4.go
index b560fae0d..70c561cce 100644
--- a/pkg/sentry/socket/netfilter/ipv4.go
+++ b/pkg/sentry/socket/netfilter/ipv4.go
@@ -46,13 +46,13 @@ func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linu
return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
}
- table, ok := stk.IPTables().GetTable(tablename.String(), false)
+ id, ok := nameToID[tablename.String()]
if !ok {
return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
}
// Setup the info struct.
- entries, info := getEntries4(table, tablename)
+ entries, info := getEntries4(stk.IPTables().GetTable(id, false), tablename)
return entries, info, nil
}
diff --git a/pkg/sentry/socket/netfilter/ipv6.go b/pkg/sentry/socket/netfilter/ipv6.go
index 4253f7bf4..5dbb604f0 100644
--- a/pkg/sentry/socket/netfilter/ipv6.go
+++ b/pkg/sentry/socket/netfilter/ipv6.go
@@ -46,13 +46,13 @@ func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linu
return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
}
- table, ok := stk.IPTables().GetTable(tablename.String(), true)
+ id, ok := nameToID[tablename.String()]
if !ok {
return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
}
// Setup the info struct, which is the same in IPv4 and IPv6.
- entries, info := getEntries6(table, tablename)
+ entries, info := getEntries6(stk.IPTables().GetTable(id, true), tablename)
return entries, info, nil
}
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 904a12e38..b283d7229 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -42,6 +42,45 @@ func nflog(format string, args ...interface{}) {
}
}
+// Table names.
+const (
+ natTable = "nat"
+ mangleTable = "mangle"
+ filterTable = "filter"
+)
+
+// nameToID is immutable.
+var nameToID = map[string]stack.TableID{
+ natTable: stack.NATID,
+ mangleTable: stack.MangleID,
+ filterTable: stack.FilterID,
+}
+
+// DefaultLinuxTables returns the rules of stack.DefaultTables() wrapped for
+// compatibility with netfilter extensions.
+func DefaultLinuxTables() *stack.IPTables {
+ tables := stack.DefaultTables()
+ tables.VisitTargets(func(oldTarget stack.Target) stack.Target {
+ switch val := oldTarget.(type) {
+ case *stack.AcceptTarget:
+ return &acceptTarget{AcceptTarget: *val}
+ case *stack.DropTarget:
+ return &dropTarget{DropTarget: *val}
+ case *stack.ErrorTarget:
+ return &errorTarget{ErrorTarget: *val}
+ case *stack.UserChainTarget:
+ return &userChainTarget{UserChainTarget: *val}
+ case *stack.ReturnTarget:
+ return &returnTarget{ReturnTarget: *val}
+ case *stack.RedirectTarget:
+ return &redirectTarget{RedirectTarget: *val}
+ default:
+ panic(fmt.Sprintf("Unknown rule in default iptables of type %T", val))
+ }
+ })
+ return tables
+}
+
// GetInfo returns information about iptables.
func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr usermem.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) {
// Read in the struct and table name.
@@ -144,9 +183,9 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
// TODO(gvisor.dev/issue/170): Support other tables.
var table stack.Table
switch replace.Name.String() {
- case stack.FilterTable:
+ case filterTable:
table = stack.EmptyFilterTable()
- case stack.NATTable:
+ case natTable:
table = stack.EmptyNATTable()
default:
nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
@@ -177,7 +216,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
}
if offset == replace.Underflow[hook] {
if !validUnderflow(table.Rules[ruleIdx], ipv6) {
- nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
+ nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP: %+v", ruleIdx)
return syserr.ErrInvalidArgument
}
table.Underflows[hk] = ruleIdx
@@ -253,8 +292,7 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
// - There are no chains without an unconditional final rule.
// - There are no chains without an unconditional underflow rule.
- return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(replace.Name.String(), table, ipv6))
-
+ return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(nameToID[replace.Name.String()], table, ipv6))
}
// parseMatchers parses 0 or more matchers from optVal. optVal should contain
@@ -308,7 +346,7 @@ func validUnderflow(rule stack.Rule, ipv6 bool) bool {
return false
}
switch rule.Target.(type) {
- case *stack.AcceptTarget, *stack.DropTarget:
+ case *acceptTarget, *dropTarget:
return true
default:
return false
@@ -319,7 +357,7 @@ func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool {
if !validUnderflow(rule, ipv6) {
return false
}
- _, ok := rule.Target.(*stack.AcceptTarget)
+ _, ok := rule.Target.(*acceptTarget)
return ok
}
diff --git a/pkg/sentry/socket/netfilter/targets.go b/pkg/sentry/socket/netfilter/targets.go
index 0e14447fe..f2653d523 100644
--- a/pkg/sentry/socket/netfilter/targets.go
+++ b/pkg/sentry/socket/netfilter/targets.go
@@ -26,6 +26,15 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
+// ErrorTargetName is used to mark targets as error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const ErrorTargetName = "ERROR"
+
+// RedirectTargetName is used to mark targets as redirect targets. Redirect
+// targets should be reached for only NAT and Mangle tables. These targets will
+// change the destination port and/or IP for packets.
+const RedirectTargetName = "REDIRECT"
+
func init() {
// Standard targets include ACCEPT, DROP, RETURN, and JUMP.
registerTargetMaker(&standardTargetMaker{
@@ -52,25 +61,96 @@ func init() {
})
}
+// The stack package provides some basic, useful targets for us. The following
+// types wrap them for compatibility with the extension system.
+
+type acceptTarget struct {
+ stack.AcceptTarget
+}
+
+func (at *acceptTarget) id() targetID {
+ return targetID{
+ networkProtocol: at.NetworkProtocol,
+ }
+}
+
+type dropTarget struct {
+ stack.DropTarget
+}
+
+func (dt *dropTarget) id() targetID {
+ return targetID{
+ networkProtocol: dt.NetworkProtocol,
+ }
+}
+
+type errorTarget struct {
+ stack.ErrorTarget
+}
+
+func (et *errorTarget) id() targetID {
+ return targetID{
+ name: ErrorTargetName,
+ networkProtocol: et.NetworkProtocol,
+ }
+}
+
+type userChainTarget struct {
+ stack.UserChainTarget
+}
+
+func (uc *userChainTarget) id() targetID {
+ return targetID{
+ name: ErrorTargetName,
+ networkProtocol: uc.NetworkProtocol,
+ }
+}
+
+type returnTarget struct {
+ stack.ReturnTarget
+}
+
+func (rt *returnTarget) id() targetID {
+ return targetID{
+ networkProtocol: rt.NetworkProtocol,
+ }
+}
+
+type redirectTarget struct {
+ stack.RedirectTarget
+
+ // addr must be (un)marshalled when reading and writing the target to
+ // userspace, but does not affect behavior.
+ addr tcpip.Address
+}
+
+func (rt *redirectTarget) id() targetID {
+ return targetID{
+ name: RedirectTargetName,
+ networkProtocol: rt.NetworkProtocol,
+ }
+}
+
type standardTargetMaker struct {
NetworkProtocol tcpip.NetworkProtocolNumber
}
-func (sm *standardTargetMaker) id() stack.TargetID {
+func (sm *standardTargetMaker) id() targetID {
// Standard targets have the empty string as a name and no revisions.
- return stack.TargetID{
- NetworkProtocol: sm.NetworkProtocol,
+ return targetID{
+ networkProtocol: sm.NetworkProtocol,
}
}
-func (*standardTargetMaker) marshal(target stack.Target) []byte {
+
+func (*standardTargetMaker) marshal(target target) []byte {
// Translate verdicts the same way as the iptables tool.
var verdict int32
switch tg := target.(type) {
- case *stack.AcceptTarget:
+ case *acceptTarget:
verdict = -linux.NF_ACCEPT - 1
- case *stack.DropTarget:
+ case *dropTarget:
verdict = -linux.NF_DROP - 1
- case *stack.ReturnTarget:
+ case *returnTarget:
verdict = linux.NF_RETURN
case *JumpTarget:
verdict = int32(tg.Offset)
@@ -90,7 +170,7 @@ func (*standardTargetMaker) marshal(target stack.Target) []byte {
return binary.Marshal(ret, usermem.ByteOrder, xt)
}
-func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
if len(buf) != linux.SizeOfXTStandardTarget {
nflog("buf has wrong size for standard target %d", len(buf))
return nil, syserr.ErrInvalidArgument
@@ -114,20 +194,20 @@ type errorTargetMaker struct {
NetworkProtocol tcpip.NetworkProtocolNumber
}
-func (em *errorTargetMaker) id() stack.TargetID {
+func (em *errorTargetMaker) id() targetID {
// Error targets have no revision.
- return stack.TargetID{
- Name: stack.ErrorTargetName,
- NetworkProtocol: em.NetworkProtocol,
+ return targetID{
+ name: ErrorTargetName,
+ networkProtocol: em.NetworkProtocol,
}
}
-func (*errorTargetMaker) marshal(target stack.Target) []byte {
+func (*errorTargetMaker) marshal(target target) []byte {
var errorName string
switch tg := target.(type) {
- case *stack.ErrorTarget:
- errorName = stack.ErrorTargetName
- case *stack.UserChainTarget:
+ case *errorTarget:
+ errorName = ErrorTargetName
+ case *userChainTarget:
errorName = tg.Name
default:
panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target))
@@ -140,37 +220,38 @@ func (*errorTargetMaker) marshal(target stack.Target) []byte {
},
}
copy(xt.Name[:], errorName)
- copy(xt.Target.Name[:], stack.ErrorTargetName)
+ copy(xt.Target.Name[:], ErrorTargetName)
ret := make([]byte, 0, linux.SizeOfXTErrorTarget)
return binary.Marshal(ret, usermem.ByteOrder, xt)
}
-func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
if len(buf) != linux.SizeOfXTErrorTarget {
nflog("buf has insufficient size for error target %d", len(buf))
return nil, syserr.ErrInvalidArgument
}
- var errorTarget linux.XTErrorTarget
+ var errTgt linux.XTErrorTarget
buf = buf[:linux.SizeOfXTErrorTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &errorTarget)
+ binary.Unmarshal(buf, usermem.ByteOrder, &errTgt)
// Error targets are used in 2 cases:
- // * An actual error case. These rules have an error
- // named stack.ErrorTargetName. The last entry of the table
- // is usually an error case to catch any packets that
- // somehow fall through every rule.
+ // * An actual error case. These rules have an error named
+ // ErrorTargetName. The last entry of the table is usually an error
+ // case to catch any packets that somehow fall through every rule.
// * To mark the start of a user defined chain. These
// rules have an error with the name of the chain.
- switch name := errorTarget.Name.String(); name {
- case stack.ErrorTargetName:
- return &stack.ErrorTarget{NetworkProtocol: filter.NetworkProtocol()}, nil
+ switch name := errTgt.Name.String(); name {
+ case ErrorTargetName:
+ return &errorTarget{stack.ErrorTarget{
+ NetworkProtocol: filter.NetworkProtocol(),
+ }}, nil
default:
// User defined chain.
- return &stack.UserChainTarget{
+ return &userChainTarget{stack.UserChainTarget{
Name: name,
NetworkProtocol: filter.NetworkProtocol(),
- }, nil
+ }}, nil
}
}
@@ -178,22 +259,22 @@ type redirectTargetMaker struct {
NetworkProtocol tcpip.NetworkProtocolNumber
}
-func (rm *redirectTargetMaker) id() stack.TargetID {
- return stack.TargetID{
- Name: stack.RedirectTargetName,
- NetworkProtocol: rm.NetworkProtocol,
+func (rm *redirectTargetMaker) id() targetID {
+ return targetID{
+ name: RedirectTargetName,
+ networkProtocol: rm.NetworkProtocol,
}
}
-func (*redirectTargetMaker) marshal(target stack.Target) []byte {
- rt := target.(*stack.RedirectTarget)
+func (*redirectTargetMaker) marshal(target target) []byte {
+ rt := target.(*redirectTarget)
// This is a redirect target named redirect
xt := linux.XTRedirectTarget{
Target: linux.XTEntryTarget{
TargetSize: linux.SizeOfXTRedirectTarget,
},
}
- copy(xt.Target.Name[:], stack.RedirectTargetName)
+ copy(xt.Target.Name[:], RedirectTargetName)
ret := make([]byte, 0, linux.SizeOfXTRedirectTarget)
xt.NfRange.RangeSize = 1
@@ -203,7 +284,7 @@ func (*redirectTargetMaker) marshal(target stack.Target) []byte {
return binary.Marshal(ret, usermem.ByteOrder, xt)
}
-func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
if len(buf) < linux.SizeOfXTRedirectTarget {
nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf))
return nil, syserr.ErrInvalidArgument
@@ -214,15 +295,17 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (
return nil, syserr.ErrInvalidArgument
}
- var redirectTarget linux.XTRedirectTarget
+ var rt linux.XTRedirectTarget
buf = buf[:linux.SizeOfXTRedirectTarget]
- binary.Unmarshal(buf, usermem.ByteOrder, &redirectTarget)
+ binary.Unmarshal(buf, usermem.ByteOrder, &rt)
// Copy linux.XTRedirectTarget to stack.RedirectTarget.
- target := stack.RedirectTarget{NetworkProtocol: filter.NetworkProtocol()}
+ target := redirectTarget{RedirectTarget: stack.RedirectTarget{
+ NetworkProtocol: filter.NetworkProtocol(),
+ }}
// RangeSize should be 1.
- nfRange := redirectTarget.NfRange
+ nfRange := rt.NfRange
if nfRange.RangeSize != 1 {
nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize)
return nil, syserr.ErrInvalidArgument
@@ -247,7 +330,7 @@ func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (
return nil, syserr.ErrInvalidArgument
}
- target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
+ target.addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:])
target.Port = ntohs(nfRange.RangeIPV4.MinPort)
return &target, nil
@@ -264,15 +347,15 @@ type nfNATTargetMaker struct {
NetworkProtocol tcpip.NetworkProtocolNumber
}
-func (rm *nfNATTargetMaker) id() stack.TargetID {
- return stack.TargetID{
- Name: stack.RedirectTargetName,
- NetworkProtocol: rm.NetworkProtocol,
+func (rm *nfNATTargetMaker) id() targetID {
+ return targetID{
+ name: RedirectTargetName,
+ networkProtocol: rm.NetworkProtocol,
}
}
-func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
- rt := target.(*stack.RedirectTarget)
+func (*nfNATTargetMaker) marshal(target target) []byte {
+ rt := target.(*redirectTarget)
nt := nfNATTarget{
Target: linux.XTEntryTarget{
TargetSize: nfNATMarhsalledSize,
@@ -281,9 +364,9 @@ func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED,
},
}
- copy(nt.Target.Name[:], stack.RedirectTargetName)
- copy(nt.Range.MinAddr[:], rt.Addr)
- copy(nt.Range.MaxAddr[:], rt.Addr)
+ copy(nt.Target.Name[:], RedirectTargetName)
+ copy(nt.Range.MinAddr[:], rt.addr)
+ copy(nt.Range.MaxAddr[:], rt.addr)
nt.Range.MinProto = htons(rt.Port)
nt.Range.MaxProto = nt.Range.MinProto
@@ -292,7 +375,7 @@ func (*nfNATTargetMaker) marshal(target stack.Target) []byte {
return binary.Marshal(ret, usermem.ByteOrder, nt)
}
-func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.Target, *syserr.Error) {
+func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) {
if size := nfNATMarhsalledSize; len(buf) < size {
nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size)
return nil, syserr.ErrInvalidArgument
@@ -324,10 +407,12 @@ func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (sta
return nil, syserr.ErrInvalidArgument
}
- target := stack.RedirectTarget{
- NetworkProtocol: filter.NetworkProtocol(),
- Addr: tcpip.Address(natRange.MinAddr[:]),
- Port: ntohs(natRange.MinProto),
+ target := redirectTarget{
+ RedirectTarget: stack.RedirectTarget{
+ NetworkProtocol: filter.NetworkProtocol(),
+ Port: ntohs(natRange.MinProto),
+ },
+ addr: tcpip.Address(natRange.MinAddr[:]),
}
return &target, nil
@@ -335,18 +420,24 @@ func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (sta
// translateToStandardTarget translates from the value in a
// linux.XTStandardTarget to an stack.Verdict.
-func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (stack.Target, *syserr.Error) {
+func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (target, *syserr.Error) {
// TODO(gvisor.dev/issue/170): Support other verdicts.
switch val {
case -linux.NF_ACCEPT - 1:
- return &stack.AcceptTarget{NetworkProtocol: netProto}, nil
+ return &acceptTarget{stack.AcceptTarget{
+ NetworkProtocol: netProto,
+ }}, nil
case -linux.NF_DROP - 1:
- return &stack.DropTarget{NetworkProtocol: netProto}, nil
+ return &dropTarget{stack.DropTarget{
+ NetworkProtocol: netProto,
+ }}, nil
case -linux.NF_QUEUE - 1:
nflog("unsupported iptables verdict QUEUE")
return nil, syserr.ErrInvalidArgument
case linux.NF_RETURN:
- return &stack.ReturnTarget{NetworkProtocol: netProto}, nil
+ return &returnTarget{stack.ReturnTarget{
+ NetworkProtocol: netProto,
+ }}, nil
default:
nflog("unknown iptables verdict %d", val)
return nil, syserr.ErrInvalidArgument
@@ -382,9 +473,9 @@ type JumpTarget struct {
}
// ID implements Target.ID.
-func (jt *JumpTarget) ID() stack.TargetID {
- return stack.TargetID{
- NetworkProtocol: jt.NetworkProtocol,
+func (jt *JumpTarget) id() targetID {
+ return targetID{
+ networkProtocol: jt.NetworkProtocol,
}
}
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index 22216158e..f4d034c13 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -487,7 +487,7 @@ func (p *Protocol) delAddr(ctx context.Context, msg *netlink.Message, ms *netlin
Addr: value,
})
if err != nil {
- return syserr.ErrInvalidArgument
+ return syserr.ErrBadLocalAddress
}
case linux.IFA_ADDRESS:
default:
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index aa4f3c04d..6d9e502bd 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -142,9 +142,9 @@ func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (E
}
q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: initialLimit}
- q1.EnableLeakCheck()
+ q1.InitRefs()
q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: initialLimit}
- q2.EnableLeakCheck()
+ q2.InitRefs()
if stype == linux.SOCK_STREAM {
a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
@@ -300,14 +300,14 @@ func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce Conn
}
readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: initialLimit}
- readQueue.EnableLeakCheck()
+ readQueue.InitRefs()
ne.connected = &connectedEndpoint{
endpoint: ce,
writeQueue: readQueue,
}
writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: initialLimit}
- writeQueue.EnableLeakCheck()
+ writeQueue.InitRefs()
if e.stype == linux.SOCK_STREAM {
ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}}
} else {
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index f8aacca13..1406971bc 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -42,7 +42,7 @@ var (
func NewConnectionless(ctx context.Context) Endpoint {
ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: initialLimit}
- q.EnableLeakCheck()
+ q.InitRefs()
ep.receiver = &queueReceiver{readQueue: &q}
return ep
}
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index adad485a9..b32bb7ba8 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -80,7 +80,7 @@ func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, sty
stype: stype,
},
}
- s.EnableLeakCheck()
+ s.InitRefs()
return fs.NewFile(ctx, d, flags, &s)
}
diff --git a/pkg/sentry/socket/unix/unix_vfs2.go b/pkg/sentry/socket/unix/unix_vfs2.go
index 7a78444dc..eaf0b0d26 100644
--- a/pkg/sentry/socket/unix/unix_vfs2.go
+++ b/pkg/sentry/socket/unix/unix_vfs2.go
@@ -80,7 +80,7 @@ func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint3
stype: stype,
},
}
- sock.EnableLeakCheck()
+ sock.InitRefs()
sock.LockFD.Init(locks)
vfsfd := &sock.vfsfd
if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 36902d177..bb1f715e2 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -118,7 +118,7 @@ var AMD64 = &kernel.SyscallTable{
63: syscalls.Supported("uname", Uname),
64: syscalls.Supported("semget", Semget),
65: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
- 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 66: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
67: syscalls.Supported("shmdt", Shmdt),
68: syscalls.ErrorWithEvent("msgget", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
69: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
@@ -619,7 +619,7 @@ var ARM64 = &kernel.SyscallTable{
188: syscalls.ErrorWithEvent("msgrcv", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
189: syscalls.ErrorWithEvent("msgsnd", syserror.ENOSYS, "", []string{"gvisor.dev/issue/135"}), // TODO(b/29354921)
190: syscalls.Supported("semget", Semget),
- 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY, GETNCNT, GETZCNT not supported.", nil),
+ 191: syscalls.PartiallySupported("semctl", Semctl, "Options IPC_INFO, SEM_INFO, SEM_STAT, SEM_STAT_ANY not supported.", nil),
192: syscalls.ErrorWithEvent("semtimedop", syserror.ENOSYS, "", []string{"gvisor.dev/issue/137"}),
193: syscalls.PartiallySupported("semop", Semop, "Option SEM_UNDO not supported.", nil),
194: syscalls.PartiallySupported("shmget", Shmget, "Option SHM_HUGETLB is not supported.", nil),
diff --git a/pkg/sentry/syscalls/linux/sys_pipe.go b/pkg/sentry/syscalls/linux/sys_pipe.go
index 849a47476..f7135ea46 100644
--- a/pkg/sentry/syscalls/linux/sys_pipe.go
+++ b/pkg/sentry/syscalls/linux/sys_pipe.go
@@ -32,7 +32,7 @@ func pipe2(t *kernel.Task, addr usermem.Addr, flags uint) (uintptr, error) {
if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
return 0, syserror.EINVAL
}
- r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize, usermem.PageSize)
+ r, w := pipe.NewConnectedPipe(t, pipe.DefaultPipeSize)
r.SetFlags(linuxToFlags(flags).Settable())
defer r.DecRef(t)
diff --git a/pkg/sentry/syscalls/linux/sys_sem.go b/pkg/sentry/syscalls/linux/sys_sem.go
index c2d4bf805..e383a0a87 100644
--- a/pkg/sentry/syscalls/linux/sys_sem.go
+++ b/pkg/sentry/syscalls/linux/sys_sem.go
@@ -138,12 +138,18 @@ func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, err
+ case linux.GETZCNT:
+ v, err := getZCnt(t, id, num)
+ return uintptr(v), nil, err
+
+ case linux.GETNCNT:
+ v, err := getNCnt(t, id, num)
+ return uintptr(v), nil, err
+
case linux.IPC_INFO,
linux.SEM_INFO,
linux.SEM_STAT,
- linux.SEM_STAT_ANY,
- linux.GETNCNT,
- linux.GETZCNT:
+ linux.SEM_STAT_ANY:
t.Kernel().EmitUnimplementedEvent(t)
fallthrough
@@ -258,3 +264,23 @@ func getPID(t *kernel.Task, id int32, num int32) (int32, error) {
}
return int32(tg.ID()), nil
}
+
+func getZCnt(t *kernel.Task, id int32, num int32) (uint16, error) {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return 0, syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ return set.CountZeroWaiters(num, creds)
+}
+
+func getNCnt(t *kernel.Task, id int32, num int32) (uint16, error) {
+ r := t.IPCNamespace().SemaphoreRegistry()
+ set := r.FindByID(id)
+ if set == nil {
+ return 0, syserror.EINVAL
+ }
+ creds := auth.CredentialsFromContext(t)
+ return set.CountNegativeWaiters(num, creds)
+}
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index 46616c961..1c4cdb0dd 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -41,6 +41,7 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
inCh chan struct{}
outCh chan struct{}
)
+
for opts.Length > 0 {
n, err = fs.Splice(t, outFile, inFile, opts)
opts.Length -= n
@@ -61,23 +62,28 @@ func doSplice(t *kernel.Task, outFile, inFile *fs.File, opts fs.SpliceOpts, nonB
inW, _ := waiter.NewChannelEntry(inCh)
inFile.EventRegister(&inW, EventMaskRead)
defer inFile.EventUnregister(&inW)
- continue // Need to refresh readiness.
+ // Need to refresh readiness.
+ continue
}
if err = t.Block(inCh); err != nil {
break
}
}
- if outFile.Readiness(EventMaskWrite) == 0 {
- if outCh == nil {
- outCh = make(chan struct{}, 1)
- outW, _ := waiter.NewChannelEntry(outCh)
- outFile.EventRegister(&outW, EventMaskWrite)
- defer outFile.EventUnregister(&outW)
- continue // Need to refresh readiness.
- }
- if err = t.Block(outCh); err != nil {
- break
- }
+ // Don't bother checking readiness of the outFile, because it's not a
+ // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds
+ // can be "ready" but will reject writes of certain sizes with
+ // EWOULDBLOCK.
+ if outCh == nil {
+ outCh = make(chan struct{}, 1)
+ outW, _ := waiter.NewChannelEntry(outCh)
+ outFile.EventRegister(&outW, EventMaskWrite)
+ defer outFile.EventUnregister(&outW)
+ // We might be ready to write now. Try again before
+ // blocking.
+ continue
+ }
+ if err = t.Block(outCh); err != nil {
+ break
}
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/splice.go b/pkg/sentry/syscalls/linux/vfs2/splice.go
index 035e2a6b0..9ce4f280a 100644
--- a/pkg/sentry/syscalls/linux/vfs2/splice.go
+++ b/pkg/sentry/syscalls/linux/vfs2/splice.go
@@ -480,18 +480,17 @@ func (dw *dualWaiter) waitForBoth(t *kernel.Task) error {
// waitForOut waits for dw.outfile to be read.
func (dw *dualWaiter) waitForOut(t *kernel.Task) error {
- if dw.outFile.Readiness(eventMaskWrite)&eventMaskWrite == 0 {
- if dw.outCh == nil {
- dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
- dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
- // We might be ready now. Try again before blocking.
- return nil
- }
- if err := t.Block(dw.outCh); err != nil {
- return err
- }
- }
- return nil
+ // Don't bother checking readiness of the outFile, because it's not a
+ // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds
+ // can be "ready" but will reject writes of certain sizes with
+ // EWOULDBLOCK. See b/172075629, b/170743336.
+ if dw.outCh == nil {
+ dw.outW, dw.outCh = waiter.NewChannelEntry(nil)
+ dw.outFile.EventRegister(&dw.outW, eventMaskWrite)
+ // We might be ready to write now. Try again before blocking.
+ return nil
+ }
+ return t.Block(dw.outCh)
}
// destroy cleans up resources help by dw. No more calls to wait* can occur
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 546e445aa..936f9fc71 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -133,7 +133,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mou
}
}
- fd.EnableLeakCheck()
+ fd.InitRefs()
// Remove "file creation flags" to mirror the behavior from file.f_flags in
// fs/open.c:do_dentry_open.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index c93d94634..2c4b81e78 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -48,7 +48,7 @@ type Filesystem struct {
// Init must be called before first use of fs.
func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) {
- fs.EnableLeakCheck()
+ fs.InitRefs()
fs.vfs = vfsObj
fs.fsType = fsType
fs.impl = impl
diff --git a/pkg/sentry/vfs/inotify.go b/pkg/sentry/vfs/inotify.go
index 3f0b8f45b..107171b61 100644
--- a/pkg/sentry/vfs/inotify.go
+++ b/pkg/sentry/vfs/inotify.go
@@ -65,7 +65,7 @@ type Inotify struct {
// queue is used to notify interested parties when the inotify instance
// becomes readable or writable.
- queue waiter.Queue `state:"nosave"`
+ queue waiter.Queue
// evMu *only* protects the events list. We need a separate lock while
// queuing events: using mu may violate lock ordering, since at that point
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 3ea981ad4..d865fd603 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -169,7 +169,7 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
Owner: creds.UserNamespace,
mountpoints: make(map[*Dentry]uint32),
}
- mntns.EnableLeakCheck()
+ mntns.InitRefs()
mntns.root = newMount(vfs, fs, root, mntns, opts)
return mntns, nil
}
@@ -477,7 +477,9 @@ func (mnt *Mount) tryIncMountedRef() bool {
return false
}
if atomic.CompareAndSwapInt64(&mnt.refs, r, r+1) {
- refsvfs2.LogTryIncRef(mnt, r+1)
+ if mnt.LogRefs() {
+ refsvfs2.LogTryIncRef(mnt, r+1)
+ }
return true
}
}
@@ -488,12 +490,17 @@ func (mnt *Mount) IncRef() {
// In general, negative values for mnt.refs are valid because the MSB is
// the eager-unmount bit.
r := atomic.AddInt64(&mnt.refs, 1)
- refsvfs2.LogIncRef(mnt, r)
+ if mnt.LogRefs() {
+ refsvfs2.LogIncRef(mnt, r)
+ }
}
// DecRef decrements mnt's reference count.
func (mnt *Mount) DecRef(ctx context.Context) {
r := atomic.AddInt64(&mnt.refs, -1)
+ if mnt.LogRefs() {
+ refsvfs2.LogDecRef(mnt, r)
+ }
if r&^math.MinInt64 == 0 { // mask out MSB
refsvfs2.Unregister(mnt)
mnt.destroy(ctx)
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index cb8c56bd3..cb882a983 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -29,7 +29,7 @@ func TestMountTableLookupEmpty(t *testing.T) {
parent := &Mount{}
point := &Dentry{}
if m := mt.Lookup(parent, point); m != nil {
- t.Errorf("empty mountTable lookup: got %p, wanted nil", m)
+ t.Errorf("Empty mountTable lookup: got %p, wanted nil", m)
}
}
@@ -111,13 +111,16 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
k := keys[i&(numMounts-1)]
m := mt.Lookup(k.mount, k.dentry)
if m == nil {
- b.Fatalf("lookup failed")
+ b.Errorf("Lookup failed")
+ return
}
if parent := m.parent(); parent != k.mount {
- b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ b.Errorf("Lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ return
}
if point := m.point(); point != k.dentry {
- b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+ b.Errorf("Lookup returned mount with point %p, wanted %p", point, k.dentry)
+ return
}
}
}()
@@ -167,13 +170,16 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
m := ms[k]
mu.RUnlock()
if m == nil {
- b.Fatalf("lookup failed")
+ b.Errorf("Lookup failed")
+ return
}
if parent := m.parent(); parent != k.mount {
- b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ b.Errorf("Lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ return
}
if point := m.point(); point != k.dentry {
- b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+ b.Errorf("Lookup returned mount with point %p, wanted %p", point, k.dentry)
+ return
}
}
}()
@@ -220,14 +226,17 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
k := keys[i&(numMounts-1)]
mi, ok := ms.Load(k)
if !ok {
- b.Fatalf("lookup failed")
+ b.Errorf("Lookup failed")
+ return
}
m := mi.(*Mount)
if parent := m.parent(); parent != k.mount {
- b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ b.Errorf("Lookup returned mount with parent %p, wanted %p", parent, k.mount)
+ return
}
if point := m.point(); point != k.dentry {
- b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+ b.Errorf("Lookup returned mount with point %p, wanted %p", point, k.dentry)
+ return
}
}
}()
@@ -264,7 +273,7 @@ func BenchmarkMountTableNegativeLookup(b *testing.B) {
k := negkeys[i&(numMounts-1)]
m := mt.Lookup(k.mount, k.dentry)
if m != nil {
- b.Fatalf("lookup got %p, wanted nil", m)
+ b.Fatalf("Lookup got %p, wanted nil", m)
}
}
})
@@ -300,7 +309,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
m := ms[k]
mu.RUnlock()
if m != nil {
- b.Fatalf("lookup got %p, wanted nil", m)
+ b.Fatalf("Lookup got %p, wanted nil", m)
}
}
})
@@ -333,7 +342,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
k := negkeys[i&(numMounts-1)]
m, _ := ms.Load(k)
if m != nil {
- b.Fatalf("lookup got %p, wanted nil", m)
+ b.Fatalf("Lookup got %p, wanted nil", m)
}
}
})