summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/devices/memdev/BUILD5
-rw-r--r--pkg/sentry/devices/memdev/zero.go24
-rw-r--r--pkg/sentry/fs/g3doc/fuse.md59
-rw-r--r--pkg/sentry/fsimpl/devpts/devpts.go2
-rw-r--r--pkg/sentry/fsimpl/eventfd/eventfd.go8
-rw-r--r--pkg/sentry/fsimpl/ext/filesystem.go2
-rw-r--r--pkg/sentry/fsimpl/ext/symlink.go2
-rw-r--r--pkg/sentry/fsimpl/fuse/dev.go7
-rw-r--r--pkg/sentry/fsimpl/fuse/directory.go10
-rw-r--r--pkg/sentry/fsimpl/fuse/fusefs.go3
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go8
-rw-r--r--pkg/sentry/fsimpl/host/host.go42
-rw-r--r--pkg/sentry/fsimpl/host/tty.go10
-rw-r--r--pkg/sentry/fsimpl/kernfs/filesystem.go2
-rw-r--r--pkg/sentry/fsimpl/kernfs/inode_impl_util.go6
-rw-r--r--pkg/sentry/fsimpl/kernfs/symlink.go2
-rw-r--r--pkg/sentry/fsimpl/overlay/copy_up.go42
-rw-r--r--pkg/sentry/fsimpl/overlay/directory.go7
-rw-r--r--pkg/sentry/fsimpl/overlay/filesystem.go249
-rw-r--r--pkg/sentry/fsimpl/overlay/overlay.go28
-rw-r--r--pkg/sentry/fsimpl/proc/subtasks.go12
-rw-r--r--pkg/sentry/fsimpl/proc/task.go10
-rw-r--r--pkg/sentry/fsimpl/proc/task_fds.go16
-rw-r--r--pkg/sentry/fsimpl/proc/task_files.go14
-rw-r--r--pkg/sentry/fsimpl/proc/task_net.go6
-rw-r--r--pkg/sentry/fsimpl/proc/tasks.go8
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_files.go6
-rw-r--r--pkg/sentry/fsimpl/proc/tasks_sys.go6
-rw-r--r--pkg/sentry/fsimpl/signalfd/signalfd.go6
-rw-r--r--pkg/sentry/fsimpl/sockfs/sockfs.go4
-rw-r--r--pkg/sentry/fsimpl/timerfd/timerfd.go6
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go12
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go73
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go31
-rw-r--r--pkg/sentry/fsimpl/verity/filesystem.go8
-rw-r--r--pkg/sentry/fsimpl/verity/verity.go7
-rw-r--r--pkg/sentry/mm/mm.go2
-rw-r--r--pkg/sentry/mm/mm_test.go3
-rw-r--r--pkg/sentry/mm/special_mappable.go9
-rw-r--r--pkg/sentry/mm/syscalls.go13
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go4
-rw-r--r--pkg/sentry/socket/netfilter/tcp_matcher.go32
-rw-r--r--pkg/sentry/socket/netfilter/udp_matcher.go32
-rw-r--r--pkg/sentry/socket/netstack/netstack.go5
-rw-r--r--pkg/sentry/socket/unix/transport/unix.go18
-rw-r--r--pkg/sentry/syscalls/linux/sys_mmap.go9
-rw-r--r--pkg/sentry/syscalls/linux/sys_sched.go2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/mmap.go12
48 files changed, 630 insertions, 254 deletions
diff --git a/pkg/sentry/devices/memdev/BUILD b/pkg/sentry/devices/memdev/BUILD
index abe58f818..4c8604d58 100644
--- a/pkg/sentry/devices/memdev/BUILD
+++ b/pkg/sentry/devices/memdev/BUILD
@@ -18,9 +18,10 @@ go_library(
"//pkg/rand",
"//pkg/safemem",
"//pkg/sentry/fsimpl/devtmpfs",
+ "//pkg/sentry/fsimpl/tmpfs",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/mm",
- "//pkg/sentry/pgalloc",
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/devices/memdev/zero.go b/pkg/sentry/devices/memdev/zero.go
index 2e631a252..60cfea888 100644
--- a/pkg/sentry/devices/memdev/zero.go
+++ b/pkg/sentry/devices/memdev/zero.go
@@ -16,9 +16,10 @@ package memdev
import (
"gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/mm"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -79,11 +80,22 @@ func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64,
// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
- m, err := mm.NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
+ if opts.Private || !opts.MaxPerms.Write {
+ // This mapping will never permit writing to the "underlying file" (in
+ // Linux terms, it isn't VM_SHARED), so implement it as an anonymous
+ // mapping, but back it with fd; this is what Linux does, and is
+ // actually application-visible because the resulting VMA will show up
+ // in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather than
+ // "/dev/zero (deleted)".
+ opts.Offset = 0
+ opts.MappingIdentity = &fd.vfsfd
+ opts.MappingIdentity.IncRef()
+ return nil
+ }
+ tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length)
if err != nil {
return err
}
- opts.MappingIdentity = m
- opts.Mappable = m
- return nil
+ defer tmpfsFD.DecRef(ctx)
+ return tmpfsFD.ConfigureMMap(ctx, opts)
}
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index 8f5e45005..05e043583 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -143,8 +143,8 @@ ops can be implemented in parallel.
#### Basic I/O Implementation
-Currently we have implemented basic functionalities of read and write
-for our FUSE. We describe the design and ways to improve it here:
+Currently we have implemented basic functionalities of read and write for our
+FUSE. We describe the design and ways to improve it here:
##### Basic FUSE Read
@@ -153,38 +153,37 @@ The vfs2 expects implementations of `vfs.FileDescriptionImpl.Read()` and
reach our implementation of those interface functions located at
`pkg/sentry/fsimpl/fuse/regular_file.go` for regular files.
-After validation checks of the input, sentry sends `FUSE_READ` requests
-to the FUSE daemon. The FUSE daemon returns data after the `fuse_out_header`
-as the responses. For the first version, we create a copy in kernel memory
-of those data. They are represented as a byte slice in the marshalled
-struct. This happens as a common process for all the FUSE responses at this
-moment at `pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly
-copy from this intermediate buffer to the input buffer
-provided by the read syscall.
-
-There is an extra requirement for FUSE: When mounting the FUSE fs,
-the mounter or the FUSE daemon can specify a `max_read` or a `max_pages`
-parameter. They are the upperbound of the bytes to read in each
-`FUSE_READ` request. We implemented the code to handle the fragmented reads.
-
-To improve the performance: ideally we should have buffer cache to copy
-those data from the responses of FUSE daemon into, as is also the
-design of several other existing file system implementations for
-sentry, instead of a single-use temporary buffer. Directly mapping
-the memory of one process to another could also boost the performance,
-but to keep them isolated, we did not choose to do so.
+After validation checks of the input, sentry sends `FUSE_READ` requests to the
+FUSE daemon. The FUSE daemon returns data after the `fuse_out_header` as the
+responses. For the first version, we create a copy in kernel memory of those
+data. They are represented as a byte slice in the marshalled struct. This
+happens as a common process for all the FUSE responses at this moment at
+`pkg/sentry/fsimpl/fuse/dev.go:writeLocked()`. We then directly copy from this
+intermediate buffer to the input buffer provided by the read syscall.
+
+There is an extra requirement for FUSE: When mounting the FUSE fs, the mounter
+or the FUSE daemon can specify a `max_read` or a `max_pages` parameter. They are
+the upperbound of the bytes to read in each `FUSE_READ` request. We implemented
+the code to handle the fragmented reads.
+
+To improve the performance: ideally we should have buffer cache to copy those
+data from the responses of FUSE daemon into, as is also the design of several
+other existing file system implementations for sentry, instead of a single-use
+temporary buffer. Directly mapping the memory of one process to another could
+also boost the performance, but to keep them isolated, we did not choose to do
+so.
##### Basic FUSE Write
-The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()`
-and `vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor
-of FUSE when a user makes write(2) and pwrite(2) syscall.
+The vfs2 invokes implementations of `vfs.FileDescriptionImpl.Write()` and
+`vfs.FileDescriptionImpl.PWrite()` on the regular file descriptor of FUSE when a
+user makes write(2) and pwrite(2) syscall.
-For valid writes, sentry sends the bytes to write after a `FUSE_WRITE`
-header (can be regarded as a request with 2 payloads) to the FUSE daemon.
-For the first version, we allocate a buffer inside kernel memory to store
-the bytes from the user, and copy directly from that buffer to the memory
-of FUSE daemon. This happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`
+For valid writes, sentry sends the bytes to write after a `FUSE_WRITE` header
+(can be regarded as a request with 2 payloads) to the FUSE daemon. For the first
+version, we allocate a buffer inside kernel memory to store the bytes from the
+user, and copy directly from that buffer to the memory of FUSE daemon. This
+happens at `pkg/sentry/fsimpl/fuse/dev.go:readLocked()`
The parameters `max_write` and `max_pages` restrict the number of bytes in one
`FUSE_WRITE`. There are code handling fragmented writes in current
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index dcf1ee25b..f0f2e0be7 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -237,7 +237,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
return offset, nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *rootInode) DecRef(context.Context) {
i.rootInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 812171fa3..bb0bf3a07 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,7 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
// notification (eventfd). Eventfds are usually internal to the Sentry but in
// certain situations they may be converted into a host-backed eventfd.
type EventFileDescription struct {
@@ -106,7 +106,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
return efd.hostfd, nil
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (efd *EventFileDescription) Release(context.Context) {
efd.mu.Lock()
defer efd.mu.Unlock()
@@ -119,7 +119,7 @@ func (efd *EventFileDescription) Release(context.Context) {
}
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
if dst.NumBytes() < 8 {
return 0, syscall.EINVAL
@@ -130,7 +130,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
return 8, nil
}
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
if src.NumBytes() < 8 {
return 0, syscall.EINVAL
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 075de0e22..a4a6d8c55 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -490,7 +490,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
_, inode, err := fs.walk(ctx, rp, false)
if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..f33592d59 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -61,7 +61,7 @@ func (in *inode) isSymlink() bool {
return ok
}
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
// vfs.FileDescriptionImpl. which may only be used if open options contains
// O_PATH. For this reason most of the functions return EBADF.
type symlinkFD struct {
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index 5539466ff..f690ef5ad 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -95,9 +95,14 @@ type DeviceFD struct {
}
// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
+func (fd *DeviceFD) Release(ctx context.Context) {
if fd.fs != nil {
+ fd.fs.conn.mu.Lock()
fd.fs.conn.connected = false
+ fd.fs.conn.mu.Unlock()
+
+ fd.fs.VFSFilesystem().DecRef(ctx)
+ fd.fs = nil
}
}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
index ee77dbaa3..8f220a04b 100644
--- a/pkg/sentry/fsimpl/fuse/directory.go
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -35,27 +35,27 @@ func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) e
return syserror.EISDIR
}
-// PRead implements FileDescriptionImpl.PRead.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.EISDIR
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
return 0, syserror.EISDIR
}
-// PWrite implements FileDescriptionImpl.PWrite.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
return 0, syserror.EISDIR
}
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
return 0, syserror.EISDIR
}
-// IterDirents implements FileDescriptionImpl.IterDirents.
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
fusefs := dir.inode().fs
task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index f1ffd2343..b3573f80d 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -218,6 +218,7 @@ func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
conn: conn,
}
+ fs.VFSFilesystem().IncRef()
fuseFD.fs = fs
return fs, nil
@@ -734,7 +735,7 @@ func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptio
return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *inode) DecRef(context.Context) {
i.inodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 4a592175b..97b9165cc 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -1311,6 +1311,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
if !renamed.isDir() {
return syserror.EISDIR
}
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
} else {
if rp.MustBeDir() || renamed.isDir() {
return syserror.ENOTDIR
@@ -1361,14 +1364,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
// with reference counts and queue oldParent for checkCachingLocked if the
// parent isn't actually changing.
if oldParent != newParent {
+ oldParent.decRefLocked()
ds = appendDentry(ds, oldParent)
newParent.IncRef()
if renamed.isSynthetic() {
oldParent.syntheticChildren--
newParent.syntheticChildren++
}
+ renamed.parent = newParent
}
- renamed.parent = newParent
renamed.name = newName
if newParent.children == nil {
newParent.children = make(map[string]*dentry)
@@ -1491,7 +1495,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return fs.unlinkAt(ctx, rp, false /* dir */)
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index c4551562a..db8536f26 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -139,12 +139,12 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
// filesystemType implements vfs.FilesystemType.
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("host.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
func (filesystemType) Name() string {
return "none"
}
@@ -243,7 +243,7 @@ type inode struct {
pf inodePlatformFile
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -252,7 +252,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
}
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
func (i *inode) Mode() linux.FileMode {
var s syscall.Stat_t
if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -263,7 +263,7 @@ func (i *inode) Mode() linux.FileMode {
return linux.FileMode(s.Mode)
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
if opts.Mask&linux.STATX__RESERVED != 0 {
return linux.Statx{}, syserror.EINVAL
@@ -376,7 +376,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
}, nil
}
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
s := &opts.Stat
@@ -435,17 +435,17 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
return nil
}
-// IncRef implements kernfs.Inode.
+// IncRef implements kernfs.Inode.IncRef.
func (i *inode) IncRef() {
i.refs.IncRef()
}
-// TryIncRef implements kernfs.Inode.
+// TryIncRef implements kernfs.Inode.TryIncRef.
func (i *inode) TryIncRef() bool {
return i.refs.TryIncRef()
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *inode) DecRef(ctx context.Context) {
i.refs.DecRef(func() {
if i.wouldBlock {
@@ -457,7 +457,7 @@ func (i *inode) DecRef(ctx context.Context) {
})
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
if i.Mode().FileType() == linux.S_IFSOCK {
@@ -542,28 +542,28 @@ type fileDescription struct {
offset int64
}
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
creds := auth.CredentialsFromContext(ctx)
return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
}
-// Stat implements vfs.FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
}
-// Release implements vfs.FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (f *fileDescription) Release(context.Context) {
// noop
}
-// Allocate implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
}
-// PRead implements FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -573,7 +573,7 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
}
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -610,7 +610,7 @@ func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, off
return int64(n), err
}
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
if !f.inode.seekable {
return 0, syserror.ESPIPE
@@ -619,7 +619,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
return f.writeToHostFD(ctx, src, offset, opts.Flags)
}
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
i := f.inode
if !i.seekable {
@@ -667,7 +667,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque
return int64(n), err
}
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
//
// Note that we do not support seeking on directories, since we do not even
// allow directory fds to be imported at all.
@@ -732,13 +732,13 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
return f.offset, nil
}
-// Sync implements FileDescriptionImpl.
+// Sync implements vfs.FileDescriptionImpl.Sync.
func (f *fileDescription) Sync(context.Context) error {
// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
return unix.Fsync(f.inode.hostFD)
}
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
if !f.inode.canMap {
return syserror.ENODEV
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 97cefa350..e02b9b8f6 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -76,7 +76,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
t.fileDescription.Release(ctx)
}
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -94,7 +94,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
return t.fileDescription.PRead(ctx, dst, offset, opts)
}
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
//
// Reading from a TTY is only allowed for foreground process groups. Background
// process groups will either get EIO or a SIGTTIN.
@@ -112,7 +112,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
return t.fileDescription.Read(ctx, dst, opts)
}
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -127,7 +127,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.PWrite(ctx, src, offset, opts)
}
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
t.mu.Lock()
defer t.mu.Unlock()
@@ -142,7 +142,7 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
return t.fileDescription.Write(ctx, src, opts)
}
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
task := kernel.TaskFromContext(ctx)
if task == nil {
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index c659436ac..89ed265dc 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -801,7 +801,7 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index c2109cf76..6ee353ace 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -594,7 +594,7 @@ func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint3
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
if err != nil {
@@ -608,7 +608,7 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (s *StaticDirectory) DecRef(context.Context) {
s.StaticDirectoryRefs.DecRef(s.Destroy)
}
@@ -616,7 +616,7 @@ func (s *StaticDirectory) DecRef(context.Context) {
// AlwaysValid partially implements kernfs.inodeDynamicLookup.
type AlwaysValid struct{}
-// Valid implements kernfs.inodeDynamicLookup.
+// Valid implements kernfs.inodeDynamicLookup.Valid.
func (*AlwaysValid) Valid(context.Context) bool {
return true
}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index a9812fcef..443121c99 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -51,7 +51,7 @@ func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor
s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
}
-// Readlink implements Inode.
+// Readlink implements Inode.Readlink.
func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
return s.target, nil
}
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index 360b77ef6..73b126669 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -92,7 +92,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
}
if err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
}
if d.upperVD.Ok() {
d.upperVD.DecRef(ctx)
@@ -362,7 +362,7 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error {
// There are no guarantees as to the contents of lowerXattrs.
return nil
}
- ctx.Warningf("failed to copy up xattrs because ListXattrAt failed: %v", err)
+ ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
return err
}
@@ -374,14 +374,48 @@ func (d *dentry) copyXattrsLocked(ctx context.Context) error {
value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
if err != nil {
- ctx.Warningf("failed to copy up xattrs because GetXattrAt failed: %v", err)
+ ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
return err
}
if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
- ctx.Warningf("failed to copy up xattrs because SetXattrAt failed: %v", err)
+ ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
return err
}
}
return nil
}
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+ dirents, err := d.getDirentsLocked(ctx)
+ if err != nil {
+ return err
+ }
+ for _, dirent := range dirents {
+ if dirent.Name == "." || dirent.Name == ".." {
+ continue
+ }
+ child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+ if err != nil {
+ return err
+ }
+ if err := child.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ if child.isDir() {
+ child.dirMu.Lock()
+ err := child.copyUpDescendantsLocked(ctx, ds)
+ child.dirMu.Unlock()
+ if err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index b1b292e83..7ab42e71e 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -143,7 +143,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
defer d.fs.renameMu.RUnlock()
d.dirMu.Lock()
defer d.dirMu.Unlock()
+ return d.getDirentsLocked(ctx)
+}
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
if d.dirents != nil {
return d.dirents, nil
}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index b530851c5..e9ce4bde1 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,7 @@
package overlay
import (
+ "fmt"
"strings"
"sync/atomic"
@@ -512,7 +513,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil
func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
}
}
@@ -624,7 +625,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
}
@@ -663,7 +664,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -678,7 +679,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
Value: "y",
}); err != nil {
if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
} else {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -722,7 +723,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -942,7 +943,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -953,7 +954,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
child, err := fs.getChildLocked(ctx, parent, childName, ds)
if err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -1019,9 +1020,223 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
}
defer mnt.EndWrite()
- // FIXME(gvisor.dev/issue/1199): Actually implement rename.
- _ = newParent
- return syserror.EXDEV
+ oldParent := oldParentVD.Dentry().Impl().(*dentry)
+ creds := rp.Credentials()
+ if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ // We need a dentry representing the renamed file since, if it's a
+ // directory, we need to check for write permission on it.
+ oldParent.dirMu.Lock()
+ defer oldParent.dirMu.Unlock()
+ renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+ if err != nil {
+ return err
+ }
+ if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+ return err
+ }
+ if renamed.isDir() {
+ if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+ return syserror.EINVAL
+ }
+ if oldParent != newParent {
+ if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+ return err
+ }
+ }
+ } else {
+ if opts.MustBeDir || rp.MustBeDir() {
+ return syserror.ENOTDIR
+ }
+ }
+
+ if oldParent != newParent {
+ if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+ return err
+ }
+ newParent.dirMu.Lock()
+ defer newParent.dirMu.Unlock()
+ }
+ if newParent.vfsd.IsDead() {
+ return syserror.ENOENT
+ }
+ replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+ if err != nil {
+ return err
+ }
+ var (
+ replaced *dentry
+ replacedVFSD *vfs.Dentry
+ whiteouts map[string]bool
+ )
+ if replacedLayer.existsInOverlay() {
+ replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+ if err != nil {
+ return err
+ }
+ replacedVFSD = &replaced.vfsd
+ if replaced.isDir() {
+ if !renamed.isDir() {
+ return syserror.EISDIR
+ }
+ if genericIsAncestorDentry(replaced, renamed) {
+ return syserror.ENOTEMPTY
+ }
+ replaced.dirMu.Lock()
+ defer replaced.dirMu.Unlock()
+ whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+ if err != nil {
+ return err
+ }
+ } else {
+ if rp.MustBeDir() || renamed.isDir() {
+ return syserror.ENOTDIR
+ }
+ }
+ }
+
+ if oldParent == newParent && oldName == newName {
+ return nil
+ }
+
+ // renamed and oldParent need to be copied-up before they're renamed on the
+ // upper layer.
+ if err := renamed.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If renamed is a directory, all of its descendants need to be copied-up
+ // before they're renamed on the upper layer.
+ if renamed.isDir() {
+ if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+ return err
+ }
+ }
+ // newParent must be copied-up before it can contain renamed on the upper
+ // layer.
+ if err := newParent.copyUpLocked(ctx); err != nil {
+ return err
+ }
+ // If replaced exists, it doesn't need to be copied-up, but we do need to
+ // serialize with copy-up. Holding renameMu for writing should be
+ // sufficient, but out of an abundance of caution...
+ if replaced != nil {
+ replaced.copyMu.RLock()
+ defer replaced.copyMu.RUnlock()
+ }
+
+ vfsObj := rp.VirtualFilesystem()
+ mntns := vfs.MountNamespaceFromContext(ctx)
+ defer mntns.DecRef(ctx)
+ if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+ return err
+ }
+
+ newpop := vfs.PathOperation{
+ Root: newParent.upperVD,
+ Start: newParent.upperVD,
+ Path: fspath.Parse(newName),
+ }
+
+ needRecreateWhiteouts := false
+ cleanupRecreateWhiteouts := func() {
+ if !needRecreateWhiteouts {
+ return
+ }
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil && err != syserror.EEXIST {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+ }
+ }
+ }
+ if renamed.isDir() {
+ if replacedLayer == lookupLayerUpper {
+ // Remove whiteouts from the directory being replaced.
+ needRecreateWhiteouts = true
+ for whiteoutName, whiteoutUpper := range whiteouts {
+ if !whiteoutUpper {
+ continue
+ }
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+ Root: replaced.upperVD,
+ Start: replaced.upperVD,
+ Path: fspath.Parse(whiteoutName),
+ }); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ } else if replacedLayer == lookupLayerUpperWhiteout {
+ // We need to explicitly remove the whiteout since otherwise rename
+ // on the upper layer will fail with ENOTDIR.
+ if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+ }
+ }
+
+ // Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+ // regular rename and create the whiteout at the origin manually. Unlike
+ // RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+ // upper filesystem, but this is already the case for virtually all other
+ // overlay filesystem operations too.
+ oldpop := vfs.PathOperation{
+ Root: oldParent.upperVD,
+ Start: oldParent.upperVD,
+ Path: fspath.Parse(oldName),
+ }
+ if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
+ cleanupRecreateWhiteouts()
+ vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+ return err
+ }
+
+ // Below this point, the renamed dentry is now at newpop, and anything we
+ // replaced is gone forever. Commit the rename, update the overlay
+ // filesystem tree, and abandon attempts to recover from errors.
+ vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+ delete(oldParent.children, oldName)
+ if replaced != nil {
+ ds = appendDentry(ds, replaced)
+ }
+ if oldParent != newParent {
+ newParent.dirents = nil
+ // This can't drop the last reference on oldParent because one is held
+ // by oldParentVD, so lock recursion is impossible.
+ oldParent.DecRef(ctx)
+ ds = appendDentry(ds, oldParent)
+ newParent.IncRef()
+ renamed.parent = newParent
+ }
+ renamed.name = newName
+ if newParent.children == nil {
+ newParent.children = make(map[string]*dentry)
+ }
+ newParent.children[newName] = renamed
+ oldParent.dirents = nil
+
+ if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+ }
+ if renamed.isDir() {
+ if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+ Name: _OVL_XATTR_OPAQUE,
+ Value: "y",
+ }); err != nil {
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+ }
+ }
+
+ return nil
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
@@ -1100,7 +1315,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
Start: child.upperVD,
Path: fspath.Parse(whiteoutName),
}); err != nil && err != syserror.EEXIST {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
}
}
}
@@ -1130,9 +1345,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
// Don't attempt to recover from this: the original directory is
// already gone, so any dentries representing it are invalid, and
// creating a new directory won't undo that.
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
- vfsObj.AbortDeleteDentry(&child.vfsd)
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
}
vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
@@ -1246,7 +1459,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
},
}); err != nil {
if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
} else if haveUpperWhiteout {
fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
}
@@ -1339,11 +1552,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
}
}
if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
- ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
- if child != nil {
- vfsObj.AbortDeleteDentry(&child.vfsd)
- }
- return err
+ panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
}
if child != nil {
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index b2efe5f80..d0d26185e 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -111,16 +111,16 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
fsoptsRaw := opts.InternalData
fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
if fsoptsRaw != nil && !haveFSOpts {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
return nil, nil, syserror.EINVAL
}
if haveFSOpts {
if len(fsopts.LowerRoots) == 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
return nil, nil, syserror.EINVAL
}
if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
return nil, nil, syserror.EINVAL
}
// We don't enforce a maximum number of lower layers when not
@@ -137,7 +137,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
delete(mopts, "workdir")
upperPath := fspath.Parse(upperPathname)
if !upperPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
return nil, nil, syserror.EINVAL
}
upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -149,13 +149,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer upperRoot.DecRef(ctx)
privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
return nil, nil, err
}
defer privateUpperRoot.DecRef(ctx)
@@ -163,24 +163,24 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
lowerPathnamesStr, ok := mopts["lowerdir"]
if !ok {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
return nil, nil, syserror.EINVAL
}
delete(mopts, "lowerdir")
lowerPathnames := strings.Split(lowerPathnamesStr, ":")
const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
return nil, nil, syserror.EINVAL
}
if len(lowerPathnames) > maxLowerLayers {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
return nil, nil, syserror.EINVAL
}
for _, lowerPathname := range lowerPathnames {
lowerPath := fspath.Parse(lowerPathname)
if !lowerPath.Absolute {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
return nil, nil, syserror.EINVAL
}
lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -192,13 +192,13 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
CheckSearchable: true,
})
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer lowerRoot.DecRef(ctx)
privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
if err != nil {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
return nil, nil, err
}
defer privateLowerRoot.DecRef(ctx)
@@ -206,7 +206,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
}
}
if len(mopts) != 0 {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
return nil, nil, syserror.EINVAL
}
@@ -279,7 +279,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return nil, nil, syserror.EREMOTE
}
if isWhiteout(&rootStat) {
- ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+ ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
root.destroyLocked(ctx)
fs.vfsfs.DecRef(ctx)
return nil, nil, syserror.EINVAL
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index d57d94dbc..57f026040 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -68,7 +68,7 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
tid, err := strconv.ParseUint(name, 10, 32)
if err != nil {
@@ -87,7 +87,7 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
return subTaskDentry.VFSDentry(), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
if len(tasks) == 0 {
@@ -155,7 +155,7 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
return fd.GenericDirectoryFD.SetStat(ctx, opts)
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &subtasksFD{task: i.task}
if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
@@ -169,7 +169,7 @@ func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *v
return fd.VFSFileDescription(), nil
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
if err != nil {
@@ -181,12 +181,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
return stat, nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *subtasksInode) DecRef(context.Context) {
i.subtasksInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index dbdb5d929..e24c8a031 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -106,7 +106,7 @@ func (i *taskInode) Valid(ctx context.Context) bool {
return i.task.ExitState() != kernel.TaskExitDead
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
@@ -117,12 +117,12 @@ func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
return fd.VFSFileDescription(), nil
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *taskInode) DecRef(context.Context) {
i.taskInodeRefs.DecRef(i.Destroy)
}
@@ -168,7 +168,7 @@ func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.
return d
}
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
stat, err := i.Inode.Stat(ctx, fs, opts)
if err != nil {
@@ -186,7 +186,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.
return stat, nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
mode := i.Mode()
uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index 47d744a57..c492bcfa7 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -62,7 +62,7 @@ type fdDir struct {
produceSymlink bool
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
var fds []int32
i.task.WithMuLocked(func(t *kernel.Task) {
@@ -135,7 +135,7 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
@@ -149,7 +149,7 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return taskDentry.VFSDentry(), nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
@@ -160,7 +160,7 @@ func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.
return fd.VFSFileDescription(), nil
}
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
//
// This is to match Linux, which uses a special permission handler to guarantee
// that a process can still access /proc/self/fd after it has executed
@@ -182,7 +182,7 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
return err
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *fdDirInode) DecRef(context.Context) {
i.fdDirInodeRefs.DecRef(i.Destroy)
}
@@ -269,7 +269,7 @@ func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
return dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
fdInt, err := strconv.ParseInt(name, 10, 32)
if err != nil {
@@ -287,7 +287,7 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
return dentry.VFSDentry(), nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
@@ -298,7 +298,7 @@ func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
return fd.VFSFileDescription(), nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *fdInfoDirInode) DecRef(context.Context) {
i.fdInfoDirInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 87726d8f7..8f7e9b801 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -667,7 +667,7 @@ func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentr
return d
}
-// Readlink implements kernfs.Inode.
+// Readlink implements kernfs.Inode.Readlink.
func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
if !kernel.ContextCanTrace(ctx, s.task, false) {
return "", syserror.EACCES
@@ -807,7 +807,7 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
return d
}
-// Readlink implements Inode.
+// Readlink implements kernfs.Inode.Readlink.
func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
if err := checkTaskState(s.task); err != nil {
return "", err
@@ -815,7 +815,7 @@ func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string
return s.StaticSymlink.Readlink(ctx, mnt)
}
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
if err := checkTaskState(s.task); err != nil {
return vfs.VirtualDentry{}, "", err
@@ -852,7 +852,7 @@ func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32
i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}
-// Open implements Inode.Open.
+// Open implements kernfs.Inode.Open.
func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd := &namespaceFD{inode: i}
i.IncRef()
@@ -875,20 +875,20 @@ type namespaceFD struct {
var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
return fd.inode.Stat(ctx, vfs, opts)
}
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
creds := auth.CredentialsFromContext(ctx)
return fd.inode.SetStat(ctx, vfs, creds, opts)
}
-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
func (fd *namespaceFD) Release(ctx context.Context) {
fd.inode.DecRef(ctx)
}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index 4e69782c7..1607eac19 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -660,7 +660,7 @@ func sprintSlice(s []uint64) string {
return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
}
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
types := []interface{}{
&inet.StatSNMPIP{},
@@ -709,7 +709,7 @@ type netRouteData struct {
var _ dynamicInode = (*netRouteData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +773,7 @@ type netStatData struct {
var _ dynamicInode = (*netStatData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 3ea00ab87..6d60acc30 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -98,7 +98,7 @@ func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace
return inode, dentry
}
-// Lookup implements kernfs.inodeDynamicLookup.
+// Lookup implements kernfs.inodeDynamicLookup.Lookup.
func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
// Try to lookup a corresponding task.
tid, err := strconv.ParseUint(name, 10, 64)
@@ -122,7 +122,7 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
return taskDentry.VFSDentry(), nil
}
-// IterDirents implements kernfs.inodeDynamicLookup.
+// IterDirents implements kernfs.inodeDynamicLookup.IterDirents.
func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
const FIRST_PROCESS_ENTRY = 256
@@ -200,7 +200,7 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
return maxTaskID, nil
}
-// Open implements kernfs.Inode.
+// Open implements kernfs.Inode.Open.
func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
SeekEnd: kernfs.SeekEndZero,
@@ -229,7 +229,7 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
return stat, nil
}
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
func (i *tasksInode) DecRef(context.Context) {
i.tasksInodeRefs.DecRef(i.Destroy)
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 68c541046..459a8e52e 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -69,7 +69,7 @@ func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualD
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
@@ -113,7 +113,7 @@ func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.Vi
return vfs.VirtualDentry{}, target, err
}
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
return syserror.EPERM
}
@@ -125,7 +125,7 @@ type dynamicBytesFileSetAttr struct {
kernfs.DynamicBytesFile
}
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 038a194c7..9e0966efe 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -174,7 +174,7 @@ type tcpSackData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
if d.enabled == nil {
sack, err := d.stack.TCPSACKEnabled()
@@ -232,7 +232,7 @@ type tcpRecoveryData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
recovery, err := d.stack.TCPRecovery()
if err != nil {
@@ -284,7 +284,7 @@ type tcpMemData struct {
var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
d.mu.Lock()
defer d.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 6297e1df4..3c02af8c9 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -26,7 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
type SignalFileDescription struct {
vfsfd vfs.FileDescription
vfs.FileDescriptionDefaultImpl
@@ -83,7 +83,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
sfd.mask = mask
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
// Attempt to dequeue relevant signals.
info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -132,5 +132,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
sfd.target.SignalUnregister(entry)
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index 94a998568..80b41aa9e 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -30,12 +30,12 @@ import (
// filesystemType implements vfs.FilesystemType.
type filesystemType struct{}
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
panic("sockfs.filesystemType.GetFilesystem should never be called")
}
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
//
// Note that registering sockfs is unnecessary, except for the fact that it
// will not show up under /proc/filesystems as a result. This is a very minor
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 86beaa0a8..ac8a4e3bb 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,7 +26,7 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
type TimerFileDescription struct {
vfsfd vfs.FileDescription
@@ -62,7 +62,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock,
return &tfd.vfsfd, nil
}
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
const sizeofUint64 = 8
if dst.NumBytes() < sizeofUint64 {
@@ -128,7 +128,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
tfd.timer.Resume()
}
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
func (tfd *TimerFileDescription) Release(context.Context) {
tfd.timer.Destroy()
}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 61d925a73..1362c1602 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -770,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return nil
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
@@ -865,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
}
if d.parent == nil {
if d.name != "" {
- // This must be an anonymous memfd file.
+ // This file must have been created by
+ // newUnlinkedRegularFileDescription(). In Linux,
+ // mm/shmem.c:__shmem_file_setup() =>
+ // fs/file_table.c:alloc_file_pseudo() sets the created
+ // dentry's dentry_operations to anon_ops, for which d_dname ==
+ // simple_dname. fs/d_path.c:simple_dname() defines the
+ // dentry's pathname to be its name, prefixed with "/" and
+ // suffixed with " (deleted)".
b.PrependComponent("/" + d.name)
+ b.AppendString(" (deleted)")
return vfs.PrependPathSyntheticError{}
}
return vfs.PrependPathAtNonMountRootError{}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 0710b65db..b8699d064 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -42,6 +42,10 @@ type regularFile struct {
// memFile is a platform.File used to allocate pages to this regularFile.
memFile *pgalloc.MemoryFile
+ // memoryUsageKind is the memory accounting category under which pages backing
+ // this regularFile's contents are accounted.
+ memoryUsageKind usage.MemoryKind
+
// mapsMu protects mappings.
mapsMu sync.Mutex `state:"nosave"`
@@ -86,14 +90,75 @@ type regularFile struct {
func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
file := &regularFile{
- memFile: fs.memFile,
- seals: linux.F_SEAL_SEAL,
+ memFile: fs.memFile,
+ memoryUsageKind: usage.Tmpfs,
+ seals: linux.F_SEAL_SEAL,
}
file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
return &file.inode
}
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+ fs, ok := mount.Filesystem().Impl().(*filesystem)
+ if !ok {
+ panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+ }
+
+ inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+ d := fs.newDentry(inode)
+ defer d.DecRef(ctx)
+ d.name = name
+
+ fd := &regularFileFD{}
+ fd.Init(&inode.locks)
+ flags := uint32(linux.O_RDWR)
+ if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+ // Compare mm/shmem.c:shmem_zero_setup().
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+ if err != nil {
+ return nil, err
+ }
+ rf := fd.inode().impl.(*regularFile)
+ rf.memoryUsageKind = usage.Anonymous
+ rf.size = size
+ return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+ fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+ if err != nil {
+ return nil, err
+ }
+ if allowSeals {
+ fd.inode().impl.(*regularFile).seals = 0
+ }
+ return &fd.vfsfd, nil
+}
+
// truncate grows or shrinks the file to the given size. It returns true if the
// file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -226,7 +291,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
optional.End = pgend
}
- cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+ cerr := rf.data.Fill(ctx, required, optional, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
// Newly-allocated pages are zeroed, so we don't need to do anything.
return dsts.NumBytes(), nil
})
@@ -575,7 +640,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
case gap.Ok():
// Allocate memory for the write.
gapMR := gap.Range().Intersect(pgMR)
- fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+ fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
if err != nil {
retErr = err
goto exitLoop
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 4871e55d3..4658e1533 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -746,37 +746,6 @@ func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
return nil
}
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
- fs, ok := mount.Filesystem().Impl().(*filesystem)
- if !ok {
- panic("NewMemfd() called with non-tmpfs mount")
- }
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
- // S_IRWXUGO.
- inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
- rf := inode.impl.(*regularFile)
- if allowSeals {
- rf.seals = 0
- }
-
- d := fs.newDentry(inode)
- defer d.DecRef(ctx)
- d.name = name
-
- // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
- // FMODE_READ | FMODE_WRITE.
- var fd regularFileFD
- fd.Init(&inode.locks)
- flags := uint32(linux.O_RDWR)
- if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
- return nil, err
- }
- return &fd.vfsfd, nil
-}
-
// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index f86a6a0b2..26b117ca4 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -185,8 +185,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
Start: child.lowerMerkleVD,
}, &vfs.GetXattrOptions{
Name: merkleOffsetInParentXattr,
- // Offset is a 32 bit integer.
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
@@ -227,7 +226,7 @@ func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *de
// the size of all its children's root hashes.
dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
@@ -372,6 +371,7 @@ func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry,
Path: fspath.Parse(childMerkleFilename),
}, &vfs.OpenOptions{
Flags: linux.O_RDWR | linux.O_CREAT,
+ Mode: 0644,
})
if err != nil {
return nil, err
@@ -807,7 +807,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
return syserror.EROFS
}
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
var ds *[]*dentry
fs.renameMu.RLock()
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index 3e0bcd02b..9182df317 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -57,8 +57,9 @@ const merkleOffsetInParentXattr = "user.merkle.offset"
// whole file. For a directory, it's the size of all its children's root hashes.
const merkleSizeXattr = "user.merkle.size"
-// sizeOfInt32 is the size in bytes for a 32 bit integer in extended attributes.
-const sizeOfInt32 = 4
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
// noCrashOnVerificationFailure indicates whether the sandbox should panic
// whenever verification fails. If true, an error is returned instead of
@@ -636,7 +637,7 @@ func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, of
// dataSize is the size of the whole file.
dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
Name: merkleSizeXattr,
- Size: sizeOfInt32,
+ Size: sizeOfStringInt32,
})
// The Merkle tree file for the child should have been created and
diff --git a/pkg/sentry/mm/mm.go b/pkg/sentry/mm/mm.go
index 3e85964e4..8c9f11cce 100644
--- a/pkg/sentry/mm/mm.go
+++ b/pkg/sentry/mm/mm.go
@@ -242,7 +242,7 @@ type MemoryManager struct {
// +stateify savable
type vma struct {
// mappable is the virtual memory object mapped by this vma. If mappable is
- // nil, the vma represents a private anonymous mapping.
+ // nil, the vma represents an anonymous mapping.
mappable memmap.Mappable
// off is the offset into mappable at which this vma begins. If mappable is
diff --git a/pkg/sentry/mm/mm_test.go b/pkg/sentry/mm/mm_test.go
index fdc308542..acac3d357 100644
--- a/pkg/sentry/mm/mm_test.go
+++ b/pkg/sentry/mm/mm_test.go
@@ -51,7 +51,8 @@ func TestUsageASUpdates(t *testing.T) {
defer mm.DecUsers(ctx)
addr, err := mm.MMap(ctx, memmap.MMapOpts{
- Length: 2 * usermem.PageSize,
+ Length: 2 * usermem.PageSize,
+ Private: true,
})
if err != nil {
t.Fatalf("MMap got err %v want nil", err)
diff --git a/pkg/sentry/mm/special_mappable.go b/pkg/sentry/mm/special_mappable.go
index f4c93baeb..2dbe5b751 100644
--- a/pkg/sentry/mm/special_mappable.go
+++ b/pkg/sentry/mm/special_mappable.go
@@ -136,9 +136,12 @@ func (m *SpecialMappable) Length() uint64 {
// NewSharedAnonMappable returns a SpecialMappable that implements the
// semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero.
//
-// TODO(jamieliu): The use of SpecialMappable is a lazy code reuse hack. Linux
-// uses an ephemeral file created by mm/shmem.c:shmem_zero_setup(); we should
-// do the same to get non-zero device and inode IDs.
+// TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by
+// mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses
+// a SpecialMappable instead, incorrectly getting device and inode IDs of zero
+// and causing memory for shared anonymous mappings to be allocated up-front
+// instead of on first touch; this is to avoid exacerbating the fs.MountSource
+// leak (b/143656263). Delete this function along with VFS1.
func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) {
if length == 0 {
return nil, syserror.EINVAL
diff --git a/pkg/sentry/mm/syscalls.go b/pkg/sentry/mm/syscalls.go
index 4c9a575e7..a2555ba1a 100644
--- a/pkg/sentry/mm/syscalls.go
+++ b/pkg/sentry/mm/syscalls.go
@@ -24,7 +24,6 @@ import (
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -93,18 +92,6 @@ func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (userme
}
} else {
opts.Offset = 0
- if !opts.Private {
- if opts.MappingIdentity != nil {
- return 0, syserror.EINVAL
- }
- m, err := NewSharedAnonMappable(opts.Length, pgalloc.MemoryFileProviderFromContext(ctx))
- if err != nil {
- return 0, err
- }
- defer m.DecRef(ctx)
- opts.MappingIdentity = m
- opts.Mappable = m
- }
}
if opts.Addr.RoundDown() != opts.Addr {
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 3e1735079..871ea80ee 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -146,6 +146,10 @@ func SetEntries(stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error {
case stack.FilterTable:
table = stack.EmptyFilterTable()
case stack.NATTable:
+ if ipv6 {
+ nflog("IPv6 redirection not yet supported (gvisor.dev/issue/3549)")
+ return syserr.ErrInvalidArgument
+ }
table = stack.EmptyNATTable()
default:
nflog("we don't yet support writing to the %q table (gvisor.dev/issue/170)", replace.Name.String())
diff --git a/pkg/sentry/socket/netfilter/tcp_matcher.go b/pkg/sentry/socket/netfilter/tcp_matcher.go
index 0bfd6c1f4..844acfede 100644
--- a/pkg/sentry/socket/netfilter/tcp_matcher.go
+++ b/pkg/sentry/socket/netfilter/tcp_matcher.go
@@ -97,17 +97,33 @@ func (*TCPMatcher) Name() string {
// Match implements Matcher.Match.
func (tm *TCPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
+ // TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
+ // into the stack.Check codepath as matchers are added.
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
+ }
- if netHeader.TransportProtocol() != header.TCPProtocolNumber {
- return false, false
- }
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.TCPProtocolNumber {
+ return false, false
}
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netfilter/udp_matcher.go b/pkg/sentry/socket/netfilter/udp_matcher.go
index 7ed05461d..63201201c 100644
--- a/pkg/sentry/socket/netfilter/udp_matcher.go
+++ b/pkg/sentry/socket/netfilter/udp_matcher.go
@@ -94,19 +94,33 @@ func (*UDPMatcher) Name() string {
// Match implements Matcher.Match.
func (um *UDPMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
- netHeader := header.IPv4(pkt.NetworkHeader().View())
-
// TODO(gvisor.dev/issue/170): Proto checks should ultimately be moved
// into the stack.Check codepath as matchers are added.
- if netHeader.TransportProtocol() != header.UDPProtocolNumber {
- return false, false
- }
+ switch pkt.NetworkProtocolNumber {
+ case header.IPv4ProtocolNumber:
+ netHeader := header.IPv4(pkt.NetworkHeader().View())
+ if netHeader.TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
- // We dont't match fragments.
- if frag := netHeader.FragmentOffset(); frag != 0 {
- if frag == 1 {
- return false, true
+ // We don't match fragments.
+ if frag := netHeader.FragmentOffset(); frag != 0 {
+ if frag == 1 {
+ return false, true
+ }
+ return false, false
}
+
+ case header.IPv6ProtocolNumber:
+ // As in Linux, we do not perform an IPv6 fragment check. See
+ // xt_action_param.fragoff in
+ // include/linux/netfilter/x_tables.h.
+ if header.IPv6(pkt.NetworkHeader().View()).TransportProtocol() != header.UDPProtocolNumber {
+ return false, false
+ }
+
+ default:
+ // We don't know the network protocol.
return false, false
}
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 33edb1b6f..6fede181a 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -158,6 +158,9 @@ var Metrics = tcpip.Stats{
OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Total number of IP packets which failed to write to a link-layer endpoint."),
MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Total number of IP packets which failed IP header validation checks."),
MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Total number of IP fragments which failed IP fragment validation checks."),
+ IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Total number of IP packets dropped in the Prerouting chain."),
+ IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Total number of IP packets dropped in the Input chain."),
+ IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Total number of IP packets dropped in the Output chain."),
},
TCP: tcpip.TCPStats{
ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via Connect."),
@@ -1185,7 +1188,7 @@ func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, fam
var v tcpip.LingerOption
var linger linux.Linger
if err := ep.GetSockOpt(&v); err != nil {
- return &linger, nil
+ return nil, syserr.TranslateNetstackError(err)
}
if v.Enabled {
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 08504560c..d6fc03520 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -746,6 +746,9 @@ type baseEndpoint struct {
// path is not empty if the endpoint has been bound,
// or may be used if the endpoint is connected.
path string
+
+ // linger is used for SO_LINGER socket option.
+ linger tcpip.LingerOption
}
// EventRegister implements waiter.Waitable.EventRegister.
@@ -841,8 +844,14 @@ func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMess
return n, err
}
-// SetSockOpt sets a socket option. Currently not supported.
-func (e *baseEndpoint) SetSockOpt(tcpip.SettableSocketOption) *tcpip.Error {
+// SetSockOpt sets a socket option.
+func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
+ switch v := opt.(type) {
+ case *tcpip.LingerOption:
+ e.Lock()
+ e.linger = *v
+ e.Unlock()
+ }
return nil
}
@@ -945,8 +954,11 @@ func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
- switch opt.(type) {
+ switch o := opt.(type) {
case *tcpip.LingerOption:
+ e.Lock()
+ *o = e.linger
+ e.Unlock()
return nil
default:
diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go
index 8ab062bca..cd8dfdfa4 100644
--- a/pkg/sentry/syscalls/linux/sys_mmap.go
+++ b/pkg/sentry/syscalls/linux/sys_mmap.go
@@ -100,6 +100,15 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with a special mappable.
+ opts.Offset = 0
+ m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel())
+ if err != nil {
+ return 0, nil, err
+ }
+ opts.MappingIdentity = m // transfers ownership of m to opts
+ opts.Mappable = m
}
rv, err := t.MemoryManager().MMap(t, opts)
diff --git a/pkg/sentry/syscalls/linux/sys_sched.go b/pkg/sentry/syscalls/linux/sys_sched.go
index cd6f4dd94..bfcf44b6f 100644
--- a/pkg/sentry/syscalls/linux/sys_sched.go
+++ b/pkg/sentry/syscalls/linux/sys_sched.go
@@ -30,7 +30,7 @@ const (
//
// +marshal
type SchedParam struct {
- schedPriority int64
+ schedPriority int32
}
// SchedGetparam implements linux syscall sched_getparam(2).
diff --git a/pkg/sentry/syscalls/linux/vfs2/mmap.go b/pkg/sentry/syscalls/linux/vfs2/mmap.go
index dc05c2994..9d9dbf775 100644
--- a/pkg/sentry/syscalls/linux/vfs2/mmap.go
+++ b/pkg/sentry/syscalls/linux/vfs2/mmap.go
@@ -17,6 +17,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/syserror"
@@ -85,6 +86,17 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
if err := file.ConfigureMMap(t, &opts); err != nil {
return 0, nil, err
}
+ } else if shared {
+ // Back shared anonymous mappings with an anonymous tmpfs file.
+ opts.Offset = 0
+ file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
+ if err != nil {
+ return 0, nil, err
+ }
+ defer file.DecRef(t)
+ if err := file.ConfigureMMap(t, &opts); err != nil {
+ return 0, nil, err
+ }
}
rv, err := t.MemoryManager().MMap(t, opts)