summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry')
-rw-r--r--pkg/sentry/arch/signal_arm64.go22
-rw-r--r--pkg/sentry/fs/gofer/fs.go3
-rw-r--r--pkg/sentry/fs/gofer/path.go24
-rw-r--r--pkg/sentry/fs/tmpfs/tmpfs.go11
-rw-r--r--pkg/sentry/fsimpl/gofer/filesystem.go62
-rw-r--r--pkg/sentry/fsimpl/gofer/gofer.go13
-rw-r--r--pkg/sentry/fsimpl/gofer/special_file.go28
-rw-r--r--pkg/sentry/fsimpl/host/host.go41
-rw-r--r--pkg/sentry/fsimpl/tmpfs/BUILD3
-rw-r--r--pkg/sentry/fsimpl/tmpfs/filesystem.go46
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file.go42
-rw-r--r--pkg/sentry/fsimpl/tmpfs/regular_file_test.go138
-rw-r--r--pkg/sentry/fsimpl/tmpfs/stat_test.go4
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs.go38
-rw-r--r--pkg/sentry/fsimpl/tmpfs/tmpfs_test.go156
-rw-r--r--pkg/sentry/kernel/BUILD1
-rw-r--r--pkg/sentry/kernel/kernel.go25
-rw-r--r--pkg/sentry/socket/netfilter/netfilter.go22
-rw-r--r--pkg/sentry/socket/netfilter/owner_matcher.go81
-rw-r--r--pkg/sentry/socket/netstack/BUILD1
-rw-r--r--pkg/sentry/socket/netstack/netstack.go52
-rw-r--r--pkg/sentry/socket/netstack/netstack_vfs2.go17
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/BUILD2
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/fd.go10
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/memfd.go63
-rw-r--r--pkg/sentry/syscalls/linux/vfs2/vfs2.go2
26 files changed, 641 insertions, 266 deletions
diff --git a/pkg/sentry/arch/signal_arm64.go b/pkg/sentry/arch/signal_arm64.go
index 1cb1adf8c..642c79dda 100644
--- a/pkg/sentry/arch/signal_arm64.go
+++ b/pkg/sentry/arch/signal_arm64.go
@@ -19,6 +19,7 @@ import (
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/usermem"
)
@@ -134,6 +135,11 @@ func (c *context64) SignalSetup(st *Stack, act *SignalAct, info *SignalInfo, alt
c.Regs.Regs[1] = uint64(infoAddr)
c.Regs.Regs[2] = uint64(ucAddr)
c.Regs.Regs[30] = uint64(act.Restorer)
+
+ // Save the thread's floating point state.
+ c.sigFPState = append(c.sigFPState, c.aarch64FPState)
+ // Signal handler gets a clean floating point state.
+ c.aarch64FPState = newAarch64FPState()
return nil
}
@@ -155,5 +161,21 @@ func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, SignalSt
c.Regs.Sp = uc.MContext.Sp
c.Regs.Pstate = uc.MContext.Pstate
+ // Restore floating point state.
+ l := len(c.sigFPState)
+ if l > 0 {
+ c.aarch64FPState = c.sigFPState[l-1]
+ // NOTE(cl/133042258): State save requires that any slice
+ // elements from '[len:cap]' to be zero value.
+ c.sigFPState[l-1] = nil
+ c.sigFPState = c.sigFPState[0 : l-1]
+ } else {
+ // This might happen if sigreturn(2) calls are unbalanced with
+ // respect to signal handler entries. This is not expected so
+ // don't bother to do anything fancy with the floating point
+ // state.
+ log.Warningf("sigreturn unable to restore application fpstate")
+ }
+
return uc.Sigset, uc.Stack, nil
}
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 9d41fcbdb..8ae2d78d7 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -60,8 +60,7 @@ const (
limitHostFDTranslationKey = "limit_host_fd_translation"
// overlayfsStaleRead if present closes cached readonly file after the first
- // write. This is done to workaround a limitation of overlayfs in kernels
- // before 4.19 where open FDs are not updated after the file is copied up.
+ // write. This is done to workaround a limitation of Linux overlayfs.
overlayfsStaleRead = "overlayfs_stale_read"
)
diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go
index a35c3a23d..cf9800100 100644
--- a/pkg/sentry/fs/gofer/path.go
+++ b/pkg/sentry/fs/gofer/path.go
@@ -16,7 +16,6 @@ package gofer
import (
"fmt"
- "syscall"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
@@ -68,7 +67,7 @@ func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string
// Get a p9.File for name.
qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name})
if err != nil {
- if err == syscall.ENOENT {
+ if err == syserror.ENOENT {
if cp.cacheNegativeDirents() {
// Return a negative Dirent. It will stay cached until something
// is created over it.
@@ -207,7 +206,7 @@ func (i *inodeOperations) CreateHardLink(ctx context.Context, inode *fs.Inode, t
targetOpts, ok := target.InodeOperations.(*inodeOperations)
if !ok {
- return syscall.EXDEV
+ return syserror.EXDEV
}
if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil {
@@ -251,7 +250,7 @@ func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string,
}
if i.session().overrides == nil {
- return nil, syscall.EOPNOTSUPP
+ return nil, syserror.EOPNOTSUPP
}
// Stabilize the override map while creation is in progress.
@@ -280,7 +279,7 @@ func (i *inodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name st
// N.B. FIFOs use major/minor numbers 0.
if _, err := i.fileState.file.mknod(ctx, name, mode, 0, 0, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil {
- if i.session().overrides == nil || err != syscall.EPERM {
+ if i.session().overrides == nil || err != syserror.EPERM {
return err
}
// If gofer doesn't support mknod, check if we can create an internal fifo.
@@ -427,17 +426,16 @@ func (i *inodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent
return syserror.ENAMETOOLONG
}
- // Unwrap the new parent to a *inodeOperations.
- newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations)
- if !ok {
- return syscall.EXDEV
+ // Don't allow renames across different mounts.
+ if newParent.MountSource != oldParent.MountSource {
+ return syserror.EXDEV
}
+ // Unwrap the new parent to a *inodeOperations.
+ newParentInodeOperations := newParent.InodeOperations.(*inodeOperations)
+
// Unwrap the old parent to a *inodeOperations.
- oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations)
- if !ok {
- return syscall.EXDEV
- }
+ oldParentInodeOperations := oldParent.InodeOperations.(*inodeOperations)
// Do the rename.
if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil {
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 3c2b583ae..b095312fe 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -39,14 +39,13 @@ var fsInfo = fs.Info{
// rename implements fs.InodeOperations.Rename for tmpfs nodes.
func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
- op, ok := oldParent.InodeOperations.(*Dir)
- if !ok {
- return syserror.EXDEV
- }
- np, ok := newParent.InodeOperations.(*Dir)
- if !ok {
+ // Don't allow renames across different mounts.
+ if newParent.MountSource != oldParent.MountSource {
return syserror.EXDEV
}
+
+ op := oldParent.InodeOperations.(*Dir)
+ np := newParent.InodeOperations.(*Dir)
return ramfs.Rename(ctx, op.ramfsDir, oldName, np.ramfsDir, newName, replacement)
}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 4a32821bd..7f2181216 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -21,6 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/p9"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
@@ -835,6 +837,9 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
if d.isSynthetic() {
return nil, syserror.ENXIO
}
+ if d.fs.iopts.OpenSocketsByConnecting {
+ return d.connectSocketLocked(ctx, opts)
+ }
case linux.S_IFIFO:
if d.isSynthetic() {
return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags)
@@ -843,10 +848,28 @@ func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vf
return d.openSpecialFileLocked(ctx, mnt, opts)
}
+func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+ if opts.Flags&linux.O_DIRECT != 0 {
+ return nil, syserror.EINVAL
+ }
+ fdObj, err := d.file.connect(ctx, p9.AnonymousSocket)
+ if err != nil {
+ return nil, err
+ }
+ fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{
+ HaveFlags: true,
+ Flags: opts.Flags,
+ })
+ if err != nil {
+ fdObj.Close()
+ return nil, err
+ }
+ fdObj.Release()
+ return fd, nil
+}
+
func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(opts)
- // Treat as a special file. This is done for non-synthetic pipes as well as
- // regular files when d.fs.opts.regularFilesUseSpecialFileFD is true.
if opts.Flags&linux.O_DIRECT != 0 {
return nil, syserror.EINVAL
}
@@ -854,10 +877,15 @@ func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts
if err != nil {
return nil, err
}
+ seekable := d.fileType() == linux.S_IFREG
fd := &specialFileFD{
- handle: h,
+ handle: h,
+ seekable: seekable,
}
- if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: !seekable,
+ DenyPWrite: !seekable,
+ }); err != nil {
h.close(ctx)
return nil, err
}
@@ -888,7 +916,11 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
creds := rp.Credentials()
name := rp.Component()
- fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, (p9.OpenFlags)(opts.Flags), (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+ // Filter file creation flags and O_LARGEFILE out; the create RPC already
+ // has the semantics of O_CREAT|O_EXCL, while some servers will choke on
+ // O_LARGEFILE.
+ createFlags := p9.OpenFlags(opts.Flags &^ (linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_LARGEFILE))
+ fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
if err != nil {
dirfile.close(ctx)
return nil, err
@@ -896,7 +928,7 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
// Then we need to walk to the file we just created to get a non-open fid
// representing it, and to get its metadata. This must use d.file since, as
// explained above, dirfile was invalidated by dirfile.Create().
- walkQID, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+ _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
if err != nil {
openFile.close(ctx)
if fdobj != nil {
@@ -904,17 +936,6 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
return nil, err
}
- // Sanity-check that we walked to the file we created.
- if createQID.Path != walkQID.Path {
- // Probably due to concurrent remote filesystem mutation?
- ctx.Warningf("gofer.dentry.createAndOpenChildLocked: created file has QID %v before walk, QID %v after (interop=%v)", createQID, walkQID, d.fs.opts.interop)
- nonOpenFile.close(ctx)
- openFile.close(ctx)
- if fdobj != nil {
- fdobj.Close()
- }
- return nil, syserror.EAGAIN
- }
// Construct the new dentry.
child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
@@ -960,16 +981,21 @@ func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.Resolving
}
childVFSFD = &fd.vfsfd
} else {
+ seekable := child.fileType() == linux.S_IFREG
fd := &specialFileFD{
handle: handle{
file: openFile,
fd: -1,
},
+ seekable: seekable,
}
if fdobj != nil {
fd.handle.fd = int32(fdobj.Release())
}
- if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+ DenyPRead: !seekable,
+ DenyPWrite: !seekable,
+ }); err != nil {
fd.handle.close(ctx)
return nil, err
}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index e68e37ebc..353e2cf5b 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -143,9 +143,12 @@ type filesystemOptions struct {
// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
// filesystem may not be coherent with writable host FDs opened later, so
- // mappings of the former must be replaced by mappings of the latter. This
- // is usually only the case when the remote filesystem is an overlayfs
- // mount on Linux < 4.19.
+ // all uses of the former must be replaced by uses of the latter. This is
+ // usually only the case when the remote filesystem is a Linux overlayfs
+ // mount. (Prior to Linux 4.18, patch series centered on commit
+ // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
+ // incoherent between pre-copy-up and post-copy-up FDs; after that patch
+ // series, only memory mappings are incoherent.)
overlayfsStaleRead bool
// If regularFilesUseSpecialFileFD is true, application FDs representing
@@ -221,6 +224,10 @@ type InternalFilesystemOptions struct {
// which servers can handle only a single client and report failure if that
// client disconnects.
LeakConnection bool
+
+ // If OpenSocketsByConnecting is true, silently translate attempts to open
+ // files identifying as sockets to connect RPCs.
+ OpenSocketsByConnecting bool
}
// Name implements vfs.FilesystemType.Name.
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 507e0e276..a464e6a94 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -33,13 +33,14 @@ import (
type specialFileFD struct {
fileDescription
- // handle is immutable.
+ // handle is used for file I/O. handle is immutable.
handle handle
- // off is the file offset. off is protected by mu. (POSIX 2.9.7 only
- // requires operations using the file offset to be atomic for regular files
- // and symlinks; however, since specialFileFD may be used for regular
- // files, we apply this atomicity unconditionally.)
+ // seekable is true if this file description represents a file for which
+ // file offset is significant, i.e. a regular file. seekable is immutable.
+ seekable bool
+
+ // If seekable is true, off is the file offset. off is protected by mu.
mu sync.Mutex
off int64
}
@@ -63,7 +64,7 @@ func (fd *specialFileFD) OnClose(ctx context.Context) error {
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- if offset < 0 {
+ if fd.seekable && offset < 0 {
return 0, syserror.EINVAL
}
if opts.Flags != 0 {
@@ -91,6 +92,10 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
// Read implements vfs.FileDescriptionImpl.Read.
func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ if !fd.seekable {
+ return fd.PRead(ctx, dst, -1, opts)
+ }
+
fd.mu.Lock()
n, err := fd.PRead(ctx, dst, fd.off, opts)
fd.off += n
@@ -100,14 +105,14 @@ func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts
// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- if offset < 0 {
+ if fd.seekable && offset < 0 {
return 0, syserror.EINVAL
}
if opts.Flags != 0 {
return 0, syserror.EOPNOTSUPP
}
- if fd.dentry().fileType() == linux.S_IFREG {
+ if fd.seekable {
limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
if err != nil {
return 0, err
@@ -130,6 +135,10 @@ func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
// Write implements vfs.FileDescriptionImpl.Write.
func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ if !fd.seekable {
+ return fd.PWrite(ctx, src, -1, opts)
+ }
+
fd.mu.Lock()
n, err := fd.PWrite(ctx, src, fd.off, opts)
fd.off += n
@@ -139,6 +148,9 @@ func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if !fd.seekable {
+ return 0, syserror.ESPIPE
+ }
fd.mu.Lock()
defer fd.mu.Unlock()
switch whence {
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 36bceeaa4..8caf55a1b 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -40,8 +40,20 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
-// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
-func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
+// NewFDOptions contains options to NewFD.
+type NewFDOptions struct {
+ // If IsTTY is true, the file descriptor is a TTY.
+ IsTTY bool
+
+ // If HaveFlags is true, use Flags for the new file description. Otherwise,
+ // the new file description will inherit flags from hostFD.
+ HaveFlags bool
+ Flags uint32
+}
+
+// NewFD returns a vfs.FileDescription representing the given host file
+// descriptor. mnt must be Kernel.HostMount().
+func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
fs, ok := mnt.Filesystem().Impl().(*filesystem)
if !ok {
return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
@@ -53,10 +65,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
return nil, err
}
- // Get flags for the imported FD.
- flags, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0)
- if err != nil {
- return nil, err
+ flags := opts.Flags
+ if !opts.HaveFlags {
+ // Get flags for the imported FD.
+ flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0)
+ if err != nil {
+ return nil, err
+ }
+ flags = uint32(flagsInt)
}
fileMode := linux.FileMode(s.Mode)
@@ -65,13 +81,13 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
// devices.
- _, err = unix.Seek(hostFD, 0, linux.SEEK_CUR)
+ _, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
seekable := err != syserror.ESPIPE
i := &inode{
hostFD: hostFD,
seekable: seekable,
- isTTY: isTTY,
+ isTTY: opts.IsTTY,
canMap: canMap(uint32(fileType)),
wouldBlock: wouldBlock(uint32(fileType)),
ino: fs.NextIno(),
@@ -101,7 +117,14 @@ func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs
// i.open will take a reference on d.
defer d.DecRef()
- return i.open(ctx, d.VFSDentry(), mnt, uint32(flags))
+ return i.open(ctx, d.VFSDentry(), mnt, flags)
+}
+
+// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
+func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
+ return NewFD(ctx, mnt, hostFD, &NewFDOptions{
+ IsTTY: isTTY,
+ })
}
// filesystemType implements vfs.FilesystemType.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index a2d9649e7..007be1572 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -52,7 +52,6 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/fs/lock",
- "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/pipe",
"//pkg/sentry/kernel/time",
@@ -96,6 +95,7 @@ go_test(
"pipe_test.go",
"regular_file_test.go",
"stat_test.go",
+ "tmpfs_test.go",
],
library = ":tmpfs",
deps = [
@@ -105,7 +105,6 @@ go_test(
"//pkg/sentry/contexttest",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
- "//pkg/sentry/kernel/contexttest",
"//pkg/sentry/vfs",
"//pkg/syserror",
"//pkg/usermem",
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index 36ffcb592..80fa7b29d 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -16,6 +16,7 @@ package tmpfs
import (
"fmt"
+ "sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
@@ -24,6 +25,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// Sync implements vfs.FilesystemImpl.Sync.
@@ -76,8 +78,8 @@ afterSymlink:
return nil, err
}
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
- // TODO(gvisor.dev/issue/1197): Symlink traversals updates
- // access time.
+ // Symlink traversal updates access time.
+ atomic.StoreInt64(&d.inode.atime, d.inode.fs.clock.Now().Nanoseconds())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
@@ -361,8 +363,8 @@ afterTrailingSymlink:
}
// Do we need to resolve a trailing symlink?
if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
- // TODO(gvisor.dev/issue/1197): Symlink traversals updates
- // access time.
+ // Symlink traversal updates access time.
+ atomic.StoreInt64(&child.inode.atime, child.inode.fs.clock.Now().Nanoseconds())
if err := rp.HandleSymlink(symlink.target); err != nil {
return nil, err
}
@@ -636,12 +638,19 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
- _, err := resolveLocked(rp)
- if err != nil {
+ if _, err := resolveLocked(rp); err != nil {
return linux.Statfs{}, err
}
- // TODO(gvisor.dev/issue/1197): Actually implement statfs.
- return linux.Statfs{}, syserror.ENOSYS
+ statfs := linux.Statfs{
+ Type: linux.TMPFS_MAGIC,
+ BlockSize: usermem.PageSize,
+ FragmentSize: usermem.PageSize,
+ NameLength: linux.NAME_MAX,
+ // TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
+ Blocks: 0,
+ BlocksFree: 0,
+ }
+ return statfs, nil
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -763,5 +772,24 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
- return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+ mnt := vd.Mount()
+ d := vd.Dentry().Impl().(*dentry)
+ for {
+ if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+ return vfs.PrependPathAtVFSRootError{}
+ }
+ if &d.vfsd == mnt.Root() {
+ return nil
+ }
+ if d.parent == nil {
+ if d.name != "" {
+ // This must be an anonymous memfd file.
+ b.PrependComponent("/" + d.name)
+ return vfs.PrependPathSyntheticError{}
+ }
+ return vfs.PrependPathAtNonMountRootError{}
+ }
+ b.PrependComponent(d.name)
+ d = d.parent
+ }
}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 57e5e28ec..3f433d666 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -88,6 +88,7 @@ type regularFile struct {
func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
file := &regularFile{
memFile: fs.memFile,
+ seals: linux.F_SEAL_SEAL,
}
file.inode.init(file, fs, creds, linux.S_IFREG|mode)
file.inode.nlink = 1 // from parent directory
@@ -577,3 +578,44 @@ exitLoop:
return done, retErr
}
+
+// GetSeals returns the current set of seals on a memfd inode.
+func GetSeals(fd *vfs.FileDescription) (uint32, error) {
+ f, ok := fd.Impl().(*regularFileFD)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ rf := f.inode().impl.(*regularFile)
+ rf.dataMu.RLock()
+ defer rf.dataMu.RUnlock()
+ return rf.seals, nil
+}
+
+// AddSeals adds new file seals to a memfd inode.
+func AddSeals(fd *vfs.FileDescription, val uint32) error {
+ f, ok := fd.Impl().(*regularFileFD)
+ if !ok {
+ return syserror.EINVAL
+ }
+ rf := f.inode().impl.(*regularFile)
+ rf.mapsMu.Lock()
+ defer rf.mapsMu.Unlock()
+ rf.dataMu.RLock()
+ defer rf.dataMu.RUnlock()
+
+ if rf.seals&linux.F_SEAL_SEAL != 0 {
+ // Seal applied which prevents addition of any new seals.
+ return syserror.EPERM
+ }
+
+ // F_SEAL_WRITE can only be added if there are no active writable maps.
+ if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
+ if rf.writableMappingPages > 0 {
+ return syserror.EBUSY
+ }
+ }
+
+ // Seals can only be added, never removed.
+ rf.seals |= val
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
index 0399725cf..64e1c40ad 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -18,152 +18,16 @@ import (
"bytes"
"fmt"
"io"
- "sync/atomic"
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/context"
- "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
-// nextFileID is used to generate unique file names.
-var nextFileID int64
-
-// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
-// is not nil, then cleanup should be called when the root is no longer needed.
-func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
- creds := auth.CredentialsFromContext(ctx)
-
- vfsObj := &vfs.VirtualFilesystem{}
- if err := vfsObj.Init(); err != nil {
- return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
- }
-
- vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
- AllowUserMount: true,
- })
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
- if err != nil {
- return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
- }
- root := mntns.Root()
- return vfsObj, root, func() {
- root.DecRef()
- mntns.DecRef()
- }, nil
-}
-
-// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
-// the returned err is not nil, then cleanup should be called when the FD is no
-// longer needed.
-func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
- creds := auth.CredentialsFromContext(ctx)
- vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
- if err != nil {
- return nil, nil, err
- }
-
- filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
-
- // Create the file that will be write/read.
- fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(filename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
- Mode: linux.ModeRegular | mode,
- })
- if err != nil {
- cleanup()
- return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
- }
-
- return fd, cleanup, nil
-}
-
-// newDirFD is like newFileFD, but for directories.
-func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
- creds := auth.CredentialsFromContext(ctx)
- vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
- if err != nil {
- return nil, nil, err
- }
-
- dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
-
- // Create the dir.
- if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(dirname),
- }, &vfs.MkdirOptions{
- Mode: linux.ModeDirectory | mode,
- }); err != nil {
- cleanup()
- return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
- }
-
- // Open the dir and return it.
- fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(dirname),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDONLY | linux.O_DIRECTORY,
- })
- if err != nil {
- cleanup()
- return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
- }
-
- return fd, cleanup, nil
-}
-
-// newPipeFD is like newFileFD, but for pipes.
-func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
- creds := auth.CredentialsFromContext(ctx)
- vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
- if err != nil {
- return nil, nil, err
- }
-
- pipename := fmt.Sprintf("tmpfs-test-pipe-%d", atomic.AddInt64(&nextFileID, 1))
-
- // Create the pipe.
- if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(pipename),
- }, &vfs.MknodOptions{
- Mode: linux.ModeNamedPipe | mode,
- }); err != nil {
- cleanup()
- return nil, nil, fmt.Errorf("failed to create pipe %q: %v", pipename, err)
- }
-
- // Open the pipe and return it.
- fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
- Root: root,
- Start: root,
- Path: fspath.Parse(pipename),
- }, &vfs.OpenOptions{
- Flags: linux.O_RDWR,
- })
- if err != nil {
- cleanup()
- return nil, nil, fmt.Errorf("failed to open pipe %q: %v", pipename, err)
- }
-
- return fd, cleanup, nil
-}
-
// Test that we can write some data to a file and read it back.`
func TestSimpleWriteRead(t *testing.T) {
ctx := contexttest.Context(t)
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
index 60c2c980e..f7ee4aab2 100644
--- a/pkg/sentry/fsimpl/tmpfs/stat_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -19,8 +19,8 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
"gvisor.dev/gvisor/pkg/sentry/vfs"
)
@@ -29,7 +29,6 @@ func TestStatAfterCreate(t *testing.T) {
mode := linux.FileMode(0644)
// Run with different file types.
- // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
for _, typ := range []string{"file", "dir", "pipe"} {
t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
var (
@@ -175,7 +174,6 @@ func TestSetStat(t *testing.T) {
mode := linux.FileMode(0644)
// Run with different file types.
- // TODO(gvisor.dev/issue/1197): Also test symlinks and sockets.
for _, typ := range []string{"file", "dir", "pipe"} {
t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
var (
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index 405928bd0..1e781aecd 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -94,7 +94,7 @@ type FilesystemOpts struct {
}
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
if memFileProvider == nil {
panic("MemoryFileProviderFromContext returned nil")
@@ -139,6 +139,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
return &fs.vfsfs, &root.vfsd, nil
}
+// NewFilesystem returns a new tmpfs filesystem.
+func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) {
+ return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{})
+}
+
// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release() {
fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
@@ -658,3 +663,34 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
return fd.inode().removexattr(auth.CredentialsFromContext(ctx), name)
}
+
+// NewMemfd creates a new tmpfs regular file and file description that can back
+// an anonymous fd created by memfd_create.
+func NewMemfd(mount *vfs.Mount, creds *auth.Credentials, allowSeals bool, name string) (*vfs.FileDescription, error) {
+ fs, ok := mount.Filesystem().Impl().(*filesystem)
+ if !ok {
+ panic("NewMemfd() called with non-tmpfs mount")
+ }
+
+ // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
+ // S_IRWXUGO.
+ mode := linux.FileMode(0777)
+ inode := fs.newRegularFile(creds, mode)
+ rf := inode.impl.(*regularFile)
+ if allowSeals {
+ rf.seals = 0
+ }
+
+ d := fs.newDentry(inode)
+ defer d.DecRef()
+ d.name = name
+
+ // Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+ // FMODE_READ | FMODE_WRITE.
+ var fd regularFileFD
+ flags := uint32(linux.O_RDWR)
+ if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+ return nil, err
+ }
+ return &fd.vfsfd, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
new file mode 100644
index 000000000..a240fb276
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -0,0 +1,156 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// nextFileID is used to generate unique file names.
+var nextFileID int64
+
+// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
+// is not nil, then cleanup should be called when the root is no longer needed.
+func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+
+ vfsObj := &vfs.VirtualFilesystem{}
+ if err := vfsObj.Init(); err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+ }
+
+ vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+ AllowUserMount: true,
+ })
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+ if err != nil {
+ return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+ }
+ root := mntns.Root()
+ return vfsObj, root, func() {
+ root.DecRef()
+ mntns.DecRef()
+ }, nil
+}
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
+
+ // Create the file that will be write/read.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(filename),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+ Mode: linux.ModeRegular | mode,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+ }
+
+ return fd, cleanup, nil
+}
+
+// newDirFD is like newFileFD, but for directories.
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
+
+ // Create the dir.
+ if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dirname),
+ }, &vfs.MkdirOptions{
+ Mode: linux.ModeDirectory | mode,
+ }); err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
+ }
+
+ // Open the dir and return it.
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(dirname),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
+ }
+
+ return fd, cleanup, nil
+}
+
+// newPipeFD is like newFileFD, but for pipes.
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+ creds := auth.CredentialsFromContext(ctx)
+ vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1))
+
+ if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(name),
+ }, &vfs.MknodOptions{
+ Mode: linux.ModeNamedPipe | mode,
+ }); err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err)
+ }
+
+ fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+ Root: root,
+ Start: root,
+ Path: fspath.Parse(name),
+ }, &vfs.OpenOptions{
+ Flags: linux.O_RDWR,
+ })
+ if err != nil {
+ cleanup()
+ return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err)
+ }
+
+ return fd, cleanup, nil
+}
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index 8104f50f3..a28eab8b8 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -173,6 +173,7 @@ go_library(
"//pkg/sentry/fsimpl/pipefs",
"//pkg/sentry/fsimpl/sockfs",
"//pkg/sentry/fsimpl/timerfd",
+ "//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/hostcpu",
"//pkg/sentry/inet",
"//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 3617da8c6..5efeb3767 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -53,6 +53,7 @@ import (
"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -259,6 +260,10 @@ type Kernel struct {
// syscalls (as opposed to named pipes created by mknod()).
pipeMount *vfs.Mount
+ // shmMount is the Mount used for anonymous files created by the
+ // memfd_create() syscalls. It is analagous to Linux's shm_mnt.
+ shmMount *vfs.Mount
+
// socketMount is the Mount used for sockets created by the socket() and
// socketpair() syscalls. There are several cases where a socket dentry will
// not be contained in socketMount:
@@ -330,6 +335,9 @@ func (k *Kernel) Init(args InitKernelArgs) error {
if args.Timekeeper == nil {
return fmt.Errorf("Timekeeper is nil")
}
+ if args.Timekeeper.clocks == nil {
+ return fmt.Errorf("Must call Timekeeper.SetClocks() before Kernel.Init()")
+ }
if args.RootUserNamespace == nil {
return fmt.Errorf("RootUserNamespace is nil")
}
@@ -384,6 +392,18 @@ func (k *Kernel) Init(args InitKernelArgs) error {
}
k.pipeMount = pipeMount
+ tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(k.SupervisorContext(), &k.vfs, auth.NewRootCredentials(k.rootUserNamespace))
+ if err != nil {
+ return fmt.Errorf("failed to create tmpfs filesystem: %v", err)
+ }
+ defer tmpfsFilesystem.DecRef()
+ defer tmpfsRoot.DecRef()
+ shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{})
+ if err != nil {
+ return fmt.Errorf("failed to create tmpfs mount: %v", err)
+ }
+ k.shmMount = shmMount
+
socketFilesystem, err := sockfs.NewFilesystem(&k.vfs)
if err != nil {
return fmt.Errorf("failed to create sockfs filesystem: %v", err)
@@ -1656,6 +1676,11 @@ func (k *Kernel) PipeMount() *vfs.Mount {
return k.pipeMount
}
+// ShmMount returns the tmpfs mount.
+func (k *Kernel) ShmMount() *vfs.Mount {
+ return k.shmMount
+}
+
// SocketMount returns the sockfs mount.
func (k *Kernel) SocketMount() *vfs.Mount {
return k.socketMount
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
index 41a1ce031..789bb94c8 100644
--- a/pkg/sentry/socket/netfilter/netfilter.go
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -59,6 +59,13 @@ type metadata struct {
// developing iptables, but can pollute sentry logs otherwise.
const enableLogging = false
+// emptyFilter is for comparison with a rule's filters to determine whether it
+// is also empty. It is immutable.
+var emptyFilter = stack.IPHeaderFilter{
+ Dst: "\x00\x00\x00\x00",
+ DstMask: "\x00\x00\x00\x00",
+}
+
// nflog logs messages related to the writing and reading of iptables.
func nflog(format string, args ...interface{}) {
if enableLogging && log.IsLogging(log.Debug) {
@@ -484,7 +491,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
}
if offset == replace.Underflow[hook] {
if !validUnderflow(table.Rules[ruleIdx]) {
- nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP")
+ nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP", ruleIdx)
return syserr.ErrInvalidArgument
}
table.Underflows[hk] = ruleIdx
@@ -547,7 +554,7 @@ func SetEntries(stk *stack.Stack, optVal []byte) *syserr.Error {
// make sure all other chains point to ACCEPT rules.
for hook, ruleIdx := range table.BuiltinChains {
if hook == stack.Forward || hook == stack.Postrouting {
- if _, ok := table.Rules[ruleIdx].Target.(stack.AcceptTarget); !ok {
+ if !isUnconditionalAccept(table.Rules[ruleIdx]) {
nflog("hook %d is unsupported.", hook)
return syserr.ErrInvalidArgument
}
@@ -776,6 +783,9 @@ func validUnderflow(rule stack.Rule) bool {
if len(rule.Matchers) != 0 {
return false
}
+ if rule.Filter != emptyFilter {
+ return false
+ }
switch rule.Target.(type) {
case stack.AcceptTarget, stack.DropTarget:
return true
@@ -784,6 +794,14 @@ func validUnderflow(rule stack.Rule) bool {
}
}
+func isUnconditionalAccept(rule stack.Rule) bool {
+ if !validUnderflow(rule) {
+ return false
+ }
+ _, ok := rule.Target.(stack.AcceptTarget)
+ return ok
+}
+
func hookFromLinux(hook int) stack.Hook {
switch hook {
case linux.NF_INET_PRE_ROUTING:
diff --git a/pkg/sentry/socket/netfilter/owner_matcher.go b/pkg/sentry/socket/netfilter/owner_matcher.go
index 5949a7c29..3863293c7 100644
--- a/pkg/sentry/socket/netfilter/owner_matcher.go
+++ b/pkg/sentry/socket/netfilter/owner_matcher.go
@@ -45,14 +45,18 @@ func (ownerMarshaler) marshal(mr stack.Matcher) []byte {
GID: matcher.gid,
}
- // Support for UID match.
- // TODO(gvisor.dev/issue/170): Need to support gid match.
+ // Support for UID and GID match.
if matcher.matchUID {
iptOwnerInfo.Match = linux.XT_OWNER_UID
- } else if matcher.matchGID {
- panic("GID match is not supported.")
- } else {
- panic("UID match is not set.")
+ if matcher.invertUID {
+ iptOwnerInfo.Invert = linux.XT_OWNER_UID
+ }
+ }
+ if matcher.matchGID {
+ iptOwnerInfo.Match |= linux.XT_OWNER_GID
+ if matcher.invertGID {
+ iptOwnerInfo.Invert |= linux.XT_OWNER_GID
+ }
}
buf := make([]byte, 0, linux.SizeOfIPTOwnerInfo)
@@ -71,31 +75,34 @@ func (ownerMarshaler) unmarshal(buf []byte, filter stack.IPHeaderFilter) (stack.
binary.Unmarshal(buf[:linux.SizeOfIPTOwnerInfo], usermem.ByteOrder, &matchData)
nflog("parseMatchers: parsed IPTOwnerInfo: %+v", matchData)
- if matchData.Invert != 0 {
- return nil, fmt.Errorf("invert flag is not supported for owner match")
- }
-
- // Support for UID match.
- // TODO(gvisor.dev/issue/170): Need to support gid match.
- if matchData.Match&linux.XT_OWNER_UID != linux.XT_OWNER_UID {
- return nil, fmt.Errorf("owner match is only supported for uid")
- }
-
- // Check Flags.
var owner OwnerMatcher
owner.uid = matchData.UID
owner.gid = matchData.GID
- owner.matchUID = true
+
+ // Check flags.
+ if matchData.Match&linux.XT_OWNER_UID != 0 {
+ owner.matchUID = true
+ if matchData.Invert&linux.XT_OWNER_UID != 0 {
+ owner.invertUID = true
+ }
+ }
+ if matchData.Match&linux.XT_OWNER_GID != 0 {
+ owner.matchGID = true
+ if matchData.Invert&linux.XT_OWNER_GID != 0 {
+ owner.invertGID = true
+ }
+ }
return &owner, nil
}
type OwnerMatcher struct {
- uid uint32
- gid uint32
- matchUID bool
- matchGID bool
- invert uint8
+ uid uint32
+ gid uint32
+ matchUID bool
+ matchGID bool
+ invertUID bool
+ invertGID bool
}
// Name implements Matcher.Name.
@@ -112,16 +119,30 @@ func (om *OwnerMatcher) Match(hook stack.Hook, pkt stack.PacketBuffer, interface
}
// If the packet owner is not set, drop the packet.
- // Support for uid match.
- // TODO(gvisor.dev/issue/170): Need to support gid match.
- if pkt.Owner == nil || !om.matchUID {
+ if pkt.Owner == nil {
return false, true
}
- // TODO(gvisor.dev/issue/170): Need to add tests to verify
- // drop rule when packet UID does not match owner matcher UID.
- if pkt.Owner.UID() != om.uid {
- return false, false
+ var matches bool
+ // Check for UID match.
+ if om.matchUID {
+ if pkt.Owner.UID() == om.uid {
+ matches = true
+ }
+ if matches == om.invertUID {
+ return false, false
+ }
+ }
+
+ // Check for GID match.
+ if om.matchGID {
+ matches = false
+ if pkt.Owner.GID() == om.gid {
+ matches = true
+ }
+ if matches == om.invertGID {
+ return false, false
+ }
}
return true, false
diff --git a/pkg/sentry/socket/netstack/BUILD b/pkg/sentry/socket/netstack/BUILD
index 6129fb83d..333e0042e 100644
--- a/pkg/sentry/socket/netstack/BUILD
+++ b/pkg/sentry/socket/netstack/BUILD
@@ -18,6 +18,7 @@ go_library(
],
deps = [
"//pkg/abi/linux",
+ "//pkg/amutex",
"//pkg/binary",
"//pkg/context",
"//pkg/log",
diff --git a/pkg/sentry/socket/netstack/netstack.go b/pkg/sentry/socket/netstack/netstack.go
index 81053d8ef..9dea2b5ff 100644
--- a/pkg/sentry/socket/netstack/netstack.go
+++ b/pkg/sentry/socket/netstack/netstack.go
@@ -34,6 +34,7 @@ import (
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/log"
@@ -553,11 +554,9 @@ func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IO
}
if resCh != nil {
- t := kernel.TaskFromContext(ctx)
- if err := t.Block(resCh); err != nil {
- return 0, syserr.FromError(err).ToError()
+ if err := amutex.Block(ctx, resCh); err != nil {
+ return 0, err
}
-
n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
}
@@ -626,11 +625,9 @@ func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader
}
if resCh != nil {
- t := kernel.TaskFromContext(ctx)
- if err := t.Block(resCh); err != nil {
- return 0, syserr.FromError(err).ToError()
+ if err := amutex.Block(ctx, resCh); err != nil {
+ return 0, err
}
-
n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{
Atomic: true, // See above.
})
@@ -1324,6 +1321,29 @@ func getSockOptTCP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfa
return int32(time.Duration(v) / time.Second), nil
+ case linux.TCP_SYNCNT:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ return int32(v), nil
+
+ case linux.TCP_WINDOW_CLAMP:
+ if outLen < sizeOfInt32 {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption)
+ if err != nil {
+ return nil, syserr.TranslateNetstackError(err)
+ }
+
+ return int32(v), nil
default:
emitUnimplementedEventTCP(t, name)
}
@@ -1793,6 +1813,22 @@ func setSockOptTCP(t *kernel.Task, ep commonEndpoint, name int, optVal []byte) *
}
return syserr.TranslateNetstackError(ep.SetSockOpt(tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v))))
+ case linux.TCP_SYNCNT:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+ v := usermem.ByteOrder.Uint32(optVal)
+
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v)))
+
+ case linux.TCP_WINDOW_CLAMP:
+ if len(optVal) < sizeOfInt32 {
+ return syserr.ErrInvalidArgument
+ }
+ v := usermem.ByteOrder.Uint32(optVal)
+
+ return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v)))
+
case linux.TCP_REPAIR_OPTIONS:
t.Kernel().EmitUnimplementedEvent(t)
diff --git a/pkg/sentry/socket/netstack/netstack_vfs2.go b/pkg/sentry/socket/netstack/netstack_vfs2.go
index 191970d41..fcd8013c0 100644
--- a/pkg/sentry/socket/netstack/netstack_vfs2.go
+++ b/pkg/sentry/socket/netstack/netstack_vfs2.go
@@ -16,6 +16,7 @@ package netstack
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/amutex"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
@@ -89,11 +90,6 @@ func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
s.socketOpsCommon.EventUnregister(e)
}
-// PRead implements vfs.FileDescriptionImpl.
-func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
- return 0, syserror.ESPIPE
-}
-
// Read implements vfs.FileDescriptionImpl.
func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
// All flags other than RWF_NOWAIT should be ignored.
@@ -115,11 +111,6 @@ func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.
return int64(n), nil
}
-// PWrite implements vfs.FileDescriptionImpl.
-func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
- return 0, syserror.ESPIPE
-}
-
// Write implements vfs.FileDescriptionImpl.
func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
// All flags other than RWF_NOWAIT should be ignored.
@@ -135,11 +126,9 @@ func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs
}
if resCh != nil {
- t := kernel.TaskFromContext(ctx)
- if err := t.Block(resCh); err != nil {
- return 0, syserr.FromError(err).ToError()
+ if err := amutex.Block(ctx, resCh); err != nil {
+ return 0, err
}
-
n, _, err = s.Endpoint.Write(f, tcpip.WriteOptions{})
}
diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD
index c32f942fb..f882ef840 100644
--- a/pkg/sentry/syscalls/linux/vfs2/BUILD
+++ b/pkg/sentry/syscalls/linux/vfs2/BUILD
@@ -13,6 +13,7 @@ go_library(
"fscontext.go",
"getdents.go",
"ioctl.go",
+ "memfd.go",
"mmap.go",
"path.go",
"pipe.go",
@@ -43,6 +44,7 @@ go_library(
"//pkg/sentry/fsimpl/pipefs",
"//pkg/sentry/fsimpl/signalfd",
"//pkg/sentry/fsimpl/timerfd",
+ "//pkg/sentry/fsimpl/tmpfs",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/pipe",
diff --git a/pkg/sentry/syscalls/linux/vfs2/fd.go b/pkg/sentry/syscalls/linux/vfs2/fd.go
index 8181d80f4..ca0f7fd1e 100644
--- a/pkg/sentry/syscalls/linux/vfs2/fd.go
+++ b/pkg/sentry/syscalls/linux/vfs2/fd.go
@@ -17,6 +17,7 @@ package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
@@ -157,6 +158,15 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.EBADF
}
return uintptr(pipefile.PipeSize()), nil, nil
+ case linux.F_GET_SEALS:
+ val, err := tmpfs.GetSeals(file)
+ return uintptr(val), nil, err
+ case linux.F_ADD_SEALS:
+ if !file.IsWritable() {
+ return 0, nil, syserror.EPERM
+ }
+ err := tmpfs.AddSeals(file, args[2].Uint())
+ return 0, nil, err
default:
// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
return 0, nil, syserror.EINVAL
diff --git a/pkg/sentry/syscalls/linux/vfs2/memfd.go b/pkg/sentry/syscalls/linux/vfs2/memfd.go
new file mode 100644
index 000000000..bbe248d17
--- /dev/null
+++ b/pkg/sentry/syscalls/linux/vfs2/memfd.go
@@ -0,0 +1,63 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs2
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ memfdPrefix = "memfd:"
+ memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
+ memfdAllFlags = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
+)
+
+// MemfdCreate implements the linux syscall memfd_create(2).
+func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
+ addr := args[0].Pointer()
+ flags := args[1].Uint()
+
+ if flags&^memfdAllFlags != 0 {
+ // Unknown bits in flags.
+ return 0, nil, syserror.EINVAL
+ }
+
+ allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
+ cloExec := flags&linux.MFD_CLOEXEC != 0
+
+ name, err := t.CopyInString(addr, memfdMaxNameLen)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ shmMount := t.Kernel().ShmMount()
+ file, err := tmpfs.NewMemfd(shmMount, t.Credentials(), allowSeals, memfdPrefix+name)
+ if err != nil {
+ return 0, nil, err
+ }
+
+ fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
+ CloseOnExec: cloExec,
+ })
+ if err != nil {
+ return 0, nil, err
+ }
+
+ return uintptr(fd), nil, nil
+}
diff --git a/pkg/sentry/syscalls/linux/vfs2/vfs2.go b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
index 9c04677f1..ec8da7f06 100644
--- a/pkg/sentry/syscalls/linux/vfs2/vfs2.go
+++ b/pkg/sentry/syscalls/linux/vfs2/vfs2.go
@@ -158,7 +158,7 @@ func Override() {
s.Table[306] = syscalls.Supported("syncfs", Syncfs)
s.Table[307] = syscalls.Supported("sendmmsg", SendMMsg)
s.Table[316] = syscalls.Supported("renameat2", Renameat2)
- delete(s.Table, 319) // memfd_create
+ s.Table[319] = syscalls.Supported("memfd_create", MemfdCreate)
s.Table[322] = syscalls.Supported("execveat", Execveat)
s.Table[327] = syscalls.Supported("preadv2", Preadv2)
s.Table[328] = syscalls.Supported("pwritev2", Pwritev2)