summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r--pkg/sentry/vfs/BUILD2
-rw-r--r--pkg/sentry/vfs/anonfs.go4
-rw-r--r--pkg/sentry/vfs/epoll.go2
-rw-r--r--pkg/sentry/vfs/file_description.go60
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go4
-rw-r--r--pkg/sentry/vfs/filesystem.go26
-rw-r--r--pkg/sentry/vfs/memxattr/BUILD15
-rw-r--r--pkg/sentry/vfs/memxattr/xattr.go102
-rw-r--r--pkg/sentry/vfs/mount.go205
-rw-r--r--pkg/sentry/vfs/mount_test.go2
-rw-r--r--pkg/sentry/vfs/options.go19
-rw-r--r--pkg/sentry/vfs/timerfd.go142
-rw-r--r--pkg/sentry/vfs/vfs.go8
13 files changed, 561 insertions, 30 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index bf4d27c7d..9aeb83fb0 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -36,6 +36,7 @@ go_library(
"pathname.go",
"permissions.go",
"resolving_path.go",
+ "timerfd.go",
"vfs.go",
],
visibility = ["//pkg/sentry:internal"],
@@ -51,6 +52,7 @@ go_library(
"//pkg/sentry/fs",
"//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
+ "//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/memmap",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
index d1f6dfb45..a64d86122 100644
--- a/pkg/sentry/vfs/anonfs.go
+++ b/pkg/sentry/vfs/anonfs.go
@@ -245,7 +245,7 @@ func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath
}
// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) {
if !rp.Done() {
return nil, syserror.ENOTDIR
}
@@ -253,7 +253,7 @@ func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([
}
// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error) {
if !rp.Done() {
return "", syserror.ENOTDIR
}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
index 3da45d744..8e0b40841 100644
--- a/pkg/sentry/vfs/epoll.go
+++ b/pkg/sentry/vfs/epoll.go
@@ -99,6 +99,8 @@ func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
interest: make(map[epollInterestKey]*epollInterest),
}
if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ DenyPRead: true,
+ DenyPWrite: true,
UseDentryMetadata: true,
}); err != nil {
return nil, err
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 8ee549dc2..4fb9aea87 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -84,6 +84,17 @@ type FileDescriptionOptions struct {
// usually only the case if O_DIRECT would actually have an effect.
AllowDirectIO bool
+ // If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
+ DenyPRead bool
+
+ // If DenyPWrite is true, calls to FileDescription.PWrite() return
+ // ESPIPE.
+ DenyPWrite bool
+
+ // if InvalidWrite is true, calls to FileDescription.Write() return
+ // EINVAL.
+ InvalidWrite bool
+
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
@@ -175,6 +186,12 @@ func (fd *FileDescription) DecRef() {
}
}
+// Refs returns the current number of references. The returned count
+// is inherently racy and is unsafe to use without external synchronization.
+func (fd *FileDescription) Refs() int64 {
+ return atomic.LoadInt64(&fd.refs)
+}
+
// Mount returns the mount on which fd was opened. It does not take a reference
// on the returned Mount.
func (fd *FileDescription) Mount() *Mount {
@@ -306,6 +323,7 @@ type FileDescriptionImpl interface {
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for reading.
+ // FileDescriptionOptions.DenyPRead == false.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -337,6 +355,7 @@ type FileDescriptionImpl interface {
// EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for writing.
+ // FileDescriptionOptions.DenyPWrite == false.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -382,11 +401,11 @@ type FileDescriptionImpl interface {
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
// Listxattr returns all extended attribute names for the file.
- Listxattr(ctx context.Context) ([]string, error)
+ Listxattr(ctx context.Context, size uint64) ([]string, error)
// Getxattr returns the value associated with the given extended attribute
// for the file.
- Getxattr(ctx context.Context, name string) (string, error)
+ Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
// Setxattr changes the value associated with the given extended attribute
// for the file.
@@ -515,6 +534,9 @@ func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ if fd.opts.DenyPRead {
+ return 0, syserror.ESPIPE
+ }
if !fd.readable {
return 0, syserror.EBADF
}
@@ -533,6 +555,9 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
// offset, and returns the number of bytes written. PWrite is permitted to
// return partial writes with a nil error.
func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if fd.opts.DenyPWrite {
+ return 0, syserror.ESPIPE
+ }
if !fd.writable {
return 0, syserror.EBADF
}
@@ -541,6 +566,9 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o
// Write is similar to PWrite, but does not specify an offset.
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ if fd.opts.InvalidWrite {
+ return 0, syserror.EINVAL
+ }
if !fd.writable {
return 0, syserror.EBADF
}
@@ -577,18 +605,23 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
// Listxattr returns all extended attribute names for the file represented by
// fd.
-func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+//
+// If the size of the list (including a NUL terminating byte after every entry)
+// would exceed size, ERANGE may be returned. Note that implementations
+// are free to ignore size entirely and return without error). In all cases,
+// if size is 0, the list should be returned without error, regardless of size.
+func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
vfsObj.putResolvingPath(rp)
return names, err
}
- names, err := fd.impl.Listxattr(ctx)
+ names, err := fd.impl.Listxattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
@@ -601,34 +634,39 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
// Getxattr returns the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+//
+// If the size of the return value exceeds opts.Size, ERANGE may be returned
+// (note that implementations are free to ignore opts.Size entirely and return
+// without error). In all cases, if opts.Size is 0, the value should be
+// returned without error, regardless of size.
+func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return val, err
}
- return fd.impl.Getxattr(ctx, name)
+ return fd.impl.Getxattr(ctx, *opts)
}
// Setxattr changes the value associated with the given extended attribute for
// the file represented by fd.
-func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
- err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return err
}
- return fd.impl.Setxattr(ctx, opts)
+ return fd.impl.Setxattr(ctx, *opts)
}
// Removexattr removes the given extended attribute from the file represented
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index d45e602ce..f4c111926 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -130,14 +130,14 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
// Listxattr implements FileDescriptionImpl.Listxattr analogously to
// inode_operations::listxattr == NULL in Linux.
-func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context, size uint64) ([]string, error) {
// This isn't exactly accurate; see FileDescription.Listxattr.
return nil, syserror.ENOTSUP
}
// Getxattr implements FileDescriptionImpl.Getxattr analogously to
// inode::i_opflags & IOP_XATTR == 0 in Linux.
-func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, opts GetxattrOptions) (string, error) {
return "", syserror.ENOTSUP
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index cd34782ff..a537a29d1 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -442,7 +442,13 @@ type FilesystemImpl interface {
// - If extended attributes are not supported by the filesystem,
// ListxattrAt returns nil. (See FileDescription.Listxattr for an
// explanation.)
- ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+ //
+ // - If the size of the list (including a NUL terminating byte after every
+ // entry) would exceed size, ERANGE may be returned. Note that
+ // implementations are free to ignore size entirely and return without
+ // error). In all cases, if size is 0, the list should be returned without
+ // error, regardless of size.
+ ListxattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error)
// GetxattrAt returns the value associated with the given extended
// attribute for the file at rp.
@@ -451,7 +457,15 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, GetxattrAt
// returns ENOTSUP.
- GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+ //
+ // - If an extended attribute named opts.Name does not exist, ENODATA is
+ // returned.
+ //
+ // - If the size of the return value exceeds opts.Size, ERANGE may be
+ // returned (note that implementations are free to ignore opts.Size entirely
+ // and return without error). In all cases, if opts.Size is 0, the value
+ // should be returned without error, regardless of size.
+ GetxattrAt(ctx context.Context, rp *ResolvingPath, opts GetxattrOptions) (string, error)
// SetxattrAt changes the value associated with the given extended
// attribute for the file at rp.
@@ -460,6 +474,10 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem, SetxattrAt
// returns ENOTSUP.
+ //
+ // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists,
+ // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist,
+ // ENODATA is returned.
SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
// RemovexattrAt removes the given extended attribute from the file at rp.
@@ -468,6 +486,8 @@ type FilesystemImpl interface {
//
// - If extended attributes are not supported by the filesystem,
// RemovexattrAt returns ENOTSUP.
+ //
+ // - If name does not exist, ENODATA is returned.
RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
// BoundEndpointAt returns the Unix socket endpoint bound at the path rp.
@@ -497,7 +517,7 @@ type FilesystemImpl interface {
// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
- // TODO: inotify_add_watch()
+ // TODO(gvisor.dev/issue/1479): inotify_add_watch()
}
// PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/memxattr/BUILD b/pkg/sentry/vfs/memxattr/BUILD
new file mode 100644
index 000000000..d8c4d27b9
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "memxattr",
+ srcs = ["xattr.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/vfs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/memxattr/xattr.go b/pkg/sentry/vfs/memxattr/xattr.go
new file mode 100644
index 000000000..cc1e7d764
--- /dev/null
+++ b/pkg/sentry/vfs/memxattr/xattr.go
@@ -0,0 +1,102 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memxattr provides a default, in-memory extended attribute
+// implementation.
+package memxattr
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// SimpleExtendedAttributes implements extended attributes using a map of
+// names to values.
+//
+// +stateify savable
+type SimpleExtendedAttributes struct {
+ // mu protects the below fields.
+ mu sync.RWMutex `state:"nosave"`
+ xattrs map[string]string
+}
+
+// Getxattr returns the value at 'name'.
+func (x *SimpleExtendedAttributes) Getxattr(opts *vfs.GetxattrOptions) (string, error) {
+ x.mu.RLock()
+ value, ok := x.xattrs[opts.Name]
+ x.mu.RUnlock()
+ if !ok {
+ return "", syserror.ENODATA
+ }
+ // Check that the size of the buffer provided in getxattr(2) is large enough
+ // to contain the value.
+ if opts.Size != 0 && uint64(len(value)) > opts.Size {
+ return "", syserror.ERANGE
+ }
+ return value, nil
+}
+
+// Setxattr sets 'value' at 'name'.
+func (x *SimpleExtendedAttributes) Setxattr(opts *vfs.SetxattrOptions) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if x.xattrs == nil {
+ if opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+ x.xattrs = make(map[string]string)
+ }
+
+ _, ok := x.xattrs[opts.Name]
+ if ok && opts.Flags&linux.XATTR_CREATE != 0 {
+ return syserror.EEXIST
+ }
+ if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
+ return syserror.ENODATA
+ }
+
+ x.xattrs[opts.Name] = opts.Value
+ return nil
+}
+
+// Listxattr returns all names in xattrs.
+func (x *SimpleExtendedAttributes) Listxattr(size uint64) ([]string, error) {
+ // Keep track of the size of the buffer needed in listxattr(2) for the list.
+ listSize := 0
+ x.mu.RLock()
+ names := make([]string, 0, len(x.xattrs))
+ for n := range x.xattrs {
+ names = append(names, n)
+ // Add one byte per null terminator.
+ listSize += len(n) + 1
+ }
+ x.mu.RUnlock()
+ if size != 0 && uint64(listSize) > size {
+ return nil, syserror.ERANGE
+ }
+ return names, nil
+}
+
+// Removexattr removes the xattr at 'name'.
+func (x *SimpleExtendedAttributes) Removexattr(name string) error {
+ x.mu.Lock()
+ defer x.mu.Unlock()
+ if _, ok := x.xattrs[name]; !ok {
+ return syserror.ENODATA
+ }
+ delete(x.xattrs, name)
+ return nil
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 4b68cabda..f06946103 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -15,7 +15,11 @@
package vfs
import (
+ "bytes"
+ "fmt"
"math"
+ "sort"
+ "strings"
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
@@ -44,7 +48,7 @@ var lastMountID uint64
//
// +stateify savable
type Mount struct {
- // vfs, fs, and root are immutable. References are held on fs and root.
+ // vfs, fs, root are immutable. References are held on fs and root.
//
// Invariant: root belongs to fs.
vfs *VirtualFilesystem
@@ -229,9 +233,9 @@ func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentia
}
vd.dentry.mu.Lock()
}
- // TODO: Linux requires that either both the mount point and the mount root
- // are directories, or neither are, and returns ENOTDIR if this is not the
- // case.
+ // TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
+ // point and the mount root are directories, or neither are, and returns
+ // ENOTDIR if this is not the case.
mntns := vd.mount.ns
mnt := newMount(vfs, fs, root, mntns, opts)
vfs.mounts.seq.BeginWrite()
@@ -270,9 +274,9 @@ func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credenti
}
}
- // TODO(jamieliu): Linux special-cases umount of the caller's root, which
- // we don't implement yet (we'll just fail it since the caller holds a
- // reference on it).
+ // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
+ // root, which we don't implement yet (we'll just fail it since the caller
+ // holds a reference on it).
vfs.mounts.seq.BeginWrite()
if opts.Flags&linux.MNT_DETACH == 0 {
@@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
return nil
}
+func (mnt *Mount) readOnly() bool {
+ return atomic.LoadInt64(&mnt.writers) < 0
+}
+
// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
return mnt.fs
}
+// submountsLocked returns this Mount and all Mounts that are descendents of
+// it.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+ mounts := []*Mount{mnt}
+ for m := range mnt.children {
+ mounts = append(mounts, m.submountsLocked()...)
+ }
+ return mounts
+}
+
// Root returns mntns' root. A reference is taken on the returned
// VirtualDentry.
func (mntns *MountNamespace) Root() VirtualDentry {
@@ -655,3 +675,174 @@ func (mntns *MountNamespace) Root() VirtualDentry {
vd.IncRef()
return vd
}
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+
+ // Format:
+ // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+ //
+ // The "needs dump" and "fsck order" flags are always 0, which
+ // is allowed.
+ fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
+ }
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+ vfs.mountMu.Lock()
+ defer vfs.mountMu.Unlock()
+ rootMnt := taskRootDir.mount
+ mounts := rootMnt.submountsLocked()
+ sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+ for _, mnt := range mounts {
+ // Get the path to this mount relative to task root.
+ mntRootVD := VirtualDentry{
+ mount: mnt,
+ dentry: mnt.root,
+ }
+ path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+ if err != nil {
+ // For some reason we didn't get a path. Log a warning
+ // and run with empty path.
+ ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+ path = ""
+ }
+ if path == "" {
+ // Either an error occurred, or path is not reachable
+ // from root.
+ break
+ }
+ // Stat the mount root to get the major/minor device numbers.
+ pop := &PathOperation{
+ Root: mntRootVD,
+ Start: mntRootVD,
+ }
+ statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+ if err != nil {
+ // Well that's not good. Ignore this mount.
+ break
+ }
+
+ // Format:
+ // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+ // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
+
+ // (1) Mount ID.
+ fmt.Fprintf(buf, "%d ", mnt.ID)
+
+ // (2) Parent ID (or this ID if there is no parent).
+ pID := mnt.ID
+ if p := mnt.parent(); p != nil {
+ pID = p.ID
+ }
+ fmt.Fprintf(buf, "%d ", pID)
+
+ // (3) Major:Minor device ID. We don't have a superblock, so we
+ // just use the root inode device number.
+ fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)
+
+ // (4) Root: the pathname of the directory in the filesystem
+ // which forms the root of this mount.
+ //
+ // NOTE(b/78135857): This will always be "/" until we implement
+ // bind mounts.
+ fmt.Fprintf(buf, "/ ")
+
+ // (5) Mount point (relative to process root).
+ fmt.Fprintf(buf, "%s ", manglePath(path))
+
+ // (6) Mount options.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+ if mnt.flags.NoExec {
+ opts += ",noexec"
+ }
+ // TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
+ // set.
+ fmt.Fprintf(buf, "%s ", opts)
+
+ // (7) Optional fields: zero or more fields of the form "tag[:value]".
+ // (8) Separator: the end of the optional fields is marked by a single hyphen.
+ fmt.Fprintf(buf, "- ")
+
+ // (9) Filesystem type.
+ fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())
+
+ // (10) Mount source: filesystem-specific information or "none".
+ fmt.Fprintf(buf, "none ")
+
+ // (11) Superblock options, and final newline.
+ fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+ }
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
+// See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+ r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+ return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+ // gVisor doesn't (yet) have a concept of super block options, so we
+ // use the ro/rw bit from the mount flag.
+ opts := "rw"
+ if mnt.readOnly() {
+ opts = "ro"
+ }
+
+ // NOTE(b/147673608): If the mount is a cgroup, we also need to include
+ // the cgroup name in the options. For now we just read that from the
+ // path.
+ //
+ // TODO(gvisor.dev/issue/190): Once gVisor has full cgroup support, we
+ // should get this value from the cgroup itself, and not rely on the
+ // path.
+ if mnt.fs.FilesystemType().Name() == "cgroup" {
+ splitPath := strings.Split(mountPath, "/")
+ cgroupType := splitPath[len(splitPath)-1]
+ opts += "," + cgroupType
+ }
+ return opts
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index 3b933468d..3335e4057 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -55,7 +55,7 @@ func TestMountTableInsertLookup(t *testing.T) {
}
}
-// TODO: concurrent lookup/insertion/removal
+// TODO(gvisor.dev/issue/1035): concurrent lookup/insertion/removal.
// must be powers of 2
var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3e90dc4ed..534528ce6 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -16,6 +16,7 @@ package vfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)
// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
@@ -44,6 +45,10 @@ type MknodOptions struct {
// DevMinor are the major and minor device numbers for the created device.
DevMajor uint32
DevMinor uint32
+
+ // Endpoint is the endpoint to bind to the created file, if a socket file is
+ // being created for bind(2) on a Unix domain socket.
+ Endpoint transport.BoundEndpoint
}
// MountFlags contains flags as specified for mount(2), e.g. MS_NOEXEC.
@@ -127,6 +132,20 @@ type SetStatOptions struct {
Stat linux.Statx
}
+// GetxattrOptions contains options to VirtualFilesystem.GetxattrAt(),
+// FilesystemImpl.GetxattrAt(), FileDescription.Getxattr(), and
+// FileDescriptionImpl.Getxattr().
+type GetxattrOptions struct {
+ // Name is the name of the extended attribute to retrieve.
+ Name string
+
+ // Size is the maximum value size that the caller will tolerate. If the value
+ // is larger than size, getxattr methods may return ERANGE, but they are also
+ // free to ignore the hint entirely (i.e. the value returned may be larger
+ // than size). All size checking is done independently at the syscall layer.
+ Size uint64
+}
+
// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
// FileDescriptionImpl.Setxattr().
diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go
new file mode 100644
index 000000000..42b880656
--- /dev/null
+++ b/pkg/sentry/vfs/timerfd.go
@@ -0,0 +1,142 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/context"
+ ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ events waiter.Queue
+ timer *ktime.Timer
+
+ // val is the number of timer expirations since the last successful
+ // call to PRead, or SetTime. val must be accessed using atomic memory
+ // operations.
+ val uint64
+}
+
+var _ FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// NewTimerFD returns a new timer fd.
+func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[timerfd]")
+ defer vd.DecRef()
+ tfd := &TimerFileDescription{}
+ tfd.timer = ktime.NewTimer(clock, tfd)
+ if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ DenyPRead: true,
+ DenyPWrite: true,
+ InvalidWrite: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ const sizeofUint64 = 8
+ if dst.NumBytes() < sizeofUint64 {
+ return 0, syserror.EINVAL
+ }
+ if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+ var buf [sizeofUint64]byte
+ usermem.ByteOrder.PutUint64(buf[:], val)
+ if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+ // Linux does not undo consuming the number of
+ // expirations even if writing to userspace fails.
+ return 0, err
+ }
+ return sizeofUint64, nil
+ }
+ return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+ return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+ return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+ return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ var ready waiter.EventMask
+ if atomic.LoadUint64(&tfd.val) != 0 {
+ ready |= waiter.EventIn
+ }
+ return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+ tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+ tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+ tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (tfd *TimerFileDescription) Release() {
+ tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+ atomic.AddUint64(&tfd.val, exp)
+ tfd.events.Notify(waiter.EventIn)
+ return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 720b90d8f..f592913d5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -680,10 +680,10 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
// ListxattrAt returns all extended attribute names for the file at the given
// path.
-func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+ names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp, size)
if err == nil {
vfs.putResolvingPath(rp)
return names, nil
@@ -705,10 +705,10 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
// GetxattrAt returns the value associated with the given extended attribute
// for the file at the given path.
-func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetxattrOptions) (string, error) {
rp := vfs.getResolvingPath(creds, pop)
for {
- val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return val, nil