summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r--pkg/sentry/vfs/BUILD41
-rw-r--r--pkg/sentry/vfs/anonfs.go259
-rw-r--r--pkg/sentry/vfs/context.go2
-rw-r--r--pkg/sentry/vfs/dentry.go2
-rw-r--r--pkg/sentry/vfs/device.go129
-rw-r--r--pkg/sentry/vfs/epoll.go377
-rw-r--r--pkg/sentry/vfs/file_description.go224
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go113
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go156
-rw-r--r--pkg/sentry/vfs/filesystem.go23
-rw-r--r--pkg/sentry/vfs/filesystem_type.go57
-rw-r--r--pkg/sentry/vfs/mount.go17
-rw-r--r--pkg/sentry/vfs/mount_test.go3
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go4
-rw-r--r--pkg/sentry/vfs/options.go4
-rw-r--r--pkg/sentry/vfs/pathname.go5
-rw-r--r--pkg/sentry/vfs/permissions.go29
-rw-r--r--pkg/sentry/vfs/resolving_path.go2
-rw-r--r--pkg/sentry/vfs/testutil.go173
-rw-r--r--pkg/sentry/vfs/vfs.go76
20 files changed, 1403 insertions, 293 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index e3e554b88..14b39eb9d 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,14 +1,30 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
-package(licenses = ["notice"])
+licenses(["notice"])
+
+go_template_instance(
+ name = "epoll_interest_list",
+ out = "epoll_interest_list.go",
+ package = "vfs",
+ prefix = "epollInterest",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*epollInterest",
+ "Linker": "*epollInterest",
+ },
+)
go_library(
name = "vfs",
srcs = [
+ "anonfs.go",
"context.go",
"debug.go",
"dentry.go",
+ "device.go",
+ "epoll.go",
+ "epoll_interest_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
@@ -20,21 +36,20 @@ go_library(
"pathname.go",
"permissions.go",
"resolving_path.go",
- "testutil.go",
"vfs.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/sentry/arch",
- "//pkg/sentry/context",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/usermem",
- "//pkg/syncutil",
+ "//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
"//pkg/waiter",
],
)
@@ -46,13 +61,13 @@ go_test(
"file_description_impl_util_test.go",
"mount_test.go",
],
- embed = [":vfs"],
+ library = ":vfs",
deps = [
"//pkg/abi/linux",
- "//pkg/sentry/context",
- "//pkg/sentry/context/contexttest",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/usermem",
+ "//pkg/context",
+ "//pkg/sentry/contexttest",
+ "//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
new file mode 100644
index 000000000..2db25be49
--- /dev/null
+++ b/pkg/sentry/vfs/anonfs.go
@@ -0,0 +1,259 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name,
+// consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References
+// are taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
+ d := anonDentry{
+ name: name,
+ }
+ d.vfsd.Init(&d)
+ vfs.anonMount.IncRef()
+ // anonDentry no-ops refcounting.
+ return VirtualDentry{
+ mount: vfs.anonMount,
+ dentry: &d.vfsd,
+ }
+}
+
+const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+// anonFilesystem is the implementation of FilesystemImpl that backs
+// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+//
+// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
+// methods that would require an anonDentry to be a directory return ENOTDIR.
+type anonFilesystem struct {
+ vfsfs Filesystem
+
+ devMinor uint32
+}
+
+type anonDentry struct {
+ vfsd Dentry
+
+ name string
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *anonFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *anonFilesystem) Sync(ctx context.Context) error {
+ return nil
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.CheckSearchable {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+ if !rp.Final() {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, syserror.ENODEV
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.EINVAL
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ // Linux actually permits anon_inode_inode's metadata to be set, which is
+ // visible to all users of anon_inode_inode. We just silently ignore
+ // metadata changes.
+ return nil
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+ if !rp.Done() {
+ return linux.Statx{}, syserror.ENOTDIR
+ }
+ // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+ return linux.Statx{
+ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+ Blksize: anonfsBlockSize,
+ Nlink: 1,
+ UID: uint32(auth.RootKUID),
+ GID: uint32(auth.RootKGID),
+ Mode: 0600, // no type is correct
+ Ino: 1,
+ Size: 0,
+ Blocks: 0,
+ DevMajor: 0,
+ DevMinor: fs.devMinor,
+ }, nil
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+ if !rp.Done() {
+ return linux.Statfs{}, syserror.ENOTDIR
+ }
+ return linux.Statfs{
+ Type: linux.ANON_INODE_FS_MAGIC,
+ BlockSize: anonfsBlockSize,
+ }, nil
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, nil
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+ b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name))
+ return PrependPathSyntheticError{}
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *anonDentry) IncRef() {
+ // no-op
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *anonDentry) TryIncRef() bool {
+ return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *anonDentry) DecRef() {
+ // no-op
+}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 705194ebc..d97362b9a 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -15,7 +15,7 @@
package vfs
import (
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
)
// contextID is this package's type for context.Context.Value keys.
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 1bc9c4a38..486a76475 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -16,9 +16,9 @@ package vfs
import (
"fmt"
- "sync"
"sync/atomic"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..3af2aa58d
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,129 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+ // BlockDevice indicates a block device.
+ BlockDevice DeviceKind = iota
+
+ // CharDevice indicates a character device.
+ CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+ switch kind {
+ case BlockDevice:
+ return "block"
+ case CharDevice:
+ return "character"
+ default:
+ return fmt.Sprintf("invalid device kind %d", kind)
+ }
+}
+
+type devTuple struct {
+ kind DeviceKind
+ major uint32
+ minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+ // Open returns a FileDescription representing this device.
+ Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+type registeredDevice struct {
+ dev Device
+ opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+type RegisterDeviceOptions struct {
+ // GroupName is the name shown for this device registration in
+ // /proc/devices. If GroupName is empty, this registration will not be
+ // shown in /proc/devices.
+ GroupName string
+}
+
+// RegisterDevice registers the given Device in vfs with the given major and
+// minor device numbers.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.Lock()
+ defer vfs.devicesMu.Unlock()
+ if existing, ok := vfs.devices[tup]; ok {
+ return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+ }
+ vfs.devices[tup] = &registeredDevice{
+ dev: dev,
+ opts: *opts,
+ }
+ return nil
+}
+
+// OpenDeviceSpecialFile returns a FileDescription representing the given
+// device.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+ tup := devTuple{kind, major, minor}
+ vfs.devicesMu.RLock()
+ defer vfs.devicesMu.RUnlock()
+ rd, ok := vfs.devices[tup]
+ if !ok {
+ return nil, syserror.ENXIO
+ }
+ return rd.dev.Open(ctx, mnt, d, *opts)
+}
+
+// GetAnonBlockDevMinor allocates and returns an unused minor device number for
+// an "anonymous" block device with major number 0.
+func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
+ vfs.anonBlockDevMinorMu.Lock()
+ defer vfs.anonBlockDevMinorMu.Unlock()
+ minor := vfs.anonBlockDevMinorNext
+ const maxDevMinor = (1 << 20) - 1
+ for minor < maxDevMinor {
+ if _, ok := vfs.anonBlockDevMinor[minor]; !ok {
+ vfs.anonBlockDevMinor[minor] = struct{}{}
+ vfs.anonBlockDevMinorNext = minor + 1
+ return minor, nil
+ }
+ minor++
+ }
+ return 0, syserror.EMFILE
+}
+
+// PutAnonBlockDevMinor deallocates a minor device number returned by a
+// previous call to GetAnonBlockDevMinor.
+func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) {
+ vfs.anonBlockDevMinorMu.Lock()
+ defer vfs.anonBlockDevMinorMu.Unlock()
+ delete(vfs.anonBlockDevMinor, minor)
+ if minor < vfs.anonBlockDevMinorNext {
+ vfs.anonBlockDevMinorNext = minor
+ }
+}
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
new file mode 100644
index 000000000..7c83f9a5a
--- /dev/null
+++ b/pkg/sentry/vfs/epoll.go
@@ -0,0 +1,377 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// epollCycleMu serializes attempts to register EpollInstances with other
+// EpollInstances in order to check for cycles.
+var epollCycleMu sync.Mutex
+
+// EpollInstance represents an epoll instance, as described by epoll(7).
+type EpollInstance struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // q holds waiters on this EpollInstance.
+ q waiter.Queue
+
+ // interest is the set of file descriptors that are registered with the
+ // EpollInstance for monitoring. interest is protected by interestMu.
+ interestMu sync.Mutex
+ interest map[epollInterestKey]*epollInterest
+
+ // mu protects fields in registered epollInterests.
+ mu sync.Mutex
+
+ // ready is the set of file descriptors that may be "ready" for I/O. Note
+ // that this must be an ordered list, not a map: "If more than maxevents
+ // file descriptors are ready when epoll_wait() is called, then successive
+ // epoll_wait() calls will round robin through the set of ready file
+ // descriptors. This behavior helps avoid starvation scenarios, where a
+ // process fails to notice that additional file descriptors are ready
+ // because it focuses on a set of file descriptors that are already known
+ // to be ready." - epoll_wait(2)
+ ready epollInterestList
+}
+
+type epollInterestKey struct {
+ // file is the registered FileDescription. No reference is held on file;
+ // instead, when the last reference is dropped, FileDescription.DecRef()
+ // removes the FileDescription from all EpollInstances. file is immutable.
+ file *FileDescription
+
+ // num is the file descriptor number with which this entry was registered.
+ // num is immutable.
+ num int32
+}
+
+// epollInterest represents an EpollInstance's interest in a file descriptor.
+type epollInterest struct {
+ // epoll is the owning EpollInstance. epoll is immutable.
+ epoll *EpollInstance
+
+ // key is the file to which this epollInterest applies. key is immutable.
+ key epollInterestKey
+
+ // waiter is registered with key.file. entry is protected by epoll.mu.
+ waiter waiter.Entry
+
+ // mask is the event mask associated with this registration, including
+ // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
+ mask uint32
+
+ // ready is true if epollInterestEntry is linked into epoll.ready. ready
+ // and epollInterestEntry are protected by epoll.mu.
+ ready bool
+ epollInterestEntry
+
+ // userData is the epoll_data_t associated with this epollInterest.
+ // userData is protected by epoll.mu.
+ userData [2]int32
+}
+
+// NewEpollInstanceFD returns a FileDescription representing a new epoll
+// instance. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[eventpoll]")
+ defer vd.DecRef()
+ ep := &EpollInstance{
+ interest: make(map[epollInterestKey]*epollInterest),
+ }
+ if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &ep.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (ep *EpollInstance) Release() {
+ // Unregister all polled fds.
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key, epi := range ep.interest {
+ file := key.file
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+ file.EventUnregister(&epi.waiter)
+ }
+ ep.interest = nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if mask&waiter.EventIn == 0 {
+ return 0
+ }
+ ep.mu.Lock()
+ for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ if epi.key.file.Readiness(wmask)&wmask != 0 {
+ ep.mu.Unlock()
+ return waiter.EventIn
+ }
+ }
+ ep.mu.Unlock()
+ return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ ep.q.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
+ ep.q.EventUnregister(e)
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
+ return 0, nil
+}
+
+// AddInterest implements the semantics of EPOLL_CTL_ADD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+ // Check for cyclic polling if necessary.
+ subep, _ := file.impl.(*EpollInstance)
+ if subep != nil {
+ epollCycleMu.Lock()
+ // epollCycleMu must be locked for the rest of AddInterest to ensure
+ // that cyclic polling is not introduced after the check.
+ defer epollCycleMu.Unlock()
+ if subep.mightPoll(ep) {
+ return syserror.ELOOP
+ }
+ }
+
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is already registered.
+ key := epollInterestKey{
+ file: file,
+ num: num,
+ }
+ if _, ok := ep.interest[key]; ok {
+ return syserror.EEXIST
+ }
+
+ // Register interest in file.
+ mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ epi := &epollInterest{
+ epoll: ep,
+ key: key,
+ mask: mask,
+ userData: userData,
+ }
+ ep.interest[key] = epi
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ // Add epi to file.epolls so that it is removed when the last
+ // FileDescription reference is dropped.
+ file.epollMu.Lock()
+ file.epolls[epi] = struct{}{}
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
+ return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
+}
+
+func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key := range ep.interest {
+ nextep, ok := key.file.impl.(*EpollInstance)
+ if !ok {
+ continue
+ }
+ if nextep == ep2 {
+ return true
+ }
+ if remainingRecursion == 0 {
+ return true
+ }
+ if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
+ return true
+ }
+ }
+ return false
+}
+
+// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Update epi for the next call to ep.ReadEvents().
+ ep.mu.Lock()
+ epi.mask = mask
+ epi.userData = userData
+ ep.mu.Unlock()
+
+ // Re-register with the new mask.
+ mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ file.EventUnregister(&epi.waiter)
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready with the new mask.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ return nil
+}
+
+// DeleteInterest implements the semantics of EPOLL_CTL_DEL.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Unregister from the file so that epi will no longer be readied.
+ file.EventUnregister(&epi.waiter)
+
+ // Forget about epi.
+ ep.removeLocked(epi)
+
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (epi *epollInterest) Callback(*waiter.Entry) {
+ newReady := false
+ epi.epoll.mu.Lock()
+ if !epi.ready {
+ newReady = true
+ epi.ready = true
+ epi.epoll.ready.PushBack(epi)
+ }
+ epi.epoll.mu.Unlock()
+ if newReady {
+ epi.epoll.q.Notify(waiter.EventIn)
+ }
+}
+
+// Preconditions: ep.interestMu must be locked.
+func (ep *EpollInstance) removeLocked(epi *epollInterest) {
+ delete(ep.interest, epi.key)
+ ep.mu.Lock()
+ if epi.ready {
+ epi.ready = false
+ ep.ready.Remove(epi)
+ }
+ ep.mu.Unlock()
+}
+
+// ReadEvents reads up to len(events) ready events into events and returns the
+// number of events read.
+//
+// Preconditions: len(events) != 0.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+ i := 0
+ // Hot path: avoid defer.
+ ep.mu.Lock()
+ var next *epollInterest
+ var requeue epollInterestList
+ for epi := ep.ready.Front(); epi != nil; epi = next {
+ next = epi.Next()
+ // Regardless of what else happens, epi is initially removed from the
+ // ready list.
+ ep.ready.Remove(epi)
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ ievents := epi.key.file.Readiness(wmask) & wmask
+ if ievents == 0 {
+ // Leave epi off the ready list.
+ epi.ready = false
+ continue
+ }
+ // Determine what we should do with epi.
+ switch {
+ case epi.mask&linux.EPOLLONESHOT != 0:
+ // Clear all events from the mask; they must be re-added by
+ // EPOLL_CTL_MOD.
+ epi.mask &= linux.EP_PRIVATE_BITS
+ fallthrough
+ case epi.mask&linux.EPOLLET != 0:
+ // Leave epi off the ready list.
+ epi.ready = false
+ default:
+ // Queue epi to be moved to the end of the ready list.
+ requeue.PushBack(epi)
+ }
+ // Report ievents.
+ events[i] = linux.EpollEvent{
+ Events: ievents.ToLinux(),
+ Fd: epi.userData[0],
+ Data: epi.userData[1],
+ }
+ i++
+ if i == len(events) {
+ break
+ }
+ }
+ ep.ready.PushBackList(&requeue)
+ ep.mu.Unlock()
+ return i
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 0b053201a..5bac660c7 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -18,12 +18,14 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -45,12 +47,32 @@ type FileDescription struct {
// memory operations.
statusFlags uint32
+ // epolls is the set of epollInterests registered for this FileDescription.
+ // epolls is protected by epollMu.
+ epollMu sync.Mutex
+ epolls map[*epollInterest]struct{}
+
// vd is the filesystem location at which this FileDescription was opened.
// A reference is held on vd. vd is immutable.
vd VirtualDentry
+ // opts contains options passed to FileDescription.Init(). opts is
+ // immutable.
opts FileDescriptionOptions
+ // readable is MayReadFileWithOpenFlags(statusFlags). readable is
+ // immutable.
+ //
+ // readable is analogous to Linux's FMODE_READ.
+ readable bool
+
+ // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
+ // the FileDescription holds a write count on vd.mount. writable is
+ // immutable.
+ //
+ // writable is analogous to Linux's FMODE_WRITE.
+ writable bool
+
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
@@ -61,20 +83,45 @@ type FileDescriptionOptions struct {
// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
// usually only the case if O_DIRECT would actually have an effect.
AllowDirectIO bool
+
+ // If UseDentryMetadata is true, calls to FileDescription methods that
+ // interact with file and filesystem metadata (Stat, SetStat, StatFS,
+ // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+ // the corresponding FilesystemImpl methods instead of the corresponding
+ // FileDescriptionImpl methods.
+ //
+ // UseDentryMetadata is intended for file descriptions that are implemented
+ // outside of individual filesystems, such as pipes, sockets, and device
+ // special files. FileDescriptions for which UseDentryMetadata is true may
+ // embed DentryMetadataFileDescriptionImpl to obtain appropriate
+ // implementations of FileDescriptionImpl methods that should not be
+ // called.
+ UseDentryMetadata bool
}
-// Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller. statusFlags is the initial file description
-// status flags, which is usually the full set of flags passed to open(2).
-func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
+// Init must be called before first use of fd. If it succeeds, it takes
+// references on mnt and d. statusFlags is the initial file description status
+// flags, which is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
+ writable := MayWriteFileWithOpenFlags(statusFlags)
+ if writable {
+ if err := mnt.CheckBeginWrite(); err != nil {
+ return err
+ }
+ }
+
fd.refs = 1
fd.statusFlags = statusFlags | linux.O_LARGEFILE
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
}
+ fd.vd.IncRef()
fd.opts = *opts
+ fd.readable = MayReadFileWithOpenFlags(statusFlags)
+ fd.writable = writable
fd.impl = impl
+ return nil
}
// IncRef increments fd's reference count.
@@ -101,7 +148,27 @@ func (fd *FileDescription) TryIncRef() bool {
// DecRef decrements fd's reference count.
func (fd *FileDescription) DecRef() {
if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+ // Unregister fd from all epoll instances.
+ fd.epollMu.Lock()
+ epolls := fd.epolls
+ fd.epolls = nil
+ fd.epollMu.Unlock()
+ for epi := range epolls {
+ ep := epi.epoll
+ ep.interestMu.Lock()
+ // Check that epi has not been concurrently unregistered by
+ // EpollInstance.DeleteInterest() or EpollInstance.Release().
+ if _, ok := ep.interest[epi.key]; ok {
+ fd.EventUnregister(&epi.waiter)
+ ep.removeLocked(epi)
+ }
+ ep.interestMu.Unlock()
+ }
+ // Release implementation resources.
fd.impl.Release()
+ if fd.writable {
+ fd.vd.mount.EndWrite()
+ }
fd.vd.DecRef()
} else if refs < 0 {
panic("FileDescription.DecRef() called without holding a reference")
@@ -140,7 +207,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
// sense. However, the check as actually implemented seems to be "O_APPEND
// cannot be changed if the file is marked as append-only".
if (flags^oldFlags)&linux.O_APPEND != 0 {
- stat, err := fd.impl.Stat(ctx, StatOptions{
+ stat, err := fd.Stat(ctx, StatOptions{
// There is no mask bit for stx_attributes.
Mask: 0,
// Linux just reads inode::i_flags directly.
@@ -154,7 +221,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
}
}
if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
- stat, err := fd.impl.Stat(ctx, StatOptions{
+ stat, err := fd.Stat(ctx, StatOptions{
Mask: linux.STATX_UID,
// Linux's inode_owner_or_capable() just reads inode::i_uid
// directly.
@@ -179,6 +246,16 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
return nil
}
+// IsReadable returns true if fd was opened for reading.
+func (fd *FileDescription) IsReadable() bool {
+ return fd.readable
+}
+
+// IsWritable returns true if fd was opened for writing.
+func (fd *FileDescription) IsWritable() bool {
+ return fd.writable
+}
+
// Impl returns the FileDescriptionImpl associated with fd.
func (fd *FileDescription) Impl() FileDescriptionImpl {
return fd.impl
@@ -226,6 +303,8 @@ type FileDescriptionImpl interface {
// Errors:
//
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for reading.
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
@@ -239,6 +318,8 @@ type FileDescriptionImpl interface {
// Errors:
//
// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for reading.
Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
// PWrite writes src to the file, starting at the given offset, and returns
@@ -253,6 +334,8 @@ type FileDescriptionImpl interface {
//
// - If opts.Flags specifies unsupported options, PWrite returns
// EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for writing.
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
@@ -266,6 +349,8 @@ type FileDescriptionImpl interface {
// Errors:
//
// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
+ //
+ // Preconditions: The FileDescription was opened for writing.
Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
// IterDirents invokes cb on each entry in the directory represented by the
@@ -309,7 +394,25 @@ type FileDescriptionImpl interface {
// Removexattr removes the given extended attribute from the file.
Removexattr(ctx context.Context, name string) error
- // TODO: file locking
+ // LockBSD tries to acquire a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+ // LockBSD releases a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+ // LockPOSIX tries to acquire a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+
+ // UnlockPOSIX releases a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
}
// Dirent holds the information contained in struct linux_dirent64.
@@ -348,29 +451,80 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
// Stat returns metadata for the file represented by fd.
func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return stat, err
+ }
return fd.impl.Stat(ctx, opts)
}
// SetStat updates metadata for the file represented by fd.
func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
return fd.impl.SetStat(ctx, opts)
}
// StatFS returns metadata for the filesystem containing the file represented
// by fd.
func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+ vfsObj.putResolvingPath(rp)
+ return statfs, err
+ }
return fd.impl.StatFS(ctx)
}
+// Readiness returns fd's I/O readiness.
+func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.impl.Readiness(mask)
+}
+
+// EventRegister registers e for I/O readiness events in mask.
+func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.impl.EventRegister(e, mask)
+}
+
+// EventUnregister unregisters e for I/O readiness events.
+func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
+ fd.impl.EventUnregister(e)
+}
+
// PRead reads from the file represented by fd into dst, starting at the given
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ if !fd.readable {
+ return 0, syserror.EBADF
+ }
return fd.impl.PRead(ctx, dst, offset, opts)
}
// Read is similar to PRead, but does not specify an offset.
func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ if !fd.readable {
+ return 0, syserror.EBADF
+ }
return fd.impl.Read(ctx, dst, opts)
}
@@ -378,11 +532,17 @@ func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opt
// offset, and returns the number of bytes written. PWrite is permitted to
// return partial writes with a nil error.
func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if !fd.writable {
+ return 0, syserror.EBADF
+ }
return fd.impl.PWrite(ctx, src, offset, opts)
}
// Write is similar to PWrite, but does not specify an offset.
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ if !fd.writable {
+ return 0, syserror.EBADF
+ }
return fd.impl.Write(ctx, src, opts)
}
@@ -417,6 +577,16 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
// Listxattr returns all extended attribute names for the file represented by
// fd.
func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+ vfsObj.putResolvingPath(rp)
+ return names, err
+ }
names, err := fd.impl.Listxattr(ctx)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
@@ -431,18 +601,48 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
// Getxattr returns the value associated with the given extended attribute for
// the file represented by fd.
func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(rp)
+ return val, err
+ }
return fd.impl.Getxattr(ctx, name)
}
// Setxattr changes the value associated with the given extended attribute for
// the file represented by fd.
func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
return fd.impl.Setxattr(ctx, opts)
}
// Removexattr removes the given extended attribute from the file represented
// by fd.
func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+ if fd.opts.UseDentryMetadata {
+ vfsObj := fd.vd.mount.vfs
+ rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+ Root: fd.vd,
+ Start: fd.vd,
+ })
+ err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+ vfsObj.putResolvingPath(rp)
+ return err
+ }
return fd.impl.Removexattr(ctx, name)
}
@@ -464,7 +664,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
// DeviceID implements memmap.MappingIdentity.DeviceID.
func (fd *FileDescription) DeviceID() uint64 {
- stat, err := fd.impl.Stat(context.Background(), StatOptions{
+ stat, err := fd.Stat(context.Background(), StatOptions{
// There is no STATX_DEV; we assume that Stat will return it if it's
// available regardless of mask.
Mask: 0,
@@ -480,7 +680,7 @@ func (fd *FileDescription) DeviceID() uint64 {
// InodeID implements memmap.MappingIdentity.InodeID.
func (fd *FileDescription) InodeID() uint64 {
- stat, err := fd.impl.Stat(context.Background(), StatOptions{
+ stat, err := fd.Stat(context.Background(), StatOptions{
Mask: linux.STATX_INO,
// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
Sync: linux.AT_STATX_DONT_SYNC,
@@ -493,5 +693,5 @@ func (fd *FileDescription) InodeID() uint64 {
// Msync implements memmap.MappingIdentity.Msync.
func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
- return fd.impl.Sync(ctx)
+ return fd.Sync(ctx)
}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 3df49991c..c2a52ec1b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -17,14 +17,15 @@ package vfs
import (
"bytes"
"io"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
return syserror.ENOTSUP
}
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+ return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+ return syserror.EBADF
+}
+
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
@@ -177,6 +198,48 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
return 0, syserror.EISDIR
}
+// DentryMetadataFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is
+// true to obtain implementations of Stat and SetStat that panic.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+ panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
+// DynamicBytesSource represents a data source for a
+// DynamicBytesFileDescriptionImpl.
+type DynamicBytesSource interface {
+ // Generate writes the file's contents to buf.
+ Generate(ctx context.Context, buf *bytes.Buffer) error
+}
+
+// StaticData implements DynamicBytesSource over a static string.
+type StaticData struct {
+ Data string
+}
+
+// Generate implements DynamicBytesSource.
+func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(s.Data)
+ return nil
+}
+
+// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the
+// underlying source.
+type WritableDynamicBytesSource interface {
+ DynamicBytesSource
+
+ // Write sends writes to the source.
+ Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error)
+}
+
// DynamicBytesFileDescriptionImpl may be embedded by implementations of
// FileDescriptionImpl that represent read-only regular files whose contents
// are backed by a bytes.Buffer that is regenerated when necessary, consistent
@@ -192,13 +255,6 @@ type DynamicBytesFileDescriptionImpl struct {
lastRead int64 // offset at which the last Read, PRead, or Seek ended
}
-// DynamicBytesSource represents a data source for a
-// DynamicBytesFileDescriptionImpl.
-type DynamicBytesSource interface {
- // Generate writes the file's contents to buf.
- Generate(ctx context.Context, buf *bytes.Buffer) error
-}
-
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -278,6 +334,43 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
return offset, nil
}
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ writable, ok := fd.data.(WritableDynamicBytesSource)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ n, err := writable.Write(ctx, src, offset)
+ if err != nil {
+ return 0, err
+ }
+
+ // Invalidate cached data that might exist prior to this call.
+ fd.buf.Reset()
+ return n, nil
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, offset, opts)
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
// GenericConfigureMMap may be used by most implementations of
// FileDescriptionImpl.ConfigureMMap.
func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 678be07fe..8fa26418e 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -22,11 +22,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// fileDescription is the common fd struct which a filesystem implementation
@@ -36,68 +35,80 @@ type fileDescription struct {
FileDescriptionDefaultImpl
}
-// genCountFD is a read-only FileDescriptionImpl representing a regular file
-// that contains the number of times its DynamicBytesSource.Generate()
+// genCount contains the number of times its DynamicBytesSource.Generate()
// implementation has been called.
-type genCountFD struct {
- fileDescription
- DynamicBytesFileDescriptionImpl
-
+type genCount struct {
count uint64 // accessed using atomic memory ops
}
-func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
- var fd genCountFD
- fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
- fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
- return &fd.vfsfd
+// Generate implements DynamicBytesSource.Generate.
+func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1))
+ return nil
}
-// Release implements FileDescriptionImpl.Release.
-func (fd *genCountFD) Release() {
+type storeData struct {
+ data string
+}
+
+var _ WritableDynamicBytesSource = (*storeData)(nil)
+
+// Generate implements DynamicBytesSource.
+func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(d.data)
+ return nil
}
-// StatusFlags implements FileDescriptionImpl.StatusFlags.
-func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
+// Generate implements WritableDynamicBytesSource.
+func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ buf := make([]byte, src.NumBytes())
+ n, err := src.CopyIn(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+
+ d.data = string(buf[:n])
return 0, nil
}
-// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
-func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
- return syserror.EPERM
+// testFD is a read-only FileDescriptionImpl representing a regular file.
+type testFD struct {
+ fileDescription
+ DynamicBytesFileDescriptionImpl
+
+ data DynamicBytesSource
+}
+
+func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef()
+ var fd testFD
+ fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
+ fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
+ return &fd.vfsfd
}
+// Release implements FileDescriptionImpl.Release.
+func (fd *testFD) Release() {
+}
+
+// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
// Stat implements FileDescriptionImpl.Stat.
-func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
// Note that Statx.Mask == 0 in the return value.
return linux.Statx{}, nil
}
// SetStat implements FileDescriptionImpl.SetStat.
-func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
return syserror.EPERM
}
-// Generate implements DynamicBytesSource.Generate.
-func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
- return nil
-}
-
func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
- creds := auth.CredentialsFromContext(ctx)
vfsObj := New() // vfs.New()
- vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
- if err != nil {
- t.Fatalf("failed to create testfs root mount: %v", err)
- }
- vd := mntns.Root()
- defer vd.DecRef()
-
- fd := newGenCountFD(vd.Mount(), vd.Dentry())
+ fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
defer fd.DecRef()
// The first read causes Generate to be called to fill the FD's buffer.
@@ -138,4 +149,69 @@ func TestGenCountFD(t *testing.T) {
if want := byte('3'); buf[0] != want {
t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
}
+
+ // Write and PWrite fails.
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+ if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+}
+
+func TestWritable(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ vfsObj := New() // vfs.New()
+ fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
+ defer fd.DecRef()
+
+ buf := make([]byte, 10)
+ ioseq := usermem.BytesIOSequence(buf)
+ if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF {
+ t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err)
+ }
+ if want := "init"; want == string(buf) {
+ t.Fatalf("Read: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test PWrite.
+ want := "write"
+ writeIOSeq := usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test Seek to 0 followed by Write.
+ want = "write2"
+ writeIOSeq = usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test failure if offset != 0.
+ if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
+ if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 89bd58864..a06a6caf3 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,8 +18,8 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
)
// A Filesystem is a tree of nodes represented by Dentries, which forms part of
@@ -418,17 +418,38 @@ type FilesystemImpl interface {
UnlinkAt(ctx context.Context, rp *ResolvingPath) error
// ListxattrAt returns all extended attribute names for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // ListxattrAt returns nil. (See FileDescription.Listxattr for an
+ // explanation.)
ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
// GetxattrAt returns the value associated with the given extended
// attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, GetxattrAt
+ // returns ENOTSUP.
GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
// SetxattrAt changes the value associated with the given extended
// attribute for the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem, SetxattrAt
+ // returns ENOTSUP.
SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
// RemovexattrAt removes the given extended attribute from the file at rp.
+ //
+ // Errors:
+ //
+ // - If extended attributes are not supported by the filesystem,
+ // RemovexattrAt returns ENOTSUP.
RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
// PrependPath prepends a path from vd to vd.Mount().Root() to b.
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c335e206d..c58b70728 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,9 +15,10 @@
package vfs
import (
+ "bytes"
"fmt"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
@@ -43,28 +44,70 @@ type GetFilesystemOptions struct {
InternalData interface{}
}
+type registeredFilesystemType struct {
+ fsType FilesystemType
+ opts RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystem().
+type RegisterFilesystemTypeOptions struct {
+ // If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+ // for which MountOptions.InternalMount == false to use this filesystem
+ // type.
+ AllowUserMount bool
+
+ // If AllowUserList is true, make this filesystem type visible in
+ // /proc/filesystems.
+ AllowUserList bool
+
+ // If RequiresDevice is true, indicate that mounting this filesystem
+ // requires a block device as the mount source in /proc/filesystems.
+ RequiresDevice bool
+}
+
// RegisterFilesystemType registers the given FilesystemType in vfs with the
// given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
vfs.fsTypesMu.Lock()
defer vfs.fsTypesMu.Unlock()
if existing, ok := vfs.fsTypes[name]; ok {
- return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+ return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+ }
+ vfs.fsTypes[name] = &registeredFilesystemType{
+ fsType: fsType,
+ opts: *opts,
}
- vfs.fsTypes[name] = fsType
return nil
}
// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
// panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
- if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+ if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
}
}
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
vfs.fsTypesMu.RLock()
defer vfs.fsTypesMu.RUnlock()
return vfs.fsTypes[name]
}
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+ vfs.fsTypesMu.RLock()
+ defer vfs.fsTypesMu.RUnlock()
+ for name, rft := range vfs.fsTypes {
+ if !rft.opts.AllowUserList {
+ continue
+ }
+ var nodev string
+ if !rft.opts.RequiresDevice {
+ nodev = "nodev"
+ }
+ fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+ }
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ec23ab0dd..d39528051 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -19,7 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -112,11 +112,11 @@ type MountNamespace struct {
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
return nil, syserror.ENODEV
}
- fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
if err != nil {
return nil, err
}
@@ -136,11 +136,14 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
// MountAt creates and mounts a Filesystem configured by the given arguments.
func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
- fsType := vfs.getFilesystemType(fsTypeName)
- if fsType == nil {
+ rft := vfs.getFilesystemType(fsTypeName)
+ if rft == nil {
return syserror.ENODEV
}
- fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
+ if !opts.InternalMount && !rft.opts.AllowUserMount {
+ return syserror.ENODEV
+ }
+ fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
if err != nil {
return err
}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index adff0b94b..3b933468d 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -17,8 +17,9 @@ package vfs
import (
"fmt"
"runtime"
- "sync"
"testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
)
func TestMountTableLookupEmpty(t *testing.T) {
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index ab13fa461..bd90d36c4 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -26,7 +26,7 @@ import (
"sync/atomic"
"unsafe"
- "gvisor.dev/gvisor/pkg/syncutil"
+ "gvisor.dev/gvisor/pkg/sync"
)
// mountKey represents the location at which a Mount is mounted. It is
@@ -75,7 +75,7 @@ type mountTable struct {
// intrinsics and inline assembly, limiting the performance of this
// approach.)
- seq syncutil.SeqCount
+ seq sync.SeqCount
seed uint32 // for hashing keys
// size holds both length (number of elements) and capacity (number of
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 87d2b0d1c..b7774bf28 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -50,6 +50,10 @@ type MknodOptions struct {
type MountOptions struct {
// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
GetFilesystemOptions GetFilesystemOptions
+
+ // If InternalMount is true, allow the use of filesystem types for which
+ // RegisterFilesystemTypeOptions.AllowUserMount == false.
+ InternalMount bool
}
// OpenOptions contains options to VirtualFilesystem.OpenAt() and
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index 8e155654f..b318c681a 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -15,10 +15,9 @@
package vfs
import (
- "sync"
-
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
index f1edb0680..f664581f4 100644
--- a/pkg/sentry/vfs/permissions.go
+++ b/pkg/sentry/vfs/permissions.go
@@ -30,6 +30,26 @@ const (
MayExec = 1
)
+// OnlyRead returns true if access _only_ allows read.
+func (a AccessTypes) OnlyRead() bool {
+ return a == MayRead
+}
+
+// MayRead returns true if access allows read.
+func (a AccessTypes) MayRead() bool {
+ return a&MayRead != 0
+}
+
+// MayWrite returns true if access allows write.
+func (a AccessTypes) MayWrite() bool {
+ return a&MayWrite != 0
+}
+
+// MayExec returns true if access allows exec.
+func (a AccessTypes) MayExec() bool {
+ return a&MayExec != 0
+}
+
// GenericCheckPermissions checks that creds has the given access rights on a
// file with the given permissions, UID, and GID, subject to the rules of
// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
@@ -53,7 +73,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
}
// CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
// directories, and read arbitrary non-directory files.
- if (isDir && (ats&MayWrite == 0)) || ats == MayRead {
+ if (isDir && !ats.MayWrite()) || ats.OnlyRead() {
if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
return nil
}
@@ -61,7 +81,7 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
// access to non-directory files, and execute access to non-directory files
// for which at least one execute bit is set.
- if isDir || (ats&MayExec == 0) || (mode&0111 != 0) {
+ if isDir || !ats.MayExec() || (mode&0111 != 0) {
if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
return nil
}
@@ -74,14 +94,13 @@ func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir boo
// the set of accesses permitted for the opened file:
//
// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
-// mutates the file), but does not permit the opened to write to the file
+// mutates the file), but does not permit writing to the open file description
// thereafter.
//
// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
// flags to mean: check for read and write permission on the file and return a
// file descriptor that can't be used for reading or writing." - open(2). Thus
-// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but
-// filesystems are responsible for ensuring that access is denied.
+// AccessTypesForOpenFlags returns MayRead|MayWrite in this case.
//
// Use May{Read,Write}FileWithOpenFlags() for these checks instead.
func AccessTypesForOpenFlags(flags uint32) AccessTypes {
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index f0641d314..8a0b382f6 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -16,11 +16,11 @@ package vfs
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
deleted file mode 100644
index ee5c8b9e2..000000000
--- a/pkg/sentry/vfs/testutil.go
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
-// for which all FilesystemImpl methods taking a path return EPERM. It is used
-// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
-// not depend on their originating Filesystem.
-type FDTestFilesystemType struct{}
-
-// FDTestFilesystem is a test-only FilesystemImpl produced by
-// FDTestFilesystemType.
-type FDTestFilesystem struct {
- vfsfs Filesystem
-}
-
-// GetFilesystem implements FilesystemType.GetFilesystem.
-func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
- var fs FDTestFilesystem
- fs.vfsfs.Init(vfsObj, &fs)
- return &fs.vfsfs, fs.NewDentry(), nil
-}
-
-// Release implements FilesystemImpl.Release.
-func (fs *FDTestFilesystem) Release() {
-}
-
-// Sync implements FilesystemImpl.Sync.
-func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
- return nil
-}
-
-// GetDentryAt implements FilesystemImpl.GetDentryAt.
-func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
- return nil, syserror.EPERM
-}
-
-// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
-func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
- return nil, syserror.EPERM
-}
-
-// LinkAt implements FilesystemImpl.LinkAt.
-func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
- return syserror.EPERM
-}
-
-// MkdirAt implements FilesystemImpl.MkdirAt.
-func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
- return syserror.EPERM
-}
-
-// MknodAt implements FilesystemImpl.MknodAt.
-func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
- return syserror.EPERM
-}
-
-// OpenAt implements FilesystemImpl.OpenAt.
-func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
- return nil, syserror.EPERM
-}
-
-// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
-func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
- return "", syserror.EPERM
-}
-
-// RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
- return syserror.EPERM
-}
-
-// RmdirAt implements FilesystemImpl.RmdirAt.
-func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-// SetStatAt implements FilesystemImpl.SetStatAt.
-func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
- return syserror.EPERM
-}
-
-// StatAt implements FilesystemImpl.StatAt.
-func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
- return linux.Statx{}, syserror.EPERM
-}
-
-// StatFSAt implements FilesystemImpl.StatFSAt.
-func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.EPERM
-}
-
-// SymlinkAt implements FilesystemImpl.SymlinkAt.
-func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
- return syserror.EPERM
-}
-
-// UnlinkAt implements FilesystemImpl.UnlinkAt.
-func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
- return nil, syserror.EPERM
-}
-
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
- return "", syserror.EPERM
-}
-
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
- return syserror.EPERM
-}
-
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
- return syserror.EPERM
-}
-
-// PrependPath implements FilesystemImpl.PrependPath.
-func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
- b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
- return PrependPathSyntheticError{}
-}
-
-type fdTestDentry struct {
- vfsd Dentry
-}
-
-// NewDentry returns a new Dentry.
-func (fs *FDTestFilesystem) NewDentry() *Dentry {
- var d fdTestDentry
- d.vfsd.Init(&d)
- return &d.vfsd
-}
-
-// IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef() {
-}
-
-// TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef() bool {
- return true
-}
-
-// DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef() {
-}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 3e4df8558..908c69f91 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,25 +16,29 @@
//
// Lock order:
//
-// FilesystemImpl/FileDescriptionImpl locks
-// VirtualFilesystem.mountMu
-// Dentry.mu
-// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
-// VirtualFilesystem.filesystemsMu
+// EpollInstance.interestMu
+// FileDescription.epollMu
+// FilesystemImpl/FileDescriptionImpl locks
+// VirtualFilesystem.mountMu
+// Dentry.mu
+// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+// VirtualFilesystem.filesystemsMu
+// EpollInstance.mu
// VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
-// VirtualFilesystem.mountMu.
+// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
+// EpollInstances requires holding epollCycleMu.
package vfs
import (
"fmt"
- "sync"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -75,25 +79,65 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
+ // anonMount is a Mount, not included in mounts or mountpoints,
+ // representing an anonFilesystem. anonMount is used to back
+ // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+ // anonMount is immutable.
+ //
+ // anonMount is analogous to Linux's anon_inode_mnt.
+ anonMount *Mount
+
+ // devices contains all registered Devices. devices is protected by
+ // devicesMu.
+ devicesMu sync.RWMutex
+ devices map[devTuple]*registeredDevice
+
+ // anonBlockDevMinor contains all allocated anonymous block device minor
+ // numbers. anonBlockDevMinorNext is a lower bound for the smallest
+ // unallocated anonymous block device number. anonBlockDevMinorNext and
+ // anonBlockDevMinor are protected by anonBlockDevMinorMu.
+ anonBlockDevMinorMu sync.Mutex
+ anonBlockDevMinorNext uint32
+ anonBlockDevMinor map[uint32]struct{}
+
+ // fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+ // fsTypesMu.
+ fsTypesMu sync.RWMutex
+ fsTypes map[string]*registeredFilesystemType
+
// filesystems contains all Filesystems. filesystems is protected by
// filesystemsMu.
filesystemsMu sync.Mutex
filesystems map[*Filesystem]struct{}
-
- // fsTypes contains all FilesystemTypes that are usable in the
- // VirtualFilesystem. fsTypes is protected by fsTypesMu.
- fsTypesMu sync.RWMutex
- fsTypes map[string]FilesystemType
}
// New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
func New() *VirtualFilesystem {
vfs := &VirtualFilesystem{
- mountpoints: make(map[*Dentry]map[*Mount]struct{}),
- filesystems: make(map[*Filesystem]struct{}),
- fsTypes: make(map[string]FilesystemType),
+ mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+ devices: make(map[devTuple]*registeredDevice),
+ anonBlockDevMinorNext: 1,
+ anonBlockDevMinor: make(map[uint32]struct{}),
+ fsTypes: make(map[string]*registeredFilesystemType),
+ filesystems: make(map[*Filesystem]struct{}),
}
vfs.mounts.Init()
+
+ // Construct vfs.anonMount.
+ anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
+ if err != nil {
+ panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+ }
+ anonfs := anonFilesystem{
+ devMinor: anonfsDevMinor,
+ }
+ anonfs.vfsfs.Init(vfs, &anonfs)
+ vfs.anonMount = &Mount{
+ vfs: vfs,
+ fs: &anonfs.vfsfs,
+ refs: 1,
+ }
+
return vfs
}