summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/vfs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/vfs')
-rw-r--r--pkg/sentry/vfs/BUILD37
-rw-r--r--pkg/sentry/vfs/anonfs.go259
-rw-r--r--pkg/sentry/vfs/context.go2
-rw-r--r--pkg/sentry/vfs/device.go2
-rw-r--r--pkg/sentry/vfs/epoll.go377
-rw-r--r--pkg/sentry/vfs/file_description.go63
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go101
-rw-r--r--pkg/sentry/vfs/file_description_impl_util_test.go156
-rw-r--r--pkg/sentry/vfs/filesystem.go2
-rw-r--r--pkg/sentry/vfs/filesystem_type.go2
-rw-r--r--pkg/sentry/vfs/lock/BUILD13
-rw-r--r--pkg/sentry/vfs/lock/lock.go72
-rw-r--r--pkg/sentry/vfs/mount.go5
-rw-r--r--pkg/sentry/vfs/pathname.go2
-rw-r--r--pkg/sentry/vfs/testutil.go173
-rw-r--r--pkg/sentry/vfs/vfs.go42
16 files changed, 1049 insertions, 259 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 35c7be259..14b39eb9d 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -1,15 +1,30 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
-package(licenses = ["notice"])
+licenses(["notice"])
+
+go_template_instance(
+ name = "epoll_interest_list",
+ out = "epoll_interest_list.go",
+ package = "vfs",
+ prefix = "epollInterest",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*epollInterest",
+ "Linker": "*epollInterest",
+ },
+)
go_library(
name = "vfs",
srcs = [
+ "anonfs.go",
"context.go",
"debug.go",
"dentry.go",
"device.go",
+ "epoll.go",
+ "epoll_interest_list.go",
"file_description.go",
"file_description_impl_util.go",
"filesystem.go",
@@ -21,21 +36,20 @@ go_library(
"pathname.go",
"permissions.go",
"resolving_path.go",
- "testutil.go",
"vfs.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/context",
"//pkg/fspath",
"//pkg/sentry/arch",
- "//pkg/sentry/context",
+ "//pkg/sentry/fs/lock",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
- "//pkg/sentry/usermem",
"//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
"//pkg/waiter",
],
)
@@ -47,14 +61,13 @@ go_test(
"file_description_impl_util_test.go",
"mount_test.go",
],
- embed = [":vfs"],
+ library = ":vfs",
deps = [
"//pkg/abi/linux",
- "//pkg/sentry/context",
- "//pkg/sentry/context/contexttest",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/usermem",
+ "//pkg/context",
+ "//pkg/sentry/contexttest",
"//pkg/sync",
"//pkg/syserror",
+ "//pkg/usermem",
],
)
diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go
new file mode 100644
index 000000000..2db25be49
--- /dev/null
+++ b/pkg/sentry/vfs/anonfs.go
@@ -0,0 +1,259 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
+)
+
+// NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name,
+// consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References
+// are taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry {
+ d := anonDentry{
+ name: name,
+ }
+ d.vfsd.Init(&d)
+ vfs.anonMount.IncRef()
+ // anonDentry no-ops refcounting.
+ return VirtualDentry{
+ mount: vfs.anonMount,
+ dentry: &d.vfsd,
+ }
+}
+
+const anonfsBlockSize = usermem.PageSize // via fs/libfs.c:pseudo_fs_fill_super()
+
+// anonFilesystem is the implementation of FilesystemImpl that backs
+// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+//
+// Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl
+// methods that would require an anonDentry to be a directory return ENOTDIR.
+type anonFilesystem struct {
+ vfsfs Filesystem
+
+ devMinor uint32
+}
+
+type anonDentry struct {
+ vfsd Dentry
+
+ name string
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *anonFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *anonFilesystem) Sync(ctx context.Context) error {
+ return nil
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ if opts.CheckSearchable {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+ if !rp.Final() {
+ return nil, syserror.ENOTDIR
+ }
+ // anonDentry no-ops refcounting.
+ return rp.Start(), nil
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, syserror.ENODEV
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.EINVAL
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ // Linux actually permits anon_inode_inode's metadata to be set, which is
+ // visible to all users of anon_inode_inode. We just silently ignore
+ // metadata changes.
+ return nil
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+ if !rp.Done() {
+ return linux.Statx{}, syserror.ENOTDIR
+ }
+ // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode().
+ return linux.Statx{
+ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+ Blksize: anonfsBlockSize,
+ Nlink: 1,
+ UID: uint32(auth.RootKUID),
+ GID: uint32(auth.RootKGID),
+ Mode: 0600, // no type is correct
+ Ino: 1,
+ Size: 0,
+ Blocks: 0,
+ DevMajor: 0,
+ DevMinor: fs.devMinor,
+ }, nil
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+ if !rp.Done() {
+ return linux.Statfs{}, syserror.ENOTDIR
+ }
+ return linux.Statfs{
+ Type: linux.ANON_INODE_FS_MAGIC,
+ BlockSize: anonfsBlockSize,
+ }, nil
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+ if !rp.Final() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *anonFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+ if !rp.Done() {
+ return nil, syserror.ENOTDIR
+ }
+ return nil, nil
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *anonFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+ if !rp.Done() {
+ return "", syserror.ENOTDIR
+ }
+ return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *anonFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *anonFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+ if !rp.Done() {
+ return syserror.ENOTDIR
+ }
+ return syserror.EPERM
+}
+
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+ b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name))
+ return PrependPathSyntheticError{}
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *anonDentry) IncRef() {
+ // no-op
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *anonDentry) TryIncRef() bool {
+ return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *anonDentry) DecRef() {
+ // no-op
+}
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
index 705194ebc..d97362b9a 100644
--- a/pkg/sentry/vfs/context.go
+++ b/pkg/sentry/vfs/context.go
@@ -15,7 +15,7 @@
package vfs
import (
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
)
// contextID is this package's type for context.Context.Value keys.
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
index 9f9d6e783..3af2aa58d 100644
--- a/pkg/sentry/vfs/device.go
+++ b/pkg/sentry/vfs/device.go
@@ -17,7 +17,7 @@ package vfs
import (
"fmt"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go
new file mode 100644
index 000000000..7c83f9a5a
--- /dev/null
+++ b/pkg/sentry/vfs/epoll.go
@@ -0,0 +1,377 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// epollCycleMu serializes attempts to register EpollInstances with other
+// EpollInstances in order to check for cycles.
+var epollCycleMu sync.Mutex
+
+// EpollInstance represents an epoll instance, as described by epoll(7).
+type EpollInstance struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+ DentryMetadataFileDescriptionImpl
+
+ // q holds waiters on this EpollInstance.
+ q waiter.Queue
+
+ // interest is the set of file descriptors that are registered with the
+ // EpollInstance for monitoring. interest is protected by interestMu.
+ interestMu sync.Mutex
+ interest map[epollInterestKey]*epollInterest
+
+ // mu protects fields in registered epollInterests.
+ mu sync.Mutex
+
+ // ready is the set of file descriptors that may be "ready" for I/O. Note
+ // that this must be an ordered list, not a map: "If more than maxevents
+ // file descriptors are ready when epoll_wait() is called, then successive
+ // epoll_wait() calls will round robin through the set of ready file
+ // descriptors. This behavior helps avoid starvation scenarios, where a
+ // process fails to notice that additional file descriptors are ready
+ // because it focuses on a set of file descriptors that are already known
+ // to be ready." - epoll_wait(2)
+ ready epollInterestList
+}
+
+type epollInterestKey struct {
+ // file is the registered FileDescription. No reference is held on file;
+ // instead, when the last reference is dropped, FileDescription.DecRef()
+ // removes the FileDescription from all EpollInstances. file is immutable.
+ file *FileDescription
+
+ // num is the file descriptor number with which this entry was registered.
+ // num is immutable.
+ num int32
+}
+
+// epollInterest represents an EpollInstance's interest in a file descriptor.
+type epollInterest struct {
+ // epoll is the owning EpollInstance. epoll is immutable.
+ epoll *EpollInstance
+
+ // key is the file to which this epollInterest applies. key is immutable.
+ key epollInterestKey
+
+ // waiter is registered with key.file. entry is protected by epoll.mu.
+ waiter waiter.Entry
+
+ // mask is the event mask associated with this registration, including
+ // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
+ mask uint32
+
+ // ready is true if epollInterestEntry is linked into epoll.ready. ready
+ // and epollInterestEntry are protected by epoll.mu.
+ ready bool
+ epollInterestEntry
+
+ // userData is the epoll_data_t associated with this epollInterest.
+ // userData is protected by epoll.mu.
+ userData [2]int32
+}
+
+// NewEpollInstanceFD returns a FileDescription representing a new epoll
+// instance. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) {
+ vd := vfs.NewAnonVirtualDentry("[eventpoll]")
+ defer vd.DecRef()
+ ep := &EpollInstance{
+ interest: make(map[epollInterestKey]*epollInterest),
+ }
+ if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
+ UseDentryMetadata: true,
+ }); err != nil {
+ return nil, err
+ }
+ return &ep.vfsfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (ep *EpollInstance) Release() {
+ // Unregister all polled fds.
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key, epi := range ep.interest {
+ file := key.file
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+ file.EventUnregister(&epi.waiter)
+ }
+ ep.interest = nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if mask&waiter.EventIn == 0 {
+ return 0
+ }
+ ep.mu.Lock()
+ for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ if epi.key.file.Readiness(wmask)&wmask != 0 {
+ ep.mu.Unlock()
+ return waiter.EventIn
+ }
+ }
+ ep.mu.Unlock()
+ return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ ep.q.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
+ ep.q.EventUnregister(e)
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
+ return 0, nil
+}
+
+// AddInterest implements the semantics of EPOLL_CTL_ADD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+ // Check for cyclic polling if necessary.
+ subep, _ := file.impl.(*EpollInstance)
+ if subep != nil {
+ epollCycleMu.Lock()
+ // epollCycleMu must be locked for the rest of AddInterest to ensure
+ // that cyclic polling is not introduced after the check.
+ defer epollCycleMu.Unlock()
+ if subep.mightPoll(ep) {
+ return syserror.ELOOP
+ }
+ }
+
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is already registered.
+ key := epollInterestKey{
+ file: file,
+ num: num,
+ }
+ if _, ok := ep.interest[key]; ok {
+ return syserror.EEXIST
+ }
+
+ // Register interest in file.
+ mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ epi := &epollInterest{
+ epoll: ep,
+ key: key,
+ mask: mask,
+ userData: userData,
+ }
+ ep.interest[key] = epi
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ // Add epi to file.epolls so that it is removed when the last
+ // FileDescription reference is dropped.
+ file.epollMu.Lock()
+ file.epolls[epi] = struct{}{}
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
+ return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
+}
+
+func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+ for key := range ep.interest {
+ nextep, ok := key.file.impl.(*EpollInstance)
+ if !ok {
+ continue
+ }
+ if nextep == ep2 {
+ return true
+ }
+ if remainingRecursion == 0 {
+ return true
+ }
+ if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
+ return true
+ }
+ }
+ return false
+}
+
+// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, mask uint32, userData [2]int32) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Update epi for the next call to ep.ReadEvents().
+ ep.mu.Lock()
+ epi.mask = mask
+ epi.userData = userData
+ ep.mu.Unlock()
+
+ // Re-register with the new mask.
+ mask |= linux.EPOLLERR | linux.EPOLLRDHUP
+ file.EventUnregister(&epi.waiter)
+ wmask := waiter.EventMaskFromLinux(mask)
+ file.EventRegister(&epi.waiter, wmask)
+
+ // Check if the file is already ready with the new mask.
+ if file.Readiness(wmask)&wmask != 0 {
+ epi.Callback(nil)
+ }
+
+ return nil
+}
+
+// DeleteInterest implements the semantics of EPOLL_CTL_DEL.
+//
+// Preconditions: A reference must be held on file.
+func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
+ ep.interestMu.Lock()
+ defer ep.interestMu.Unlock()
+
+ // Fail if the key is not already registered.
+ epi, ok := ep.interest[epollInterestKey{
+ file: file,
+ num: num,
+ }]
+ if !ok {
+ return syserror.ENOENT
+ }
+
+ // Unregister from the file so that epi will no longer be readied.
+ file.EventUnregister(&epi.waiter)
+
+ // Forget about epi.
+ ep.removeLocked(epi)
+
+ file.epollMu.Lock()
+ delete(file.epolls, epi)
+ file.epollMu.Unlock()
+
+ return nil
+}
+
+// Callback implements waiter.EntryCallback.Callback.
+func (epi *epollInterest) Callback(*waiter.Entry) {
+ newReady := false
+ epi.epoll.mu.Lock()
+ if !epi.ready {
+ newReady = true
+ epi.ready = true
+ epi.epoll.ready.PushBack(epi)
+ }
+ epi.epoll.mu.Unlock()
+ if newReady {
+ epi.epoll.q.Notify(waiter.EventIn)
+ }
+}
+
+// Preconditions: ep.interestMu must be locked.
+func (ep *EpollInstance) removeLocked(epi *epollInterest) {
+ delete(ep.interest, epi.key)
+ ep.mu.Lock()
+ if epi.ready {
+ epi.ready = false
+ ep.ready.Remove(epi)
+ }
+ ep.mu.Unlock()
+}
+
+// ReadEvents reads up to len(events) ready events into events and returns the
+// number of events read.
+//
+// Preconditions: len(events) != 0.
+func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int {
+ i := 0
+ // Hot path: avoid defer.
+ ep.mu.Lock()
+ var next *epollInterest
+ var requeue epollInterestList
+ for epi := ep.ready.Front(); epi != nil; epi = next {
+ next = epi.Next()
+ // Regardless of what else happens, epi is initially removed from the
+ // ready list.
+ ep.ready.Remove(epi)
+ wmask := waiter.EventMaskFromLinux(epi.mask)
+ ievents := epi.key.file.Readiness(wmask) & wmask
+ if ievents == 0 {
+ // Leave epi off the ready list.
+ epi.ready = false
+ continue
+ }
+ // Determine what we should do with epi.
+ switch {
+ case epi.mask&linux.EPOLLONESHOT != 0:
+ // Clear all events from the mask; they must be re-added by
+ // EPOLL_CTL_MOD.
+ epi.mask &= linux.EP_PRIVATE_BITS
+ fallthrough
+ case epi.mask&linux.EPOLLET != 0:
+ // Leave epi off the ready list.
+ epi.ready = false
+ default:
+ // Queue epi to be moved to the end of the ready list.
+ requeue.PushBack(epi)
+ }
+ // Report ievents.
+ events[i] = linux.EpollEvent{
+ Events: ievents.ToLinux(),
+ Fd: epi.userData[0],
+ Data: epi.userData[1],
+ }
+ i++
+ if i == len(events) {
+ break
+ }
+ }
+ ep.ready.PushBackList(&requeue)
+ ep.mu.Unlock()
+ return i
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 51c95c2d9..5bac660c7 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -18,12 +18,14 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -45,6 +47,11 @@ type FileDescription struct {
// memory operations.
statusFlags uint32
+ // epolls is the set of epollInterests registered for this FileDescription.
+ // epolls is protected by epollMu.
+ epollMu sync.Mutex
+ epolls map[*epollInterest]struct{}
+
// vd is the filesystem location at which this FileDescription was opened.
// A reference is held on vd. vd is immutable.
vd VirtualDentry
@@ -141,6 +148,23 @@ func (fd *FileDescription) TryIncRef() bool {
// DecRef decrements fd's reference count.
func (fd *FileDescription) DecRef() {
if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+ // Unregister fd from all epoll instances.
+ fd.epollMu.Lock()
+ epolls := fd.epolls
+ fd.epolls = nil
+ fd.epollMu.Unlock()
+ for epi := range epolls {
+ ep := epi.epoll
+ ep.interestMu.Lock()
+ // Check that epi has not been concurrently unregistered by
+ // EpollInstance.DeleteInterest() or EpollInstance.Release().
+ if _, ok := ep.interest[epi.key]; ok {
+ fd.EventUnregister(&epi.waiter)
+ ep.removeLocked(epi)
+ }
+ ep.interestMu.Unlock()
+ }
+ // Release implementation resources.
fd.impl.Release()
if fd.writable {
fd.vd.mount.EndWrite()
@@ -370,7 +394,25 @@ type FileDescriptionImpl interface {
// Removexattr removes the given extended attribute from the file.
Removexattr(ctx context.Context, name string) error
- // TODO: file locking
+ // LockBSD tries to acquire a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
+
+ // LockBSD releases a BSD-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): BSD-style file locking
+ UnlockBSD(ctx context.Context, uid lock.UniqueID) error
+
+ // LockPOSIX tries to acquire a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error
+
+ // UnlockPOSIX releases a POSIX-style advisory file lock.
+ //
+ // TODO(gvisor.dev/issue/1480): POSIX-style file locking
+ UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error
}
// Dirent holds the information contained in struct linux_dirent64.
@@ -453,6 +495,21 @@ func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
return fd.impl.StatFS(ctx)
}
+// Readiness returns fd's I/O readiness.
+func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return fd.impl.Readiness(mask)
+}
+
+// EventRegister registers e for I/O readiness events in mask.
+func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ fd.impl.EventRegister(e, mask)
+}
+
+// EventUnregister unregisters e for I/O readiness events.
+func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
+ fd.impl.EventUnregister(e)
+}
+
// PRead reads from the file represented by fd into dst, starting at the given
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index c00b3c84b..c2a52ec1b 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -19,12 +19,13 @@ import (
"io"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/arch"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/memmap"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
@@ -152,6 +153,26 @@ func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string)
return syserror.ENOTSUP
}
+// LockBSD implements FileDescriptionImpl.LockBSD.
+func (FileDescriptionDefaultImpl) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockBSD implements FileDescriptionImpl.UnlockBSD.
+func (FileDescriptionDefaultImpl) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
+ return syserror.EBADF
+}
+
+// LockPOSIX implements FileDescriptionImpl.LockPOSIX.
+func (FileDescriptionDefaultImpl) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
+ return syserror.EBADF
+}
+
+// UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX.
+func (FileDescriptionDefaultImpl) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
+ return syserror.EBADF
+}
+
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
// implementations of non-directory I/O methods that return EISDIR.
@@ -192,21 +213,6 @@ func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetSt
panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
}
-// DynamicBytesFileDescriptionImpl may be embedded by implementations of
-// FileDescriptionImpl that represent read-only regular files whose contents
-// are backed by a bytes.Buffer that is regenerated when necessary, consistent
-// with Linux's fs/seq_file.c:single_open().
-//
-// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
-// use.
-type DynamicBytesFileDescriptionImpl struct {
- data DynamicBytesSource // immutable
- mu sync.Mutex // protects the following fields
- buf bytes.Buffer
- off int64
- lastRead int64 // offset at which the last Read, PRead, or Seek ended
-}
-
// DynamicBytesSource represents a data source for a
// DynamicBytesFileDescriptionImpl.
type DynamicBytesSource interface {
@@ -225,6 +231,30 @@ func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error {
return nil
}
+// WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the
+// underlying source.
+type WritableDynamicBytesSource interface {
+ DynamicBytesSource
+
+ // Write sends writes to the source.
+ Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error)
+}
+
+// DynamicBytesFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl that represent read-only regular files whose contents
+// are backed by a bytes.Buffer that is regenerated when necessary, consistent
+// with Linux's fs/seq_file.c:single_open().
+//
+// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
+// use.
+type DynamicBytesFileDescriptionImpl struct {
+ data DynamicBytesSource // immutable
+ mu sync.Mutex // protects the following fields
+ buf bytes.Buffer
+ off int64
+ lastRead int64 // offset at which the last Read, PRead, or Seek ended
+}
+
// SetDataSource must be called exactly once on fd before first use.
func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
fd.data = data
@@ -304,6 +334,43 @@ func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int6
return offset, nil
}
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+ return 0, syserror.EOPNOTSUPP
+ }
+
+ writable, ok := fd.data.(WritableDynamicBytesSource)
+ if !ok {
+ return 0, syserror.EINVAL
+ }
+ n, err := writable.Write(ctx, src, offset)
+ if err != nil {
+ return 0, err
+ }
+
+ // Invalidate cached data that might exist prior to this call.
+ fd.buf.Reset()
+ return n, nil
+}
+
+// PWrite implements FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, offset, opts)
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.pwriteLocked(ctx, src, fd.off, opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
// GenericConfigureMMap may be used by most implementations of
// FileDescriptionImpl.ConfigureMMap.
func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 9ed58512f..8fa26418e 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -22,11 +22,10 @@ import (
"testing"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/context"
+ "gvisor.dev/gvisor/pkg/sentry/contexttest"
"gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/usermem"
)
// fileDescription is the common fd struct which a filesystem implementation
@@ -36,68 +35,80 @@ type fileDescription struct {
FileDescriptionDefaultImpl
}
-// genCountFD is a read-only FileDescriptionImpl representing a regular file
-// that contains the number of times its DynamicBytesSource.Generate()
+// genCount contains the number of times its DynamicBytesSource.Generate()
// implementation has been called.
-type genCountFD struct {
- fileDescription
- DynamicBytesFileDescriptionImpl
-
+type genCount struct {
count uint64 // accessed using atomic memory ops
}
-func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
- var fd genCountFD
- fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
- fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
- return &fd.vfsfd
+// Generate implements DynamicBytesSource.Generate.
+func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1))
+ return nil
}
-// Release implements FileDescriptionImpl.Release.
-func (fd *genCountFD) Release() {
+type storeData struct {
+ data string
+}
+
+var _ WritableDynamicBytesSource = (*storeData)(nil)
+
+// Generate implements DynamicBytesSource.
+func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString(d.data)
+ return nil
}
-// StatusFlags implements FileDescriptionImpl.StatusFlags.
-func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
+// Generate implements WritableDynamicBytesSource.
+func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ buf := make([]byte, src.NumBytes())
+ n, err := src.CopyIn(ctx, buf)
+ if err != nil {
+ return 0, err
+ }
+
+ d.data = string(buf[:n])
return 0, nil
}
-// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
-func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
- return syserror.EPERM
+// testFD is a read-only FileDescriptionImpl representing a regular file.
+type testFD struct {
+ fileDescription
+ DynamicBytesFileDescriptionImpl
+
+ data DynamicBytesSource
+}
+
+func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription {
+ vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+ defer vd.DecRef()
+ var fd testFD
+ fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{})
+ fd.DynamicBytesFileDescriptionImpl.SetDataSource(data)
+ return &fd.vfsfd
}
+// Release implements FileDescriptionImpl.Release.
+func (fd *testFD) Release() {
+}
+
+// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
// Stat implements FileDescriptionImpl.Stat.
-func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
// Note that Statx.Mask == 0 in the return value.
return linux.Statx{}, nil
}
// SetStat implements FileDescriptionImpl.SetStat.
-func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error {
return syserror.EPERM
}
-// Generate implements DynamicBytesSource.Generate.
-func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
- fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
- return nil
-}
-
func TestGenCountFD(t *testing.T) {
ctx := contexttest.Context(t)
- creds := auth.CredentialsFromContext(ctx)
vfsObj := New() // vfs.New()
- vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
- mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
- if err != nil {
- t.Fatalf("failed to create testfs root mount: %v", err)
- }
- vd := mntns.Root()
- defer vd.DecRef()
-
- fd := newGenCountFD(vd.Mount(), vd.Dentry())
+ fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{})
defer fd.DecRef()
// The first read causes Generate to be called to fill the FD's buffer.
@@ -138,4 +149,69 @@ func TestGenCountFD(t *testing.T) {
if want := byte('3'); buf[0] != want {
t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
}
+
+ // Write and PWrite fails.
+ if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+ if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL {
+ t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL)
+ }
+}
+
+func TestWritable(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ vfsObj := New() // vfs.New()
+ fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"})
+ defer fd.DecRef()
+
+ buf := make([]byte, 10)
+ ioseq := usermem.BytesIOSequence(buf)
+ if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF {
+ t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err)
+ }
+ if want := "init"; want == string(buf) {
+ t.Fatalf("Read: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test PWrite.
+ want := "write"
+ writeIOSeq := usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test Seek to 0 followed by Write.
+ want = "write2"
+ writeIOSeq = usermem.BytesIOSequence([]byte(want))
+ if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil {
+ t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want))
+ }
+ if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF {
+ t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want))
+ }
+ if want == string(buf) {
+ t.Fatalf("PRead: got %v, wanted %v", string(buf), want)
+ }
+
+ // Test failure if offset != 0.
+ if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil {
+ t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err)
+ }
+ if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
+ if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL {
+ t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err)
+ }
}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index ea78f555b..a06a6caf3 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,8 +18,8 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
)
// A Filesystem is a tree of nodes represented by Dentries, which forms part of
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index 023301780..c58b70728 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -18,7 +18,7 @@ import (
"bytes"
"fmt"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)
diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD
new file mode 100644
index 000000000..d9ab063b7
--- /dev/null
+++ b/pkg/sentry/vfs/lock/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "lock",
+ srcs = ["lock.go"],
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/sentry/fs/lock",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go
new file mode 100644
index 000000000..724dfe743
--- /dev/null
+++ b/pkg/sentry/vfs/lock/lock.go
@@ -0,0 +1,72 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lock provides POSIX and BSD style file locking for VFS2 file
+// implementations.
+//
+// The actual implementations can be found in the lock package under
+// sentry/fs/lock.
+package lock
+
+import (
+ fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
+// and flock(2) respectively in Linux. It can be embedded into various file
+// implementations for VFS2 that support locking.
+//
+// Note that in Linux these two types of locks are _not_ cooperative, because
+// race and deadlock conditions make merging them prohibitive. We do the same
+// and keep them oblivious to each other.
+type FileLocks struct {
+ // bsd is a set of BSD-style advisory file wide locks, see flock(2).
+ bsd fslock.Locks
+
+ // posix is a set of POSIX-style regional advisory locks, see fcntl(2).
+ posix fslock.Locks
+}
+
+// LockBSD tries to acquire a BSD-style lock on the entire file.
+func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+ if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) {
+ return nil
+ }
+ return syserror.ErrWouldBlock
+}
+
+// UnlockBSD releases a BSD-style lock on the entire file.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
+ fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF})
+}
+
+// LockPOSIX tries to acquire a POSIX-style lock on a file region.
+func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error {
+ if fl.posix.LockRegion(uid, t, rng, block) {
+ return nil
+ }
+ return syserror.ErrWouldBlock
+}
+
+// UnlockPOSIX releases a POSIX-style lock on a file region.
+//
+// This operation is always successful, even if there did not exist a lock on
+// the requested region held by uid in the first place.
+func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) {
+ fl.posix.UnlockRegion(uid, rng)
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 00177b371..1fbb420f9 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -19,7 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
@@ -423,7 +423,8 @@ func (mntns *MountNamespace) IncRef() {
}
// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
+func (mntns *MountNamespace) DecRef() {
+ vfs := mntns.root.fs.VirtualFilesystem()
if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
vfs.mountMu.Lock()
vfs.mounts.seq.BeginWrite()
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
index cf80df90e..b318c681a 100644
--- a/pkg/sentry/vfs/pathname.go
+++ b/pkg/sentry/vfs/pathname.go
@@ -15,8 +15,8 @@
package vfs
import (
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
deleted file mode 100644
index ee5c8b9e2..000000000
--- a/pkg/sentry/vfs/testutil.go
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
- "fmt"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
-// for which all FilesystemImpl methods taking a path return EPERM. It is used
-// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
-// not depend on their originating Filesystem.
-type FDTestFilesystemType struct{}
-
-// FDTestFilesystem is a test-only FilesystemImpl produced by
-// FDTestFilesystemType.
-type FDTestFilesystem struct {
- vfsfs Filesystem
-}
-
-// GetFilesystem implements FilesystemType.GetFilesystem.
-func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
- var fs FDTestFilesystem
- fs.vfsfs.Init(vfsObj, &fs)
- return &fs.vfsfs, fs.NewDentry(), nil
-}
-
-// Release implements FilesystemImpl.Release.
-func (fs *FDTestFilesystem) Release() {
-}
-
-// Sync implements FilesystemImpl.Sync.
-func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
- return nil
-}
-
-// GetDentryAt implements FilesystemImpl.GetDentryAt.
-func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
- return nil, syserror.EPERM
-}
-
-// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
-func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
- return nil, syserror.EPERM
-}
-
-// LinkAt implements FilesystemImpl.LinkAt.
-func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
- return syserror.EPERM
-}
-
-// MkdirAt implements FilesystemImpl.MkdirAt.
-func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
- return syserror.EPERM
-}
-
-// MknodAt implements FilesystemImpl.MknodAt.
-func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
- return syserror.EPERM
-}
-
-// OpenAt implements FilesystemImpl.OpenAt.
-func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
- return nil, syserror.EPERM
-}
-
-// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
-func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
- return "", syserror.EPERM
-}
-
-// RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
- return syserror.EPERM
-}
-
-// RmdirAt implements FilesystemImpl.RmdirAt.
-func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-// SetStatAt implements FilesystemImpl.SetStatAt.
-func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
- return syserror.EPERM
-}
-
-// StatAt implements FilesystemImpl.StatAt.
-func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
- return linux.Statx{}, syserror.EPERM
-}
-
-// StatFSAt implements FilesystemImpl.StatFSAt.
-func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
- return linux.Statfs{}, syserror.EPERM
-}
-
-// SymlinkAt implements FilesystemImpl.SymlinkAt.
-func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
- return syserror.EPERM
-}
-
-// UnlinkAt implements FilesystemImpl.UnlinkAt.
-func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
- return syserror.EPERM
-}
-
-// ListxattrAt implements FilesystemImpl.ListxattrAt.
-func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
- return nil, syserror.EPERM
-}
-
-// GetxattrAt implements FilesystemImpl.GetxattrAt.
-func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
- return "", syserror.EPERM
-}
-
-// SetxattrAt implements FilesystemImpl.SetxattrAt.
-func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
- return syserror.EPERM
-}
-
-// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
-func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
- return syserror.EPERM
-}
-
-// PrependPath implements FilesystemImpl.PrependPath.
-func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
- b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
- return PrependPathSyntheticError{}
-}
-
-type fdTestDentry struct {
- vfsd Dentry
-}
-
-// NewDentry returns a new Dentry.
-func (fs *FDTestFilesystem) NewDentry() *Dentry {
- var d fdTestDentry
- d.vfsd.Init(&d)
- return &d.vfsd
-}
-
-// IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef() {
-}
-
-// TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef() bool {
- return true
-}
-
-// DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef() {
-}
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 1f6f56293..908c69f91 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,23 +16,27 @@
//
// Lock order:
//
-// FilesystemImpl/FileDescriptionImpl locks
-// VirtualFilesystem.mountMu
-// Dentry.mu
-// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
-// VirtualFilesystem.filesystemsMu
+// EpollInstance.interestMu
+// FileDescription.epollMu
+// FilesystemImpl/FileDescriptionImpl locks
+// VirtualFilesystem.mountMu
+// Dentry.mu
+// Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+// VirtualFilesystem.filesystemsMu
+// EpollInstance.mu
// VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
-// VirtualFilesystem.mountMu.
+// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
+// EpollInstances requires holding epollCycleMu.
package vfs
import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
- "gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
@@ -75,6 +79,14 @@ type VirtualFilesystem struct {
// mountpoints is analogous to Linux's mountpoint_hashtable.
mountpoints map[*Dentry]map[*Mount]struct{}
+ // anonMount is a Mount, not included in mounts or mountpoints,
+ // representing an anonFilesystem. anonMount is used to back
+ // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
+ // anonMount is immutable.
+ //
+ // anonMount is analogous to Linux's anon_inode_mnt.
+ anonMount *Mount
+
// devices contains all registered Devices. devices is protected by
// devicesMu.
devicesMu sync.RWMutex
@@ -110,6 +122,22 @@ func New() *VirtualFilesystem {
filesystems: make(map[*Filesystem]struct{}),
}
vfs.mounts.Init()
+
+ // Construct vfs.anonMount.
+ anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
+ if err != nil {
+ panic(fmt.Sprintf("VirtualFilesystem.GetAnonBlockDevMinor() failed during VirtualFilesystem construction: %v", err))
+ }
+ anonfs := anonFilesystem{
+ devMinor: anonfsDevMinor,
+ }
+ anonfs.vfsfs.Init(vfs, &anonfs)
+ vfs.anonMount = &Mount{
+ vfs: vfs,
+ fs: &anonfs.vfsfs,
+ refs: 1,
+ }
+
return vfs
}