diff options
Diffstat (limited to 'pkg/sentry/vfs')
26 files changed, 211 insertions, 1030 deletions
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD deleted file mode 100644 index 14b39eb9d..000000000 --- a/pkg/sentry/vfs/BUILD +++ /dev/null @@ -1,73 +0,0 @@ -load("//tools:defs.bzl", "go_library", "go_test") -load("//tools/go_generics:defs.bzl", "go_template_instance") - -licenses(["notice"]) - -go_template_instance( - name = "epoll_interest_list", - out = "epoll_interest_list.go", - package = "vfs", - prefix = "epollInterest", - template = "//pkg/ilist:generic_list", - types = { - "Element": "*epollInterest", - "Linker": "*epollInterest", - }, -) - -go_library( - name = "vfs", - srcs = [ - "anonfs.go", - "context.go", - "debug.go", - "dentry.go", - "device.go", - "epoll.go", - "epoll_interest_list.go", - "file_description.go", - "file_description_impl_util.go", - "filesystem.go", - "filesystem_impl_util.go", - "filesystem_type.go", - "mount.go", - "mount_unsafe.go", - "options.go", - "pathname.go", - "permissions.go", - "resolving_path.go", - "vfs.go", - ], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/fspath", - "//pkg/sentry/arch", - "//pkg/sentry/fs/lock", - "//pkg/sentry/kernel/auth", - "//pkg/sentry/memmap", - "//pkg/sync", - "//pkg/syserror", - "//pkg/usermem", - "//pkg/waiter", - ], -) - -go_test( - name = "vfs_test", - size = "small", - srcs = [ - "file_description_impl_util_test.go", - "mount_test.go", - ], - library = ":vfs", - deps = [ - "//pkg/abi/linux", - "//pkg/context", - "//pkg/sentry/contexttest", - "//pkg/sync", - "//pkg/syserror", - "//pkg/usermem", - ], -) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md deleted file mode 100644 index 9aa133bcb..000000000 --- a/pkg/sentry/vfs/README.md +++ /dev/null @@ -1,197 +0,0 @@ -# The gVisor Virtual Filesystem - -THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION -USE. For the filesystem implementation currently used by gVisor, see the `fs` -package. - -## Implementation Notes - -### Reference Counting - -Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all -reference-counted. Mount and MountNamespace are exclusively VFS-managed; when -their reference count reaches zero, VFS releases their resources. Filesystem and -FileDescription management is shared between VFS and filesystem implementations; -when their reference count reaches zero, VFS notifies the implementation by -calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()` -respectively and then releases VFS-owned resources. Dentries are exclusively -managed by filesystem implementations; reference count changes are abstracted -through DentryImpl, which should release resources when reference count reaches -zero. - -Filesystem references are held by: - -- Mount: Each referenced Mount holds a reference on the mounted Filesystem. - -Dentry references are held by: - -- FileDescription: Each referenced FileDescription holds a reference on the - Dentry through which it was opened, via `FileDescription.vd.dentry`. - -- Mount: Each referenced Mount holds a reference on its mount point and on the - mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`). - -Mount references are held by: - -- FileDescription: Each referenced FileDescription holds a reference on the - Mount on which it was opened, via `FileDescription.vd.mount`. - -- Mount: Each referenced Mount holds a reference on its parent, which is the - mount containing its mount point. - -- VirtualFilesystem: A reference is held on each Mount that has not been - umounted. - -MountNamespace and FileDescription references are held by users of VFS. The -expectation is that each `kernel.Task` holds a reference on its corresponding -MountNamespace, and each file descriptor holds a reference on its represented -FileDescription. - -Notes: - -- Dentries do not hold a reference on their owning Filesystem. Instead, all - uses of a Dentry occur in the context of a Mount, which holds a reference on - the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary, - when releasing references on both a Dentry and its corresponding Mount, the - Dentry's reference must be released first (because releasing the Mount's - reference may release the last reference on the Filesystem, whose state may - be required to release the Dentry reference). - -### The Inheritance Pattern - -Filesystem, Dentry, and FileDescription are all concepts featuring both state -that must be shared between VFS and filesystem implementations, and operations -that are implementation-defined. To facilitate this, each of these three -concepts follows the same pattern, shown below for Dentry: - -```go -// Dentry represents a node in a filesystem tree. -type Dentry struct { - // VFS-required dentry state. - parent *Dentry - // ... - - // impl is the DentryImpl associated with this Dentry. impl is immutable. - // This should be the last field in Dentry. - impl DentryImpl -} - -// Init must be called before first use of d. -func (d *Dentry) Init(impl DentryImpl) { - d.impl = impl -} - -// Impl returns the DentryImpl associated with d. -func (d *Dentry) Impl() DentryImpl { - return d.impl -} - -// DentryImpl contains implementation-specific details of a Dentry. -// Implementations of DentryImpl should contain their associated Dentry by -// value as their first field. -type DentryImpl interface { - // VFS-required implementation-defined dentry operations. - IncRef() - // ... -} -``` - -This construction, which is essentially a type-safe analogue to Linux's -`container_of` pattern, has the following properties: - -- VFS works almost exclusively with pointers to Dentry rather than DentryImpl - interface objects, such as in the type of `Dentry.parent`. This avoids - interface method calls (which are somewhat expensive to perform, and defeat - inlining and escape analysis), reduces the size of VFS types (since an - interface object is two pointers in size), and allows pointers to be loaded - and stored atomically using `sync/atomic`. Implementation-defined behavior - is accessed via `Dentry.impl` when required. - -- Filesystem implementations can access the implementation-defined state - associated with objects of VFS types by type-asserting or type-switching - (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type - require only an equality comparison of the interface object's type pointer - to a static constant, and are consequently very fast. - -- Filesystem implementations can access the VFS state associated with objects - of implementation-defined types directly. - -- VFS and implementation-defined state for a given type occupy the same - object, minimizing memory allocations and maximizing memory locality. `impl` - is the last field in `Dentry`, and `Dentry` is the first field in - `DentryImpl` implementations, for similar reasons: this tends to cause - fetching of the `Dentry.impl` interface object to also fetch `DentryImpl` - fields, either because they are in the same cache line or via next-line - prefetching. - -## Future Work - -- Most `mount(2)` features, and unmounting, are incomplete. - -- VFS1 filesystems are not directly compatible with VFS2. It may be possible - to implement shims that implement `vfs.FilesystemImpl` for - `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and - `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for - filesystems that are not performance-critical (e.g. sysfs); however, it is - not clear that this will be less effort than simply porting the filesystems - in question. Practically speaking, the following filesystems will probably - need to be ported or made compatible through a shim to evaluate filesystem - performance on realistic workloads: - - - devfs/procfs/sysfs, which will realistically be necessary to execute - most applications. (Note that procfs and sysfs do not support hard - links, so they do not require the complexity of separate inode objects. - Also note that Linux's /dev is actually a variant of tmpfs called - devtmpfs.) - - - tmpfs. This should be relatively straightforward: copy/paste memfs, - store regular file contents in pgalloc-allocated memory instead of - `[]byte`, and add support for file timestamps. (In fact, it probably - makes more sense to convert memfs to tmpfs and not keep the former.) - - - A remote filesystem, either lisafs (if it is ready by the time that - other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers). - - - epoll files. - - Filesystems that will need to be ported before switching to VFS2, but can - probably be skipped for early testing: - - - overlayfs, which is needed for (at least) synthetic mount points. - - - Support for host ttys. - - - timerfd files. - - Filesystems that can be probably dropped: - - - ashmem, which is far too incomplete to use. - - - binder, which is similarly far too incomplete to use. - - - whitelistfs, which we are already actively attempting to remove. - -- Save/restore. For instance, it is unclear if the current implementation of - the `state` package supports the inheritance pattern described above. - -- Many features that were previously implemented by VFS must now be - implemented by individual filesystems (though, in most cases, this should - consist of calls to hooks or libraries provided by `vfs` or other packages). - This includes, but is not necessarily limited to: - - - Block and character device special files - - - Inotify - - - File locking - - - `O_ASYNC` - -- Reference counts in the `vfs` package do not use the `refs` package since - `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference - count, resulting in considerable cache bloat. 24 bytes of this overhead is - for weak reference support, which have poor performance and will not be used - by VFS2. The remaining 40 bytes is to store a descriptive string and stack - trace for reference leak checking; we can support reference leak checking - without incurring this space overhead by including the applicable - information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/anonfs.go b/pkg/sentry/vfs/anonfs.go index 2db25be49..2db25be49 100644..100755 --- a/pkg/sentry/vfs/anonfs.go +++ b/pkg/sentry/vfs/anonfs.go diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go index d97362b9a..d97362b9a 100644..100755 --- a/pkg/sentry/vfs/context.go +++ b/pkg/sentry/vfs/context.go diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go index 0ed20f249..0ed20f249 100644..100755 --- a/pkg/sentry/vfs/debug.go +++ b/pkg/sentry/vfs/debug.go diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go index 486a76475..486a76475 100644..100755 --- a/pkg/sentry/vfs/dentry.go +++ b/pkg/sentry/vfs/dentry.go diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go index 3af2aa58d..3af2aa58d 100644..100755 --- a/pkg/sentry/vfs/device.go +++ b/pkg/sentry/vfs/device.go diff --git a/pkg/sentry/vfs/epoll.go b/pkg/sentry/vfs/epoll.go index 7c83f9a5a..7c83f9a5a 100644..100755 --- a/pkg/sentry/vfs/epoll.go +++ b/pkg/sentry/vfs/epoll.go diff --git a/pkg/sentry/vfs/epoll_interest_list.go b/pkg/sentry/vfs/epoll_interest_list.go new file mode 100755 index 000000000..16cf878cd --- /dev/null +++ b/pkg/sentry/vfs/epoll_interest_list.go @@ -0,0 +1,173 @@ +package vfs + +// ElementMapper provides an identity mapping by default. +// +// This can be replaced to provide a struct that maps elements to linker +// objects, if they are not the same. An ElementMapper is not typically +// required if: Linker is left as is, Element is left as is, or Linker and +// Element are the same type. +type epollInterestElementMapper struct{} + +// linkerFor maps an Element to a Linker. +// +// This default implementation should be inlined. +// +//go:nosplit +func (epollInterestElementMapper) linkerFor(elem *epollInterest) *epollInterest { return elem } + +// List is an intrusive list. Entries can be added to or removed from the list +// in O(1) time and with no additional memory allocations. +// +// The zero value for List is an empty list ready to use. +// +// To iterate over a list (where l is a List): +// for e := l.Front(); e != nil; e = e.Next() { +// // do something with e. +// } +// +// +stateify savable +type epollInterestList struct { + head *epollInterest + tail *epollInterest +} + +// Reset resets list l to the empty state. +func (l *epollInterestList) Reset() { + l.head = nil + l.tail = nil +} + +// Empty returns true iff the list is empty. +func (l *epollInterestList) Empty() bool { + return l.head == nil +} + +// Front returns the first element of list l or nil. +func (l *epollInterestList) Front() *epollInterest { + return l.head +} + +// Back returns the last element of list l or nil. +func (l *epollInterestList) Back() *epollInterest { + return l.tail +} + +// PushFront inserts the element e at the front of list l. +func (l *epollInterestList) PushFront(e *epollInterest) { + epollInterestElementMapper{}.linkerFor(e).SetNext(l.head) + epollInterestElementMapper{}.linkerFor(e).SetPrev(nil) + + if l.head != nil { + epollInterestElementMapper{}.linkerFor(l.head).SetPrev(e) + } else { + l.tail = e + } + + l.head = e +} + +// PushBack inserts the element e at the back of list l. +func (l *epollInterestList) PushBack(e *epollInterest) { + epollInterestElementMapper{}.linkerFor(e).SetNext(nil) + epollInterestElementMapper{}.linkerFor(e).SetPrev(l.tail) + + if l.tail != nil { + epollInterestElementMapper{}.linkerFor(l.tail).SetNext(e) + } else { + l.head = e + } + + l.tail = e +} + +// PushBackList inserts list m at the end of list l, emptying m. +func (l *epollInterestList) PushBackList(m *epollInterestList) { + if l.head == nil { + l.head = m.head + l.tail = m.tail + } else if m.head != nil { + epollInterestElementMapper{}.linkerFor(l.tail).SetNext(m.head) + epollInterestElementMapper{}.linkerFor(m.head).SetPrev(l.tail) + + l.tail = m.tail + } + + m.head = nil + m.tail = nil +} + +// InsertAfter inserts e after b. +func (l *epollInterestList) InsertAfter(b, e *epollInterest) { + a := epollInterestElementMapper{}.linkerFor(b).Next() + epollInterestElementMapper{}.linkerFor(e).SetNext(a) + epollInterestElementMapper{}.linkerFor(e).SetPrev(b) + epollInterestElementMapper{}.linkerFor(b).SetNext(e) + + if a != nil { + epollInterestElementMapper{}.linkerFor(a).SetPrev(e) + } else { + l.tail = e + } +} + +// InsertBefore inserts e before a. +func (l *epollInterestList) InsertBefore(a, e *epollInterest) { + b := epollInterestElementMapper{}.linkerFor(a).Prev() + epollInterestElementMapper{}.linkerFor(e).SetNext(a) + epollInterestElementMapper{}.linkerFor(e).SetPrev(b) + epollInterestElementMapper{}.linkerFor(a).SetPrev(e) + + if b != nil { + epollInterestElementMapper{}.linkerFor(b).SetNext(e) + } else { + l.head = e + } +} + +// Remove removes e from l. +func (l *epollInterestList) Remove(e *epollInterest) { + prev := epollInterestElementMapper{}.linkerFor(e).Prev() + next := epollInterestElementMapper{}.linkerFor(e).Next() + + if prev != nil { + epollInterestElementMapper{}.linkerFor(prev).SetNext(next) + } else { + l.head = next + } + + if next != nil { + epollInterestElementMapper{}.linkerFor(next).SetPrev(prev) + } else { + l.tail = prev + } +} + +// Entry is a default implementation of Linker. Users can add anonymous fields +// of this type to their structs to make them automatically implement the +// methods needed by List. +// +// +stateify savable +type epollInterestEntry struct { + next *epollInterest + prev *epollInterest +} + +// Next returns the entry that follows e in the list. +func (e *epollInterestEntry) Next() *epollInterest { + return e.next +} + +// Prev returns the entry that precedes e in the list. +func (e *epollInterestEntry) Prev() *epollInterest { + return e.prev +} + +// SetNext assigns 'entry' as the entry that follows e in the list. +func (e *epollInterestEntry) SetNext(elem *epollInterest) { + e.next = elem +} + +// SetPrev assigns 'entry' as the entry that precedes e in the list. +func (e *epollInterestEntry) SetPrev(elem *epollInterest) { + e.prev = elem +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 5bac660c7..5bac660c7 100644..100755 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go index c2a52ec1b..c2a52ec1b 100644..100755 --- a/pkg/sentry/vfs/file_description_impl_util.go +++ b/pkg/sentry/vfs/file_description_impl_util.go diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go deleted file mode 100644 index 8fa26418e..000000000 --- a/pkg/sentry/vfs/file_description_impl_util_test.go +++ /dev/null @@ -1,217 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "bytes" - "fmt" - "io" - "sync/atomic" - "testing" - - "gvisor.dev/gvisor/pkg/abi/linux" - "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/sentry/contexttest" - "gvisor.dev/gvisor/pkg/syserror" - "gvisor.dev/gvisor/pkg/usermem" -) - -// fileDescription is the common fd struct which a filesystem implementation -// embeds in all of its file description implementations as required. -type fileDescription struct { - vfsfd FileDescription - FileDescriptionDefaultImpl -} - -// genCount contains the number of times its DynamicBytesSource.Generate() -// implementation has been called. -type genCount struct { - count uint64 // accessed using atomic memory ops -} - -// Generate implements DynamicBytesSource.Generate. -func (g *genCount) Generate(ctx context.Context, buf *bytes.Buffer) error { - fmt.Fprintf(buf, "%d", atomic.AddUint64(&g.count, 1)) - return nil -} - -type storeData struct { - data string -} - -var _ WritableDynamicBytesSource = (*storeData)(nil) - -// Generate implements DynamicBytesSource. -func (d *storeData) Generate(ctx context.Context, buf *bytes.Buffer) error { - buf.WriteString(d.data) - return nil -} - -// Generate implements WritableDynamicBytesSource. -func (d *storeData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { - buf := make([]byte, src.NumBytes()) - n, err := src.CopyIn(ctx, buf) - if err != nil { - return 0, err - } - - d.data = string(buf[:n]) - return 0, nil -} - -// testFD is a read-only FileDescriptionImpl representing a regular file. -type testFD struct { - fileDescription - DynamicBytesFileDescriptionImpl - - data DynamicBytesSource -} - -func newTestFD(vfsObj *VirtualFilesystem, statusFlags uint32, data DynamicBytesSource) *FileDescription { - vd := vfsObj.NewAnonVirtualDentry("genCountFD") - defer vd.DecRef() - var fd testFD - fd.vfsfd.Init(&fd, statusFlags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}) - fd.DynamicBytesFileDescriptionImpl.SetDataSource(data) - return &fd.vfsfd -} - -// Release implements FileDescriptionImpl.Release. -func (fd *testFD) Release() { -} - -// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags. -// Stat implements FileDescriptionImpl.Stat. -func (fd *testFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { - // Note that Statx.Mask == 0 in the return value. - return linux.Statx{}, nil -} - -// SetStat implements FileDescriptionImpl.SetStat. -func (fd *testFD) SetStat(ctx context.Context, opts SetStatOptions) error { - return syserror.EPERM -} - -func TestGenCountFD(t *testing.T) { - ctx := contexttest.Context(t) - - vfsObj := New() // vfs.New() - fd := newTestFD(vfsObj, linux.O_RDWR, &genCount{}) - defer fd.DecRef() - - // The first read causes Generate to be called to fill the FD's buffer. - buf := make([]byte, 2) - ioseq := usermem.BytesIOSequence(buf) - n, err := fd.Read(ctx, ioseq, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('1'); buf[0] != want { - t.Errorf("first Read: got byte %c, wanted %c", buf[0], want) - } - - // A second read without seeking is still at EOF. - n, err = fd.Read(ctx, ioseq, ReadOptions{}) - if n != 0 || err != io.EOF { - t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err) - } - - // Seeking to the beginning of the file causes it to be regenerated. - n, err = fd.Seek(ctx, 0, linux.SEEK_SET) - if n != 0 || err != nil { - t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err) - } - n, err = fd.Read(ctx, ioseq, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('2'); buf[0] != want { - t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want) - } - - // PRead at the beginning of the file also causes it to be regenerated. - n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{}) - if n != 1 || (err != nil && err != io.EOF) { - t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err) - } - if want := byte('3'); buf[0] != want { - t.Errorf("PRead: got byte %c, wanted %c", buf[0], want) - } - - // Write and PWrite fails. - if _, err := fd.Write(ctx, ioseq, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) - } - if _, err := fd.PWrite(ctx, ioseq, 0, WriteOptions{}); err != syserror.EINVAL { - t.Errorf("Write: got err %v, wanted %v", err, syserror.EINVAL) - } -} - -func TestWritable(t *testing.T) { - ctx := contexttest.Context(t) - - vfsObj := New() // vfs.New() - fd := newTestFD(vfsObj, linux.O_RDWR, &storeData{data: "init"}) - defer fd.DecRef() - - buf := make([]byte, 10) - ioseq := usermem.BytesIOSequence(buf) - if n, err := fd.Read(ctx, ioseq, ReadOptions{}); n != 4 && err != io.EOF { - t.Fatalf("Read: got (%v, %v), wanted (4, EOF)", n, err) - } - if want := "init"; want == string(buf) { - t.Fatalf("Read: got %v, wanted %v", string(buf), want) - } - - // Test PWrite. - want := "write" - writeIOSeq := usermem.BytesIOSequence([]byte(want)) - if n, err := fd.PWrite(ctx, writeIOSeq, 0, WriteOptions{}); int(n) != len(want) && err != nil { - t.Errorf("PWrite: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) - } - if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { - t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) - } - if want == string(buf) { - t.Fatalf("PRead: got %v, wanted %v", string(buf), want) - } - - // Test Seek to 0 followed by Write. - want = "write2" - writeIOSeq = usermem.BytesIOSequence([]byte(want)) - if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 && err != nil { - t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) - } - if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); int(n) != len(want) && err != nil { - t.Errorf("Write: got err (%v, %v), wanted (%v, nil)", n, err, len(want)) - } - if n, err := fd.PRead(ctx, ioseq, 0, ReadOptions{}); int(n) != len(want) && err != io.EOF { - t.Fatalf("PRead: got (%v, %v), wanted (%v, EOF)", n, err, len(want)) - } - if want == string(buf) { - t.Fatalf("PRead: got %v, wanted %v", string(buf), want) - } - - // Test failure if offset != 0. - if n, err := fd.Seek(ctx, 1, linux.SEEK_SET); n != 0 && err != nil { - t.Errorf("Seek: got err (%v, %v), wanted (0, nil)", n, err) - } - if n, err := fd.Write(ctx, writeIOSeq, WriteOptions{}); n != 0 && err != syserror.EINVAL { - t.Errorf("Write: got err (%v, %v), wanted (0, EINVAL)", n, err) - } - if n, err := fd.PWrite(ctx, writeIOSeq, 2, WriteOptions{}); n != 0 && err != syserror.EINVAL { - t.Errorf("PWrite: got err (%v, %v), wanted (0, EINVAL)", n, err) - } -} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go index a06a6caf3..a06a6caf3 100644..100755 --- a/pkg/sentry/vfs/filesystem.go +++ b/pkg/sentry/vfs/filesystem.go diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go index 7315a588e..7315a588e 100644..100755 --- a/pkg/sentry/vfs/filesystem_impl_util.go +++ b/pkg/sentry/vfs/filesystem_impl_util.go diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go index c58b70728..c58b70728 100644..100755 --- a/pkg/sentry/vfs/filesystem_type.go +++ b/pkg/sentry/vfs/filesystem_type.go diff --git a/pkg/sentry/vfs/lock/BUILD b/pkg/sentry/vfs/lock/BUILD deleted file mode 100644 index d9ab063b7..000000000 --- a/pkg/sentry/vfs/lock/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -load("//tools:defs.bzl", "go_library") - -package(licenses = ["notice"]) - -go_library( - name = "lock", - srcs = ["lock.go"], - visibility = ["//pkg/sentry:internal"], - deps = [ - "//pkg/sentry/fs/lock", - "//pkg/syserror", - ], -) diff --git a/pkg/sentry/vfs/lock/lock.go b/pkg/sentry/vfs/lock/lock.go deleted file mode 100644 index 724dfe743..000000000 --- a/pkg/sentry/vfs/lock/lock.go +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2020 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Package lock provides POSIX and BSD style file locking for VFS2 file -// implementations. -// -// The actual implementations can be found in the lock package under -// sentry/fs/lock. -package lock - -import ( - fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" - "gvisor.dev/gvisor/pkg/syserror" -) - -// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2) -// and flock(2) respectively in Linux. It can be embedded into various file -// implementations for VFS2 that support locking. -// -// Note that in Linux these two types of locks are _not_ cooperative, because -// race and deadlock conditions make merging them prohibitive. We do the same -// and keep them oblivious to each other. -type FileLocks struct { - // bsd is a set of BSD-style advisory file wide locks, see flock(2). - bsd fslock.Locks - - // posix is a set of POSIX-style regional advisory locks, see fcntl(2). - posix fslock.Locks -} - -// LockBSD tries to acquire a BSD-style lock on the entire file. -func (fl *FileLocks) LockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { - if fl.bsd.LockRegion(uid, t, fslock.LockRange{0, fslock.LockEOF}, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockBSD releases a BSD-style lock on the entire file. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. -func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) { - fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF}) -} - -// LockPOSIX tries to acquire a POSIX-style lock on a file region. -func (fl *FileLocks) LockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { - if fl.posix.LockRegion(uid, t, rng, block) { - return nil - } - return syserror.ErrWouldBlock -} - -// UnlockPOSIX releases a POSIX-style lock on a file region. -// -// This operation is always successful, even if there did not exist a lock on -// the requested region held by uid in the first place. -func (fl *FileLocks) UnlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) { - fl.posix.UnlockRegion(uid, rng) -} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go index d39528051..d39528051 100644..100755 --- a/pkg/sentry/vfs/mount.go +++ b/pkg/sentry/vfs/mount.go diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go deleted file mode 100644 index 3b933468d..000000000 --- a/pkg/sentry/vfs/mount_test.go +++ /dev/null @@ -1,458 +0,0 @@ -// Copyright 2019 The gVisor Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package vfs - -import ( - "fmt" - "runtime" - "testing" - - "gvisor.dev/gvisor/pkg/sync" -) - -func TestMountTableLookupEmpty(t *testing.T) { - var mt mountTable - mt.Init() - - parent := &Mount{} - point := &Dentry{} - if m := mt.Lookup(parent, point); m != nil { - t.Errorf("empty mountTable lookup: got %p, wanted nil", m) - } -} - -func TestMountTableInsertLookup(t *testing.T) { - var mt mountTable - mt.Init() - - mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) - mt.Insert(mount) - - if m := mt.Lookup(mount.parent(), mount.point()); m != mount { - t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount) - } - - otherParent := &Mount{} - if m := mt.Lookup(otherParent, mount.point()); m != nil { - t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m) - } - otherPoint := &Dentry{} - if m := mt.Lookup(mount.parent(), otherPoint); m != nil { - t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m) - } -} - -// TODO: concurrent lookup/insertion/removal - -// must be powers of 2 -var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} - -// For all of the following: -// -// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable. -// -// - BenchmarkMountMapFoo tests usage pattern "Foo" for a -// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since -// mountTable also requires external synchronization between mutators.) -// -// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map. -// -// ParallelLookup is by far the most common and performance-sensitive operation -// for this application. NegativeLookup is also important, but less so (only -// relevant with multiple mount namespaces and significant differences in -// mounts between them). Insertion and removal are benchmarked for -// completeness. -const enableComparativeBenchmarks = false - -func newBenchMount() *Mount { - mount := &Mount{} - mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}}) - return mount -} - -func BenchmarkMountTableParallelLookup(b *testing.B) { - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var mt mountTable - mt.Init() - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - mt.Insert(mount) - keys = append(keys, mount.loadKey()) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - m := mt.Lookup(k.mount, k.dentry) - if m == nil { - b.Fatalf("lookup failed") - } - if parent := m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountMapParallelLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var mu sync.RWMutex - ms := make(map[VirtualDentry]*Mount) - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - key := mount.loadKey() - ms[key] = mount - keys = append(keys, key) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - mu.RLock() - m := ms[k] - mu.RUnlock() - if m == nil { - b.Fatalf("lookup failed") - } - if parent := m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountSyncMapParallelLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%dx%d", numG, numMounts) - b.Run(desc, func(b *testing.B) { - var ms sync.Map - keys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - key := mount.loadKey() - ms.Store(key, mount) - keys = append(keys, key) - } - - var ready sync.WaitGroup - begin := make(chan struct{}) - var end sync.WaitGroup - for g := 0; g < numG; g++ { - ready.Add(1) - end.Add(1) - go func() { - defer end.Done() - ready.Done() - <-begin - for i := 0; i < b.N; i++ { - k := keys[i&(numMounts-1)] - mi, ok := ms.Load(k) - if !ok { - b.Fatalf("lookup failed") - } - m := mi.(*Mount) - if parent := m.parent(); parent != k.mount { - b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) - } - if point := m.point(); point != k.dentry { - b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) - } - } - }() - } - - ready.Wait() - b.ResetTimer() - close(begin) - end.Wait() - }) - } - } -} - -func BenchmarkMountTableNegativeLookup(b *testing.B) { - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var mt mountTable - mt.Init() - for i := 0; i < numMounts; i++ { - mt.Insert(newBenchMount()) - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - m := mt.Lookup(k.mount, k.dentry) - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountMapNegativeLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var mu sync.RWMutex - ms := make(map[VirtualDentry]*Mount) - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - ms[mount.loadKey()] = mount - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - mu.RLock() - m := ms[k] - mu.RUnlock() - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - for _, numMounts := range benchNumMounts { - desc := fmt.Sprintf("%d", numMounts) - b.Run(desc, func(b *testing.B) { - var ms sync.Map - for i := 0; i < numMounts; i++ { - mount := newBenchMount() - ms.Store(mount.loadKey(), mount) - } - negkeys := make([]VirtualDentry, 0, numMounts) - for i := 0; i < numMounts; i++ { - negkeys = append(negkeys, VirtualDentry{ - mount: &Mount{}, - dentry: &Dentry{}, - }) - } - - b.ResetTimer() - for i := 0; i < b.N; i++ { - k := negkeys[i&(numMounts-1)] - m, _ := ms.Load(k) - if m != nil { - b.Fatalf("lookup got %p, wanted nil", m) - } - } - }) - } -} - -func BenchmarkMountTableInsert(b *testing.B) { - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - var mt mountTable - mt.Init() - b.ResetTimer() - for i := range mounts { - mt.Insert(mounts[i]) - } -} - -func BenchmarkMountMapInsert(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - ms := make(map[VirtualDentry]*Mount) - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms[mount.loadKey()] = mount - } -} - -func BenchmarkMountSyncMapInsert(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - // Preallocate Mounts so that allocation time isn't included in the - // benchmark. - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - - var ms sync.Map - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms.Store(mount.loadKey(), mount) - } -} - -func BenchmarkMountTableRemove(b *testing.B) { - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - var mt mountTable - mt.Init() - for i := range mounts { - mt.Insert(mounts[i]) - } - - b.ResetTimer() - for i := range mounts { - mt.Remove(mounts[i]) - } -} - -func BenchmarkMountMapRemove(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - ms := make(map[VirtualDentry]*Mount) - for i := range mounts { - mount := mounts[i] - ms[mount.loadKey()] = mount - } - - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - delete(ms, mount.loadKey()) - } -} - -func BenchmarkMountSyncMapRemove(b *testing.B) { - if !enableComparativeBenchmarks { - b.Skipf("comparative benchmarks are disabled") - } - - mounts := make([]*Mount, 0, b.N) - for i := 0; i < b.N; i++ { - mounts = append(mounts, newBenchMount()) - } - var ms sync.Map - for i := range mounts { - mount := mounts[i] - ms.Store(mount.loadKey(), mount) - } - - b.ResetTimer() - for i := range mounts { - mount := mounts[i] - ms.Delete(mount.loadKey()) - } -} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go index bd90d36c4..bd90d36c4 100644..100755 --- a/pkg/sentry/vfs/mount_unsafe.go +++ b/pkg/sentry/vfs/mount_unsafe.go diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go index b7774bf28..b7774bf28 100644..100755 --- a/pkg/sentry/vfs/options.go +++ b/pkg/sentry/vfs/options.go diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go index b318c681a..b318c681a 100644..100755 --- a/pkg/sentry/vfs/pathname.go +++ b/pkg/sentry/vfs/pathname.go diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go index f664581f4..f664581f4 100644..100755 --- a/pkg/sentry/vfs/permissions.go +++ b/pkg/sentry/vfs/permissions.go diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go index 8a0b382f6..8a0b382f6 100644..100755 --- a/pkg/sentry/vfs/resolving_path.go +++ b/pkg/sentry/vfs/resolving_path.go diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go index 908c69f91..908c69f91 100644..100755 --- a/pkg/sentry/vfs/vfs.go +++ b/pkg/sentry/vfs/vfs.go diff --git a/pkg/sentry/vfs/vfs_state_autogen.go b/pkg/sentry/vfs/vfs_state_autogen.go new file mode 100755 index 000000000..78d4a0305 --- /dev/null +++ b/pkg/sentry/vfs/vfs_state_autogen.go @@ -0,0 +1,38 @@ +// automatically generated by stateify. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/state" +) + +func (x *epollInterestList) beforeSave() {} +func (x *epollInterestList) save(m state.Map) { + x.beforeSave() + m.Save("head", &x.head) + m.Save("tail", &x.tail) +} + +func (x *epollInterestList) afterLoad() {} +func (x *epollInterestList) load(m state.Map) { + m.Load("head", &x.head) + m.Load("tail", &x.tail) +} + +func (x *epollInterestEntry) beforeSave() {} +func (x *epollInterestEntry) save(m state.Map) { + x.beforeSave() + m.Save("next", &x.next) + m.Save("prev", &x.prev) +} + +func (x *epollInterestEntry) afterLoad() {} +func (x *epollInterestEntry) load(m state.Map) { + m.Load("next", &x.next) + m.Load("prev", &x.prev) +} + +func init() { + state.Register("vfs.epollInterestList", (*epollInterestList)(nil), state.Fns{Save: (*epollInterestList).save, Load: (*epollInterestList).load}) + state.Register("vfs.epollInterestEntry", (*epollInterestEntry)(nil), state.Fns{Save: (*epollInterestEntry).save, Load: (*epollInterestEntry).load}) +} |