summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--pkg/abi/linux/file.go101
-rw-r--r--pkg/abi/linux/fs.go9
-rw-r--r--pkg/fspath/BUILD28
-rw-r--r--pkg/fspath/builder.go104
-rw-r--r--pkg/fspath/builder_test.go58
-rw-r--r--pkg/fspath/builder_unsafe.go27
-rw-r--r--pkg/fspath/fspath.go182
-rw-r--r--pkg/fspath/fspath_test.go143
-rw-r--r--pkg/sentry/fsimpl/memfs/BUILD55
-rw-r--r--pkg/sentry/fsimpl/memfs/benchmark_test.go464
-rw-r--r--pkg/sentry/fsimpl/memfs/directory.go178
-rw-r--r--pkg/sentry/fsimpl/memfs/filesystem.go542
-rw-r--r--pkg/sentry/fsimpl/memfs/memfs.go299
-rw-r--r--pkg/sentry/fsimpl/memfs/regular_file.go155
-rw-r--r--pkg/sentry/fsimpl/memfs/symlink.go36
-rw-r--r--pkg/sentry/vfs/BUILD46
-rw-r--r--pkg/sentry/vfs/README.md197
-rw-r--r--pkg/sentry/vfs/context.go37
-rw-r--r--pkg/sentry/vfs/debug.go22
-rw-r--r--pkg/sentry/vfs/dentry.go347
-rw-r--r--pkg/sentry/vfs/file_description.go213
-rw-r--r--pkg/sentry/vfs/file_description_impl_util.go142
-rw-r--r--pkg/sentry/vfs/filesystem.go155
-rw-r--r--pkg/sentry/vfs/filesystem_type.go70
-rw-r--r--pkg/sentry/vfs/mount.go411
-rw-r--r--pkg/sentry/vfs/mount_test.go465
-rw-r--r--pkg/sentry/vfs/mount_unsafe.go356
-rw-r--r--pkg/sentry/vfs/options.go123
-rw-r--r--pkg/sentry/vfs/permissions.go121
-rw-r--r--pkg/sentry/vfs/resolving_path.go453
-rw-r--r--pkg/sentry/vfs/syscalls.go217
-rw-r--r--pkg/sentry/vfs/vfs.go135
32 files changed, 5851 insertions, 40 deletions
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index 285338e47..4b0ea33dc 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -24,25 +24,27 @@ import (
// Constants for open(2).
const (
- O_ACCMODE = 00000003
- O_RDONLY = 00000000
- O_WRONLY = 00000001
- O_RDWR = 00000002
- O_CREAT = 00000100
- O_EXCL = 00000200
- O_NOCTTY = 00000400
- O_TRUNC = 00001000
- O_APPEND = 00002000
- O_NONBLOCK = 00004000
- O_DSYNC = 00010000
- O_ASYNC = 00020000
- O_DIRECT = 00040000
- O_LARGEFILE = 00100000
- O_DIRECTORY = 00200000
- O_NOFOLLOW = 00400000
- O_CLOEXEC = 02000000
- O_SYNC = 04000000
+ O_ACCMODE = 000000003
+ O_RDONLY = 000000000
+ O_WRONLY = 000000001
+ O_RDWR = 000000002
+ O_CREAT = 000000100
+ O_EXCL = 000000200
+ O_NOCTTY = 000000400
+ O_TRUNC = 000001000
+ O_APPEND = 000002000
+ O_NONBLOCK = 000004000
+ O_DSYNC = 000010000
+ O_ASYNC = 000020000
+ O_DIRECT = 000040000
+ O_LARGEFILE = 000100000
+ O_DIRECTORY = 000200000
+ O_NOFOLLOW = 000400000
+ O_NOATIME = 001000000
+ O_CLOEXEC = 002000000
+ O_SYNC = 004000000 // __O_SYNC in Linux
O_PATH = 010000000
+ O_TMPFILE = 020000000 // __O_TMPFILE in Linux
)
// Constants for fstatat(2).
@@ -124,14 +126,23 @@ const (
// Values for mode_t.
const (
- FileTypeMask = 0170000
- ModeSocket = 0140000
- ModeSymlink = 0120000
- ModeRegular = 0100000
- ModeBlockDevice = 060000
- ModeDirectory = 040000
- ModeCharacterDevice = 020000
- ModeNamedPipe = 010000
+ S_IFMT = 0170000
+ S_IFSOCK = 0140000
+ S_IFLNK = 0120000
+ S_IFREG = 0100000
+ S_IFBLK = 060000
+ S_IFDIR = 040000
+ S_IFCHR = 020000
+ S_IFIFO = 010000
+
+ FileTypeMask = S_IFMT
+ ModeSocket = S_IFSOCK
+ ModeSymlink = S_IFLNK
+ ModeRegular = S_IFREG
+ ModeBlockDevice = S_IFBLK
+ ModeDirectory = S_IFDIR
+ ModeCharacterDevice = S_IFCHR
+ ModeNamedPipe = S_IFIFO
ModeSetUID = 04000
ModeSetGID = 02000
@@ -152,6 +163,19 @@ const (
PermissionsMask = 0777
)
+// Values for linux_dirent64.d_type.
+const (
+ DT_UNKNOWN = 0
+ DT_FIFO = 1
+ DT_CHR = 2
+ DT_DIR = 4
+ DT_BLK = 6
+ DT_REG = 8
+ DT_LNK = 10
+ DT_SOCK = 12
+ DT_WHT = 14
+)
+
// Values for preadv2/pwritev2.
const (
RWF_HIPRI = 0x00000001
@@ -179,19 +203,6 @@ type Stat struct {
_ [3]int64
}
-// File types.
-const (
- DT_BLK = 0x6
- DT_CHR = 0x2
- DT_DIR = 0x4
- DT_FIFO = 0x1
- DT_LNK = 0xa
- DT_REG = 0x8
- DT_SOCK = 0xc
- DT_UNKNOWN = 0x0
- DT_WHT = 0xe
-)
-
// SizeOfStat is the size of a Stat struct.
var SizeOfStat = binary.Size(Stat{})
@@ -222,6 +233,17 @@ const (
STATX__RESERVED = 0x80000000
)
+// Bitmasks for Statx.Attributes and Statx.AttributesMask, from
+// include/uapi/linux/stat.h.
+const (
+ STATX_ATTR_COMPRESSED = 0x00000004
+ STATX_ATTR_IMMUTABLE = 0x00000010
+ STATX_ATTR_APPEND = 0x00000020
+ STATX_ATTR_NODUMP = 0x00000040
+ STATX_ATTR_ENCRYPTED = 0x00000800
+ STATX_ATTR_AUTOMOUNT = 0x00001000
+)
+
// Statx represents struct statx.
type Statx struct {
Mask uint32
@@ -231,7 +253,6 @@ type Statx struct {
UID uint32
GID uint32
Mode uint16
- _ uint16
Ino uint64
Size uint64
Blocks uint64
diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go
index 549e0fb93..b416e3472 100644
--- a/pkg/abi/linux/fs.go
+++ b/pkg/abi/linux/fs.go
@@ -77,6 +77,15 @@ type Statfs struct {
Spare [4]uint64
}
+// Whence argument to lseek(2), from include/uapi/linux/fs.h.
+const (
+ SEEK_SET = 0
+ SEEK_CUR = 1
+ SEEK_END = 2
+ SEEK_DATA = 3
+ SEEK_HOLE = 4
+)
+
// Sync_file_range flags, from include/uapi/linux/fs.h
const (
SYNC_FILE_RANGE_WAIT_BEFORE = 1
diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
new file mode 100644
index 000000000..11716af81
--- /dev/null
+++ b/pkg/fspath/BUILD
@@ -0,0 +1,28 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(
+ default_visibility = ["//visibility:public"],
+ licenses = ["notice"],
+)
+
+go_library(
+ name = "fspath",
+ srcs = [
+ "builder.go",
+ "builder_unsafe.go",
+ "fspath.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/fspath",
+ deps = ["//pkg/syserror"],
+)
+
+go_test(
+ name = "fspath_test",
+ size = "small",
+ srcs = [
+ "builder_test.go",
+ "fspath_test.go",
+ ],
+ embed = [":fspath"],
+ deps = ["//pkg/syserror"],
+)
diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go
new file mode 100644
index 000000000..7ddb36826
--- /dev/null
+++ b/pkg/fspath/builder.go
@@ -0,0 +1,104 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fspath
+
+import (
+ "fmt"
+)
+
+// Builder is similar to strings.Builder, but is used to produce pathnames
+// given path components in reverse order (from leaf to root). This is useful
+// in the common case where a filesystem is represented by a tree of named
+// nodes, and the path to a given node must be produced by walking upward from
+// that node to a given root.
+type Builder struct {
+ buf []byte // backing storage; the accumulated bytes are buf[start:]
+ start int // index in buf of the first accumulated byte; shrinks as data is prepended
+ needSep bool // set by PrependComponent so the next component is followed by '/'
+}
+
+// Reset empties the Builder: all accumulated bytes are discarded and no path
+// separator is pending. The backing buffer itself is kept.
+func (b *Builder) Reset() {
+	b.needSep = false
+	b.start = len(b.buf)
+}
+
+// Len reports how many bytes have been accumulated so far, i.e. the length
+// of the occupied tail of the buffer.
+func (b *Builder) Len() int {
+	total := len(b.buf)
+	return total - b.start
+}
+
+// needToGrow reports whether fewer than n bytes of free space remain in
+// front of the accumulated data.
+func (b *Builder) needToGrow(n int) bool {
+	return n > b.start
+}
+
+func (b *Builder) grow(n int) {
+ newLen := b.Len() + n
+ var newCap int
+ if len(b.buf) == 0 {
+ newCap = 64 // arbitrary
+ } else {
+ newCap = 2 * len(b.buf)
+ }
+ for newCap < newLen {
+ newCap *= 2
+ if newCap == 0 {
+ panic(fmt.Sprintf("required length (%d) causes buffer size to overflow", newLen))
+ }
+ }
+ newBuf := make([]byte, newCap)
+ copy(newBuf[newCap-b.Len():], b.buf[b.start:])
+ b.start += newCap - len(b.buf)
+ b.buf = newBuf
+}
+
+// PrependComponent places the path component pc in front of b's current
+// contents. If an earlier component was already prepended, a '/' is inserted
+// between pc and that component.
+func (b *Builder) PrependComponent(pc string) {
+	if sepPending := b.needSep; sepPending {
+		b.PrependByte('/')
+	}
+	b.PrependString(pc)
+	// Every later component must be separated from this one.
+	b.needSep = true
+}
+
+// PrependString places str in front of b's current contents, growing the
+// buffer first if there is not enough free space.
+func (b *Builder) PrependString(str string) {
+	n := len(str)
+	if b.needToGrow(n) {
+		b.grow(n)
+	}
+	b.start -= n
+	copy(b.buf[b.start:], str)
+}
+
+// PrependByte prepends the given byte to b's buffer.
+func (b *Builder) PrependByte(c byte) {
+ if b.needToGrow(1) {
+ b.grow(1)
+ }
+ b.start--
+ b.buf[b.start] = c
+}
+
+// AppendString adds str after b's current contents. Because the contents
+// always occupy the end of the buffer, they are first shifted toward the
+// front by len(str) bytes and str is written into the vacated tail.
+func (b *Builder) AppendString(str string) {
+	n := len(str)
+	if b.needToGrow(n) {
+		b.grow(n)
+	}
+	existing := b.buf[b.start:]
+	b.start -= n
+	copy(b.buf[b.start:], existing)
+	tail := b.buf[len(b.buf)-n:]
+	copy(tail, str)
+}
diff --git a/pkg/fspath/builder_test.go b/pkg/fspath/builder_test.go
new file mode 100644
index 000000000..22f890273
--- /dev/null
+++ b/pkg/fspath/builder_test.go
@@ -0,0 +1,58 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fspath
+
+import (
+ "testing"
+)
+
+// TestBuilder checks that components prepended leaf-first come out joined by
+// '/' in root-first order, and that AppendString adds a suffix after them.
+func TestBuilder(t *testing.T) {
+	tests := []struct {
+		pcs   []string // path components in reverse order
+		after string
+		want  string
+	}{
+		// Empty case.
+		{},
+		{pcs: []string{"foo"}, want: "foo"},
+		{pcs: []string{"foo", "bar", "baz"}, want: "baz/bar/foo"},
+		{pcs: []string{"foo", "bar"}, after: " (deleted)", want: "bar/foo (deleted)"},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.want, func(t *testing.T) {
+			var builder Builder
+			for i := 0; i < len(tc.pcs); i++ {
+				builder.PrependComponent(tc.pcs[i])
+			}
+			builder.AppendString(tc.after)
+			got := builder.String()
+			if got != tc.want {
+				t.Errorf("got %q, wanted %q", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go
new file mode 100644
index 000000000..75606808d
--- /dev/null
+++ b/pkg/fspath/builder_unsafe.go
@@ -0,0 +1,27 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fspath
+
+import (
+ "unsafe"
+)
+
+// String returns the accumulated bytes as a string without copying them; the
+// result aliases b's buffer, so no other methods should be called after String.
+func (b *Builder) String() string {
+ bs := b.buf[b.start:]
+ // Reinterpret the slice header as a string. Compare strings.Builder.String().
+ return *(*string)(unsafe.Pointer(&bs))
+}
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
new file mode 100644
index 000000000..f68752560
--- /dev/null
+++ b/pkg/fspath/fspath.go
@@ -0,0 +1,182 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fspath provides efficient tools for working with file paths in
+// Linux-compatible filesystem implementations.
+package fspath
+
+import (
+ "strings"
+
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const pathSep = '/'
+
+// Parse parses a pathname as described by path_resolution(7).
+//
+// An empty pathname is rejected with ENOENT. A pathname made up only of path
+// separators yields an absolute directory Path with no components. Otherwise
+// leading and trailing separators are stripped and recorded as Path.Absolute
+// and Path.Dir respectively.
+func Parse(pathname string) (Path, error) {
+	if pathname == "" {
+		// "... POSIX decrees that an empty pathname must not be resolved
+		// successfully. Linux returns ENOENT in this case." -
+		// path_resolution(7)
+		return Path{}, syserror.ENOENT
+	}
+	// Count leading path separators.
+	lead := 0
+	for lead < len(pathname) && pathname[lead] == pathSep {
+		lead++
+	}
+	if lead == len(pathname) {
+		// pathname consists entirely of path separators.
+		return Path{Absolute: true, Dir: true}, nil
+	}
+	// Drop trailing path separators; Iterator.Next depends on this. At least
+	// one non-separator byte exists (checked above), so this loop stops
+	// before running off the front of the string.
+	last := len(pathname) - 1
+	for pathname[last] == pathSep {
+		last--
+	}
+	rel := pathname[lead : last+1]
+	// Find the end of the first path component within rel.
+	firstLen := 1
+	for firstLen < len(rel) && rel[firstLen] != pathSep {
+		firstLen++
+	}
+	return Path{
+		Begin: Iterator{
+			partialPathname: rel,
+			end:             firstLen,
+		},
+		Absolute: lead > 0,
+		Dir:      last < len(pathname)-1,
+	}, nil
+}
+
+// Path is the parsed form of a pathname string, as returned by Parse.
+//
+// Path is copyable by value.
+type Path struct {
+ // Begin is an iterator to the first path component in the relative part of
+ // the path. It is terminal if the pathname held only path separators.
+ //
+ // Path doesn't store information about path components after the first
+ // since this would require allocation.
+ Begin Iterator
+
+ // If true, the path is absolute, such that lookup should begin at the
+ // filesystem root. If false, the path is relative, such that where lookup
+ // begins is unspecified.
+ Absolute bool
+
+ // If true, the pathname contains trailing path separators, so the last
+ // path component must exist and resolve to a directory.
+ Dir bool
+}
+
+// String renders p as a pathname string. The result is equivalent to, but
+// not necessarily identical to, the string p was parsed from: runs of
+// redundant path separators are collapsed to a single separator.
+func (p Path) String() string {
+	var sb strings.Builder
+	if p.Absolute {
+		sb.WriteByte(pathSep)
+	}
+	for it, first := p.Begin, true; it.Ok(); it, first = it.Next(), false {
+		if !first {
+			sb.WriteByte(pathSep)
+		}
+		sb.WriteString(it.String())
+	}
+	// Only add the trailing separator when there is at least one component,
+	// so that Parse("/") renders as "/" rather than "//".
+	if p.Begin.Ok() && p.Dir {
+		sb.WriteByte(pathSep)
+	}
+	return sb.String()
+}
+
+// An Iterator represents either a path component in a Path or a terminal
+// iterator indicating that the end of the path has been reached.
+//
+// Iterator is immutable and copyable by value. The zero value of Iterator is
+// valid, and represents a terminal iterator.
+type Iterator struct {
+ // partialPathname is a substring of the original pathname beginning at the
+ // start of the represented path component and ending immediately after the
+ // end of the last path component in the pathname. If partialPathname is
+ // empty, the Iterator is terminal.
+ //
+ // See TestParseIteratorPartialPathnames in fspath_test.go for a worked
+ // example.
+ partialPathname string
+
+ // end is the offset into partialPathname of the first byte after the end
+ // of the represented path component.
+ end int
+}
+
+// Ok reports whether it refers to a path component, as opposed to being a
+// terminal iterator.
+func (it Iterator) Ok() bool {
+	return it.partialPathname != ""
+}
+
+// String returns the path component that it refers to.
+//
+// Preconditions: it.Ok().
+func (it Iterator) String() string {
+	component := it.partialPathname[0:it.end]
+	return component
+}
+
+// Next returns an iterator to the path component following it, or a terminal
+// iterator if it refers to the last component of the path.
+//
+// Preconditions: it.Ok().
+func (it Iterator) Next() Iterator {
+	rest := it.partialPathname
+	if len(rest) == it.end {
+		// End of the path.
+		return Iterator{}
+	}
+	// Step over the separator(s) after the current component. Parse trimmed
+	// trailing separators, so not being at the end of the path means another
+	// component definitely follows and this scan stays in bounds.
+	pos := it.end + 1
+	for rest[pos] == pathSep {
+		pos++
+	}
+	rest = rest[pos:]
+	// Measure the next path component.
+	compLen := 1
+	for compLen < len(rest) && rest[compLen] != pathSep {
+		compLen++
+	}
+	return Iterator{
+		partialPathname: rest,
+		end:             compLen,
+	}
+}
+
+// NextOk reports whether another path component follows it. It gives the
+// same answer as it.Next().Ok(), but is faster.
+//
+// Preconditions: it.Ok().
+func (it Iterator) NextOk() bool {
+	return len(it.partialPathname) != it.end
+}
diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go
new file mode 100644
index 000000000..215b35622
--- /dev/null
+++ b/pkg/fspath/fspath_test.go
@@ -0,0 +1,143 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fspath
+
+import (
+ "reflect"
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestParseIteratorPartialPathnames walks a parsed pathname and checks the
+// internal partialPathname held by each successive Iterator. It doubles as a
+// worked example of how Iterator represents its position.
+func TestParseIteratorPartialPathnames(t *testing.T) {
+	path, err := Parse("/foo//bar///baz////")
+	if err != nil {
+		t.Fatalf("Parse failed: %v", err)
+	}
+	// Leading slashes are stripped and remembered as Path.Absolute.
+	if got := path.Absolute; !got {
+		t.Errorf("Path.Absolute: got false, wanted true")
+	}
+	// Trailing slashes are stripped and remembered as Path.Dir.
+	if got := path.Dir; !got {
+		t.Errorf("Path.Dir: got false, wanted true")
+	}
+	// The first Iterator holds the input pathname minus its leading and
+	// trailing slashes.
+	it := path.Begin
+	if got, want := it.partialPathname, "foo//bar///baz"; got != want {
+		t.Errorf("first Iterator.partialPathname: got %q, wanted %q", got, want)
+	}
+	// Each following Iterator drops the leading path component and the
+	// slashes after it, until no components remain and the Iterator is
+	// terminal.
+	it = it.Next()
+	if got, want := it.partialPathname, "bar///baz"; got != want {
+		t.Errorf("second Iterator.partialPathname: got %q, wanted %q", got, want)
+	}
+	it = it.Next()
+	if got, want := it.partialPathname, "baz"; got != want {
+		t.Errorf("third Iterator.partialPathname: got %q, wanted %q", got, want)
+	}
+	it = it.Next()
+	if got, want := it.partialPathname, ""; got != want {
+		t.Errorf("fourth Iterator.partialPathname: got %q, wanted %q", got, want)
+	}
+	if it.Ok() {
+		t.Errorf("fourth Iterator.Ok(): got true, wanted false")
+	}
+}
+
+// TestParse checks Parse against separator-only pathnames and against every
+// combination of separator width ("/" or "//"), absolute or relative, and
+// with or without a trailing separator, for two sets of path components.
+func TestParse(t *testing.T) {
+	type testCase struct {
+		pathname string
+		relpath  []string
+		abs      bool
+		dir      bool
+	}
+	// Pathnames consisting solely of separators have no components.
+	tests := []testCase{
+		{pathname: "/", relpath: []string{}, abs: true, dir: true},
+		{pathname: "//", relpath: []string{}, abs: true, dir: true},
+	}
+	componentSets := [][]string{
+		// single path component
+		{"foo"},
+		// multiple path components, including non-UTF-8
+		{".", "foo", "..", "\xe6", "bar"},
+	}
+	for _, sep := range []string{"/", "//"} {
+		for _, abs := range []bool{false, true} {
+			for _, dir := range []bool{false, true} {
+				for _, pcs := range componentSets {
+					pathname := strings.Join(pcs, sep)
+					if abs {
+						pathname = sep + pathname
+					}
+					if dir {
+						pathname += sep
+					}
+					tests = append(tests, testCase{
+						pathname: pathname,
+						relpath:  pcs,
+						abs:      abs,
+						dir:      dir,
+					})
+				}
+			}
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.pathname, func(t *testing.T) {
+			p, err := Parse(test.pathname)
+			if err != nil {
+				t.Fatalf("failed to parse pathname %q: %v", test.pathname, err)
+			}
+			t.Logf("pathname %q => path %q", test.pathname, p)
+			if got := p.Absolute; got != test.abs {
+				t.Errorf("path absoluteness: got %v, wanted %v", got, test.abs)
+			}
+			if got := p.Dir; got != test.dir {
+				t.Errorf("path must resolve to a directory: got %v, wanted %v", got, test.dir)
+			}
+			// Start from an empty, non-nil slice: reflect.DeepEqual treats a
+			// nil slice and an empty slice as different.
+			got := []string{}
+			for it := p.Begin; it.Ok(); it = it.Next() {
+				got = append(got, it.String())
+			}
+			if !reflect.DeepEqual(got, test.relpath) {
+				t.Errorf("relative path: got %v, wanted %v", got, test.relpath)
+			}
+		})
+	}
+}
+
+// TestParseEmptyPathname checks that Parse rejects the empty pathname with
+// ENOENT.
+func TestParseEmptyPathname(t *testing.T) {
+	if p, err := Parse(""); err != syserror.ENOENT {
+		t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err)
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
new file mode 100644
index 000000000..d5d4f68df
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -0,0 +1,55 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+go_template_instance(
+ name = "dentry_list",
+ out = "dentry_list.go",
+ package = "memfs",
+ prefix = "dentry",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*Dentry",
+ "Linker": "*Dentry",
+ },
+)
+
+go_library(
+ name = "memfs",
+ srcs = [
+ "dentry_list.go",
+ "directory.go",
+ "filesystem.go",
+ "memfs.go",
+ "regular_file.go",
+ "symlink.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
+
+go_test(
+ name = "benchmark_test",
+ size = "small",
+ srcs = ["benchmark_test.go"],
+ deps = [
+ ":memfs",
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/tmpfs",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
new file mode 100644
index 000000000..a94b17db6
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -0,0 +1,464 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const (
+ mountPointName = "tmp"
+ filename = "gvisor_test_temp_0_1557494568"
+)
+
+// fileOpOn resolves path and calls fn on the resulting Dirent. It is copied
+// from syscalls/linux/sys_file.go, with the dependency on kernel.Task
+// stripped out.
+//
+// An absolute path is resolved from root. A relative path is resolved from wd
+// when dirFD is AT_FDCWD; any other dirFD yields EBADF. If resolve is true the
+// lookup uses FindInode, otherwise FindLink. The reference on the resolved
+// Dirent is dropped after fn returns, and fn's error is returned.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	// rel is the directory a relative lookup starts from (if required).
+	var rel *fs.Dirent
+	switch {
+	case len(path) > 0 && path[0] == '/':
+		// Absolute path; rel can be nil.
+	case dirFD == linux.AT_FDCWD:
+		// Need to reference the working directory.
+		rel = wd
+	default:
+		// Need to extract the given FD.
+		return syserror.EBADF
+	}
+
+	// Lookup the node.
+	var (
+		target *fs.Dirent // The file.
+		err    error
+	)
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if resolve {
+		target, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
+	} else {
+		target, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
+	}
+	if err != nil {
+		return err
+	}
+
+	fnErr := fn(root, target)
+	target.DecRef()
+	return fnErr
+}
+
+// BenchmarkVFS1TmpfsStat measures resolving and stat'ing a file nested depth
+// directories below the root of a VFS1 tmpfs mount, for each value in depths.
+func BenchmarkVFS1TmpfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			// d is reassigned in the loop below. A plain "defer d.DecRef()"
+			// would bind the receiver now (root), leaking the deepest Dirent
+			// and releasing root one time too many, so defer a closure that
+			// reads d when it runs.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before the timed region starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsStat measures StatAt on a file nested depth directories
+// below the root of a VFS2 memfs mount, for each value in depths.
+func BenchmarkVFS2MemfsStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create memfs root mount: %v", err)
+			}
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			vd := root
+			vd.IncRef()
+			// vd is reassigned in the loop below. A plain "defer vd.DecRef()"
+			// would bind the receiver now (root), leaking the deepest dentry
+			// and releasing root one time too many, so defer a closure that
+			// reads vd when it runs.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			defer fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before the timed region starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS1TmpfsMountStat measures resolving and stat'ing a file nested
+// depth directories below a VFS1 tmpfs submount mounted at /tmp on a tmpfs
+// root, for each value in depths.
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// d is reassigned in the loop below. A plain "defer d.DecRef()"
+			// would bind the receiver now (the mount root), leaking the
+			// deepest Dirent and releasing the mount root twice, so defer a
+			// closure that reads d when it runs.
+			defer func() { d.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef()
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false
+			// Collect setup garbage before the timed region starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute {
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkVFS2MemfsMountStat measures vfs.VirtualFilesystem.StatAt() on a
+// regular file that lives depth directories below the root of a memfs
+// submount, so that every timed lookup starts at the root mount, crosses the
+// mount point and then walks depth further path components.
+func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+
+			// filePathBuilder accumulates the absolute path of the file
+			// that the timed loop stats.
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create the mount point.
+			root := mntns.Root()
+			defer root.DecRef()
+			pop := vfs.PathOperation{
+				Root:     root,
+				Start:    root,
+				Pathname: mountPointName,
+			}
+			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+				Mode: 0755,
+			}); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			// Save the mount point for later use.
+			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			// Create and mount the submount.
+			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth. The same path
+			// operation that produced mountPoint is resolved again now that
+			// the submount exists.
+			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			// vd is rebound on every iteration of the loop below, so the
+			// deferred call must read vd when it runs rather than when it is
+			// registered. A plain "defer vd.DecRef()" would release the
+			// first vd a second time (the loop already drops it) and never
+			// release the reference held on the deepest directory.
+			defer func() { vd.DecRef() }()
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:     root,
+					Start:    vd,
+					Pathname: name,
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				// Hand the "current directory" reference over to the child.
+				vd.DecRef()
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Verify that we didn't create any directories under the mount
+			// point (i.e. they were all created on the submount).
+			firstDirName := fmt.Sprintf("%d", depth)
+			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Pathname:           filename,
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			fd.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			// Collect setup garbage before the timed region starts.
+			runtime.GC()
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Pathname:           filePath,
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 {
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
new file mode 100644
index 000000000..b0c3ea39a
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -0,0 +1,178 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory is the Inode implementation (Inode.impl) for memfs directories.
+type directory struct {
+ // inode is the Inode for this directory; newDirectory initializes it with
+ // the enclosing directory as its impl and returns a pointer to it.
+ inode Inode
+
+ // childList is a list containing (1) child Dentries and (2) fake Dentries
+ // (with inode == nil) that represent the iteration position of
+ // directoryFDs. childList is used to support directoryFD.IterDirents()
+ // efficiently. childList is protected by Filesystem.mu.
+ childList dentryList
+}
+
+// newDirectory allocates a directory and returns its embedded Inode,
+// initialized for fs with the given credentials and permission bits.
+func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+	d := new(directory)
+	ino := &d.inode
+	ino.init(d, fs, creds, mode)
+	// Two initial links: one from ".", and one from either the entry in the
+	// parent directory or, for a root directory, "..".
+	ino.nlink = 2
+	return ino
+}
+
+func (i *Inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// directoryFD is the file description implementation used for open memfs
+// directories.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by Filesystem.mu.
+ //
+ // iter is the fake Dentry (inode == nil) that marks this FD's position in
+ // the directory's childList. It is nil until IterDirents or Seek first
+ // creates it.
+ //
+ // off is the offset of the next dirent to emit: 0 is ".", 1 is "..", and
+ // larger values refer to children.
+ iter *Dentry
+ off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// If this FD ever created an iteration marker, Release unlinks it from the
+// directory's childList under Filesystem.mu and forgets it.
+func (fd *directoryFD) Release() {
+	if fd.iter == nil {
+		// No marker was ever created, so there is nothing to unlink.
+		return
+	}
+	memFS := fd.filesystem()
+	parent := fd.inode().impl.(*directory)
+	memFS.mu.Lock()
+	parent.childList.Remove(fd.iter)
+	memFS.mu.Unlock()
+	fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+//
+// Entries are passed to cb in the order ".", "..", then the directory's
+// children in childList order, starting from the position recorded in fd.off
+// and fd.iter. Iteration stops early, without advancing past the rejected
+// entry, as soon as cb.Handle returns false. IterDirents always returns nil.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ fs := fd.filesystem()
+ d := fd.vfsfd.VirtualDentry().Dentry()
+
+ // fd.off, fd.iter and childList are all protected by Filesystem.mu.
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ // Offset 0 is the synthetic "." entry.
+ if fd.off == 0 {
+ if !cb.Handle(vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ Ino: d.Impl().(*Dentry).inode.ino,
+ Off: 0,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+ // Offset 1 is the synthetic ".." entry, described using the inode of
+ // d.ParentOrSelf().
+ if fd.off == 1 {
+ parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+ if !cb.Handle(vfs.Dirent{
+ Name: "..",
+ Type: parentInode.direntType(),
+ Ino: parentInode.ino,
+ Off: 1,
+ }) {
+ return nil
+ }
+ fd.off++
+ }
+
+ dir := d.Impl().(*Dentry).inode.impl.(*directory)
+ var child *Dentry
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ // Allocate this FD's marker; its nil inode is what identifies it as a
+ // marker rather than a real child.
+ fd.iter = &Dentry{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ // Unlink the marker while iterating; it is re-linked at the new
+ // position before returning.
+ dir.childList.Remove(fd.iter)
+ }
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.inode != nil {
+ if !cb.Handle(vfs.Dirent{
+ Name: child.vfsd.Name(),
+ Type: child.inode.direntType(),
+ Ino: child.inode.ino,
+ Off: fd.off,
+ }) {
+ // The callback rejected child: park the marker just before it
+ // so that the next call resumes with the same child.
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ child = child.Next()
+ }
+ // Every child was consumed: park the marker at the end of the list.
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if whence != linux.SEEK_SET {
+ // TODO: Linux also allows SEEK_CUR.
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ fd.off = offset
+ // Compensate for "." and "..".
+ var remChildren int64
+ if offset < 2 {
+ remChildren = 0
+ } else {
+ remChildren = offset - 2
+ }
+
+ fs := fd.filesystem()
+ dir := fd.inode().impl.(*directory)
+
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ // Ensure that fd.iter exists and is not linked into dir.childList.
+ if fd.iter == nil {
+ fd.iter = &Dentry{}
+ } else {
+ dir.childList.Remove(fd.iter)
+ }
+ // Insert fd.iter before the remChildren'th child, or at the end of the
+ // list if remChildren >= number of children.
+ child := dir.childList.Front()
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.inode != nil {
+ if remChildren == 0 {
+ dir.childList.InsertBefore(child, fd.iter)
+ return offset, nil
+ }
+ remChildren--
+ }
+ child = child.Next()
+ }
+ dir.childList.PushBack(fd.iter)
+ return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
new file mode 100644
index 000000000..4d989eeaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -0,0 +1,542 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// On success it returns the resolved child's vfs.Dentry and Inode and
+// advances rp past the component. It returns ENOTDIR if inode is not a
+// directory, ENOENT if the component does not exist, and otherwise any error
+// from the permission check, rp.ResolveComponent or rp.HandleSymlink.
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ // Looking up a name in a directory requires execute (search) permission.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, nil, err
+ }
+afterSymlink:
+ nextVFSD, err := rp.ResolveComponent(vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is the sole source of truth for memfs, if it's
+ // not in the Dentry tree, it doesn't exist.
+ return nil, nil, syserror.ENOENT
+ }
+ nextInode := nextVFSD.Impl().(*Dentry).inode
+ if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO: symlink traversals update access time
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, nil, err
+ }
+ goto afterSymlink // don't check the current directory again
+ }
+ rp.Advance()
+ return nextVFSD, nextInode, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ for !rp.Done() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if rp.MustBeDir() && !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Preconditions: Filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+ if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
+ return "", err
+ }
+ pc := rp.Component()
+ if pc == "." || pc == ".." {
+ return "", syserror.EEXIST
+ }
+ childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+ if err != nil {
+ return "", err
+ }
+ if childVFSD != nil {
+ return "", syserror.EEXIST
+ }
+ if parentVFSD.IsDisowned() {
+ return "", syserror.ENOENT
+ }
+ return pc, nil
+}
+
+// checkDeleteLocked reports whether the file represented by vfsd may be
+// deleted: EBUSY if vfsd has no parent, ENOENT if its parent has been
+// disowned, and nil otherwise.
+func checkDeleteLocked(vfsd *vfs.Dentry) error {
+	switch parent := vfsd.Parent(); {
+	case parent == nil:
+		return syserror.EBUSY
+	case parent.IsDisowned():
+		return syserror.ENOENT
+	default:
+		return nil
+	}
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+//
+// The path is resolved to an existing file under a read lock. When
+// opts.CheckSearchable is set, the file must additionally be a directory on
+// which the caller has execute permission. On success a reference is taken on
+// the file's inode before its vfs.Dentry is returned.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	found, ino, err := walkExistingLocked(rp)
+	switch {
+	case err != nil:
+		return nil, err
+	case !opts.CheckSearchable:
+		// No further checks requested.
+	case !ino.isDir():
+		return nil, syserror.ENOTDIR
+	default:
+		if permErr := ino.checkPermissions(rp.Credentials(), vfs.MayExec, true); permErr != nil {
+			return nil, permErr
+		}
+	}
+	ino.incRef() // vfsd.IncRef(&fs.vfsfs)
+	return found, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+//
+// LinkAt creates a new name, given by rp, for the inode referred to by vd.
+// It returns:
+//   - EEXIST if rp has no remaining components or the name already exists;
+//   - EXDEV if vd is on a different mount than the new name;
+//   - EPERM if vd is a directory;
+//   - ENOENT if vd's inode has already lost its last link;
+//   - any error from path resolution, permission checks or
+//     Mount.CheckBeginWrite.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if rp.Mount() != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	d := vd.Dentry().Impl().(*Dentry)
+	if d.inode.isDir() {
+		return syserror.EPERM
+	}
+	// An inode that is still referenced (e.g. by an open file description)
+	// but has already been unlinked cannot be linked back into the tree.
+	// Without this check incLinksLocked() would panic on a condition that
+	// callers can trigger.
+	if atomic.LoadUint32(&d.inode.nlink) == 0 {
+		return syserror.ENOENT
+	}
+	d.inode.incLinksLocked()
+	// All hard links share the single tree reference on the inode, so only
+	// the link count changes; no incRef() is needed.
+	child := fs.newDentry(d.inode)
+	parentVFSD.InsertChild(&child.vfsd, pc)
+	parentInode.impl.(*directory).childList.PushBack(child)
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+//
+// A new directory with permission bits opts.Mode, owned by rp's credentials,
+// is created under the name given by rp's final component. EEXIST is returned
+// if rp has no remaining components or the name is already taken.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	newInode := fs.newDirectory(rp.Credentials(), opts.Mode)
+	newDentry := fs.newDentry(newInode)
+	dirVFSD.InsertChild(&newDentry.vfsd, name)
+	dirInode.impl.(*directory).childList.PushBack(newDentry)
+	// The new directory's ".." entry is a link to its parent.
+	dirInode.incLinksLocked()
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+//
+// Node creation is not implemented: after performing the same existence,
+// permission and writability checks as the other creation operations, MknodAt
+// always fails with EPERM.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	if _, err := checkCreateLocked(rp, dirVFSD, dirInode); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	// TODO: actually implement mknod
+	return syserror.EPERM
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+//
+// Without O_CREAT the path must resolve to an existing file, which is opened
+// under a read lock. With O_CREAT the parent directory of the final component
+// is resolved under the write lock; the file is created as a regular file if
+// it is missing, and otherwise opened (following a trailing symlink when rp
+// allows it) unless O_EXCL was given, in which case EEXIST is returned.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ // Filter out flags that are not supported by memfs. O_DIRECTORY and
+ // O_NOFOLLOW have no effect here (they're handled by VFS by setting
+ // appropriate bits in rp), but are returned by
+ // FileDescriptionImpl.StatusFlags().
+ opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
+
+ // Fast path: nothing can be created, so a read lock suffices.
+ if opts.Flags&linux.O_CREAT == 0 {
+ fs.mu.RLock()
+ defer fs.mu.RUnlock()
+ vfsd, inode, err := walkExistingLocked(rp)
+ if err != nil {
+ return nil, err
+ }
+ return inode.open(rp, vfsd, opts.Flags, false)
+ }
+
+ // O_CREAT | O_EXCL requires that this call creates the file.
+ mustCreate := opts.Flags&linux.O_EXCL != 0
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*Dentry).inode
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+ // No path components remain: the file being opened is rp.Start() itself.
+ if rp.Done() {
+ // FIXME: ???
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ return inode.open(rp, vfsd, opts.Flags, false)
+ }
+afterTrailingSymlink:
+ // Walk to the parent directory of the last path component.
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode)
+ if err != nil {
+ return nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ // Check for search permission in the parent directory.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+ return nil, err
+ }
+ // Reject attempts to open directories with O_CREAT.
+ if rp.MustBeDir() {
+ return nil, syserror.EISDIR
+ }
+ pc := rp.Component()
+ if pc == "." || pc == ".." {
+ return nil, syserror.EISDIR
+ }
+ // Determine whether or not we need to create a file.
+ childVFSD, err := rp.ResolveChild(vfsd, pc)
+ if err != nil {
+ return nil, err
+ }
+ if childVFSD == nil {
+ // Already checked for searchability above; now check for writability.
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+ return nil, err
+ }
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ defer rp.Mount().EndWrite()
+ // Create and open the child.
+ childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
+ child := fs.newDentry(childInode)
+ vfsd.InsertChild(&child.vfsd, pc)
+ inode.impl.(*directory).childList.PushBack(child)
+ // afterCreate == true: open() skips the permission check for the
+ // file that was just created.
+ return childInode.open(rp, &child.vfsd, opts.Flags, true)
+ }
+ // Open existing file or follow symlink.
+ if mustCreate {
+ return nil, syserror.EEXIST
+ }
+ childInode := childVFSD.Impl().(*Dentry).inode
+ if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+ // TODO: symlink traversals update access time
+ if err := rp.HandleSymlink(symlink.target); err != nil {
+ return nil, err
+ }
+ // rp.Final() may no longer be true since we now need to resolve the
+ // symlink target.
+ goto afterTrailingSymlink
+ }
+ return childInode.open(rp, childVFSD, opts.Flags, false)
+}
+
+// open creates a vfs.FileDescription for inode i, reached through vfsd on
+// rp.Mount(), using the given open flags.
+//
+// Unless afterCreate is true, the caller's credentials are first checked
+// against the access types implied by flags. Regular files and directories
+// yield a regularFileFD or directoryFD respectively; opening a directory for
+// writing fails with EISDIR and opening a symlink fails with ELOOP. Any other
+// inode implementation causes a panic.
+func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if !afterCreate {
+ if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
+ return nil, err
+ }
+ }
+ switch impl := i.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.flags = flags
+ fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+ fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+ if fd.writable {
+ if err := rp.Mount().CheckBeginWrite(); err != nil {
+ return nil, err
+ }
+ // Mount.EndWrite() is called by regularFileFD.Release().
+ }
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ if flags&linux.O_TRUNC != 0 {
+ // Truncate to empty; dataLen mirrors len(data) and is updated
+ // atomically so it can be read without taking impl.mu.
+ impl.mu.Lock()
+ impl.data = impl.data[:0]
+ atomic.StoreInt64(&impl.dataLen, 0)
+ impl.mu.Unlock()
+ }
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ fd.flags = flags
+ return &fd.vfsfd, nil
+ case *symlink:
+ // Can't open symlinks without O_PATH (which is unimplemented).
+ return nil, syserror.ELOOP
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+ }
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+//
+// It returns the target of the symlink named by rp, or EINVAL if rp resolves
+// to a file that is not a symlink.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	_, ino, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	if err != nil {
+		return "", err
+	}
+	if link, isLink := ino.impl.(*symlink); isLink {
+		return link.target, nil
+	}
+	return "", syserror.EINVAL
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+//
+// Renaming is not implemented: after checking that the destination name could
+// be created and that the mount is writable, RenameAt always fails with
+// EPERM. ENOENT is returned if rp has no remaining components.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+	if rp.Done() {
+		// FIXME
+		return syserror.ENOENT
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	if _, err := checkCreateLocked(rp, dirVFSD, dirInode); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	// TODO: actually implement RenameAt
+	return syserror.EPERM
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+//
+// RmdirAt removes the empty directory named by rp. It returns ENOTDIR if rp
+// does not name a directory, ENOTEMPTY if the directory has children, EBUSY
+// or ENOENT from checkDeleteLocked, or any error from path resolution,
+// Mount.CheckBeginWrite or VirtualFilesystem.DeleteDentry.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, inode, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(vfsd); err != nil {
+		return err
+	}
+	if !inode.isDir() {
+		return syserror.ENOTDIR
+	}
+	if vfsd.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	// checkDeleteLocked guarantees a non-nil parent. Capture its inode
+	// before the dentry is deleted from the VFS tree.
+	parentInode := vfsd.Parent().Impl().(*Dentry).inode
+	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	// Undo what MkdirAt did: unlink the child from the parent's childList
+	// (otherwise directoryFD.IterDirents would keep reporting the removed
+	// directory) and drop the parent link contributed by the child's "..".
+	parentInode.impl.(*directory).childList.Remove(vfsd.Impl().(*Dentry))
+	parentInode.decLinksLocked()
+	// A directory's nlink never reaches 0 because of ".", so the reference
+	// held by the filesystem tree is dropped explicitly.
+	inode.decRef()
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+//
+// Changing metadata is not implemented: once rp has been resolved, an empty
+// mask succeeds as a no-op and any other request fails with EPERM.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, _, walkErr := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	switch {
+	case walkErr != nil:
+		return walkErr
+	case opts.Stat.Mask == 0:
+		return nil
+	default:
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+//
+// The path is resolved under a read lock; the metadata itself is read from
+// the inode after the lock has been released. A zero linux.Statx accompanies
+// any resolution error.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var result linux.Statx
+	fs.mu.RLock()
+	_, ino, err := walkExistingLocked(rp)
+	fs.mu.RUnlock()
+	if err == nil {
+		ino.statTo(&result)
+	}
+	return result, err
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ fs.mu.RLock()
+ _, _, err := walkExistingLocked(rp)
+ fs.mu.RUnlock()
+ if err != nil {
+ return linux.Statfs{}, err
+ }
+ // TODO: actually implement statfs
+ return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+//
+// A new symlink pointing at target, owned by rp's credentials, is created
+// under the name given by rp's final component. EEXIST is returned if rp has
+// no remaining components or the name is already taken.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	dirVFSD, dirInode, err := walkParentDirLocked(rp)
+	if err != nil {
+		return err
+	}
+	name, err := checkCreateLocked(rp, dirVFSD, dirInode)
+	if err != nil {
+		return err
+	}
+	if err = rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	linkInode := fs.newSymlink(rp.Credentials(), target)
+	linkDentry := fs.newDentry(linkInode)
+	dirVFSD.InsertChild(&linkDentry.vfsd, name)
+	dirInode.impl.(*directory).childList.PushBack(linkDentry)
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+//
+// UnlinkAt removes the non-directory name given by rp and drops one link from
+// its inode. It returns EISDIR if rp names a directory, EBUSY or ENOENT from
+// checkDeleteLocked, or any error from path resolution,
+// Mount.CheckBeginWrite or VirtualFilesystem.DeleteDentry.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, inode, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(vfsd); err != nil {
+		return err
+	}
+	if inode.isDir() {
+		return syserror.EISDIR
+	}
+	// checkDeleteLocked guarantees a non-nil parent. Capture its inode
+	// before the dentry is deleted from the VFS tree.
+	parentInode := vfsd.Parent().Impl().(*Dentry).inode
+	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+		return err
+	}
+	// Unlink the dentry from the parent's childList as well; otherwise
+	// directoryFD.IterDirents would keep reporting the removed name.
+	parentInode.impl.(*directory).childList.Remove(vfsd.Impl().(*Dentry))
+	inode.decLinksLocked()
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
new file mode 100644
index 000000000..f381e1a88
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package memfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// memfs is intended primarily to demonstrate filesystem implementation
+// patterns. Real use cases for an in-memory filesystem should use tmpfs
+// instead.
+//
+// Lock order:
+//
+// Filesystem.mu
+// regularFileFD.offMu
+// regularFile.mu
+// Inode.mu
+package memfs
+
+import (
+ "fmt"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType. It carries no state; its
+// NewFilesystem method creates a fresh, empty memfs Filesystem.
+type FilesystemType struct{}
+
+// Filesystem implements vfs.FilesystemImpl.
+type Filesystem struct {
+ // vfsfs is the vfs.Filesystem for which this Filesystem is the
+ // implementation; it is initialized by FilesystemType.NewFilesystem.
+ vfsfs vfs.Filesystem
+
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
+
+ // nextInoMinusOne is atomically incremented by Inode.init, which uses the
+ // incremented value as the new inode's number.
+ nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ var fs Filesystem
+ fs.vfsfs.Init(&fs)
+ root := fs.newDentry(fs.newDirectory(creds, 01777))
+ return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+//
+// It is a no-op.
+func (fs *Filesystem) Release() {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+//
+// It always succeeds without doing any work.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+ // All filesystem state is in-memory.
+ return nil
+}
+
+// Dentry implements vfs.DentryImpl.
+type Dentry struct {
+ // vfsd is the vfs.Dentry for which this Dentry is the implementation; it
+ // is initialized by Filesystem.newDentry.
+ vfsd vfs.Dentry
+
+ // inode is the inode represented by this Dentry. Multiple Dentries may
+ // share a single non-directory Inode (with hard links). inode is
+ // immutable.
+ //
+ // The fake Dentries that directoryFDs link into directory.childList as
+ // iteration markers have a nil inode.
+ inode *Inode
+
+ // memfs doesn't count references on Dentries; because the Dentry tree is
+ // the sole source of truth, it is by definition always consistent with the
+ // state of the filesystem. However, it does count references on Inodes,
+ // because Inode resources are released when all references are dropped.
+ // (memfs doesn't really have resources to release, but we implement
+ // reference counting because tmpfs regular files will.)
+
+ // dentryEntry (ugh) links Dentries into their parent directory.childList.
+ dentryEntry
+}
+
+// newDentry returns a new Dentry representing inode, with its embedded
+// vfs.Dentry initialized to point back at it.
+func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
+	dentry := new(Dentry)
+	dentry.inode = inode
+	dentry.vfsd.Init(dentry)
+	return dentry
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+//
+// memfs does not count references on Dentries, so the reference is taken on
+// the underlying Inode instead. vfsfs is unused.
+func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+ d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+//
+// It reports whether a reference could be taken on the underlying Inode,
+// which fails once the Inode's reference count has reached zero. vfsfs is
+// unused.
+func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+ return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+//
+// The reference is dropped from the underlying Inode. vfsfs is unused.
+func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+ d.inode.decRef()
+}
+
+// Inode represents a filesystem object.
+type Inode struct {
+ // refs is a reference count. refs is accessed using atomic memory
+ // operations.
+ //
+ // A reference is held on all Inodes that are reachable in the filesystem
+ // tree. For non-directories (which may have multiple hard links), this
+ // means that a reference is dropped when nlink reaches 0. For directories,
+ // nlink never reaches 0 due to the "." entry; instead,
+ // Filesystem.RmdirAt() drops the reference.
+ refs int64
+
+ // Inode metadata; protected by mu and accessed using atomic memory
+ // operations unless otherwise specified.
+ mu sync.RWMutex
+ mode uint32 // excluding file type bits, which are based on impl
+ nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+ uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+ gid uint32 // auth.KGID, but ...
+ ino uint64 // immutable
+
+ // impl is the file-type-specific object that embeds this Inode: a
+ // *regularFile, *directory or *symlink. Code in this package type-switches
+ // on it to determine the file type.
+ impl interface{} // immutable
+}
+
+// init prepares i for use as the inode of impl in fs: it takes the initial
+// reference, records the permission bits and the effective owner from creds,
+// and allocates the next inode number from fs. The caller is responsible for
+// setting i.nlink.
+func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+	i.impl = impl
+	i.refs = 1
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	i.mode = uint32(mode)
+	i.uid, i.gid = uint32(creds.EffectiveKUID), uint32(creds.EffectiveKGID)
+	// i.nlink initialized by caller
+}
+
+// incLinksLocked adds one link to i. It panics if i had no links before the
+// call.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) incLinksLocked() {
+	links := atomic.AddUint32(&i.nlink, 1)
+	if links > 1 {
+		return
+	}
+	panic("memfs.Inode.incLinksLocked() called with no existing links")
+}
+
+// decLinksLocked removes one link from i, dropping a reference on i when the
+// last link goes away. It panics if i had no links before the call.
+//
+// Preconditions: Filesystem.mu must be locked for writing.
+func (i *Inode) decLinksLocked() {
+	// Adding ^uint32(0) is the sync/atomic idiom for subtracting 1.
+	switch atomic.AddUint32(&i.nlink, ^uint32(0)) {
+	case 0:
+		i.decRef()
+	case ^uint32(0): // negative overflow
+		panic("memfs.Inode.decLinksLocked() called with no existing links")
+	}
+}
+
+// incRef takes an additional reference on i. The caller must already hold
+// one; otherwise incRef panics.
+func (i *Inode) incRef() {
+	refs := atomic.AddInt64(&i.refs, 1)
+	if refs > 1 {
+		return
+	}
+	panic("memfs.Inode.incRef() called without holding a reference")
+}
+
+// tryIncRef takes a reference on i unless its reference count is zero, and
+// reports whether the reference was taken. It retries whenever the count
+// changes between the load and the compare-and-swap.
+func (i *Inode) tryIncRef() bool {
+	cur := atomic.LoadInt64(&i.refs)
+	for cur != 0 {
+		if atomic.CompareAndSwapInt64(&i.refs, cur, cur+1) {
+			return true
+		}
+		cur = atomic.LoadInt64(&i.refs)
+	}
+	return false
+}
+
+// decRef drops a reference on i. When the last reference is dropped and i is
+// a regular file, its contents are discarded. decRef panics if the reference
+// count becomes negative.
+func (i *Inode) decRef() {
+	remaining := atomic.AddInt64(&i.refs, -1)
+	switch {
+	case remaining < 0:
+		panic("memfs.Inode.decRef() called without holding a reference")
+	case remaining == 0:
+		// This is unnecessary; it's mostly to simulate what tmpfs would do.
+		file, isRegular := i.impl.(*regularFile)
+		if !isRegular {
+			return
+		}
+		file.mu.Lock()
+		file.data = nil
+		atomic.StoreInt64(&file.dataLen, 0)
+		file.mu.Unlock()
+	}
+}
+
+// checkPermissions checks whether creds may access i with the access types
+// ats, by handing atomic snapshots of i's mode and ownership to
+// vfs.GenericCheckPermissions.
+func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	mode := uint16(atomic.LoadUint32(&i.mode))
+	owner := auth.KUID(atomic.LoadUint32(&i.uid))
+	group := auth.KGID(atomic.LoadUint32(&i.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, owner, group)
+}
+
+// statTo fills stat with i's metadata. Size and block count are reported
+// only for regular files and symlinks. It panics on an unknown inode type.
+//
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+func (i *Inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // usermem.PageSize in tmpfs
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+
+	// Each file type contributes its type bits; regular files and symlinks
+	// also report a size.
+	var (
+		size    uint64
+		hasSize bool
+	)
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mode |= linux.S_IFREG
+		size, hasSize = uint64(atomic.LoadInt64(&impl.dataLen)), true
+	case *directory:
+		stat.Mode |= linux.S_IFDIR
+	case *symlink:
+		stat.Mode |= linux.S_IFLNK
+		size, hasSize = uint64(len(impl.target)), true
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+	if !hasSize {
+		return
+	}
+	stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+	stat.Size = size
+	// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
+	// a uint64 accessed using atomic memory operations to avoid taking
+	// locks).
+	stat.Blocks = allocatedBlocksForSize(size)
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+ return (size + 511) / 512
+}
+
+func (i *Inode) direntType() uint8 {
+ switch i.impl.(type) {
+ case *regularFile:
+ return linux.DT_REG
+ case *directory:
+ return linux.DT_DIR
+ case *symlink:
+ return linux.DT_LNK
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+ }
+}
+
+// fileDescription is embedded by memfs implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ // vfsfd is the vfs.FileDescription for which the embedding type is the
+ // implementation; Inode.open initializes it and returns a pointer to it.
+ vfsfd vfs.FileDescription
+
+ flags uint32 // status flags; immutable
+}
+
+// filesystem returns the memfs Filesystem of the mount through which fd was
+// opened.
+func (fd *fileDescription) filesystem() *Filesystem {
+	mnt := fd.vfsfd.VirtualDentry().Mount()
+	impl := mnt.Filesystem().Impl()
+	return impl.(*Filesystem)
+}
+
+// inode returns the Inode of the Dentry through which fd was opened.
+func (fd *fileDescription) inode() *Inode {
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+	dentry := vfsd.Impl().(*Dentry)
+	return dentry.inode
+}
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+//
+// It returns the flags recorded when the file was opened; the error is always
+// nil.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+ return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+//
+// The requested flags are ignored and nil is always returned.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+ // no-op.
+ return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+//
+// It reports the metadata of fd's inode via Inode.statTo. opts is ignored and
+// the error is always nil.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ fd.inode().statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+//
+// Changing metadata is not implemented: an empty mask succeeds as a no-op and
+// any other request fails with EPERM.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask != 0 {
+		// TODO: implement Inode.setStat
+		return syserror.EPERM
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
new file mode 100644
index 000000000..4a3603cc8
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -0,0 +1,155 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "io"
+ "sync"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile is the inode implementation for memfs regular files. File
+// contents are kept in the data byte slice.
+type regularFile struct {
+ // inode is the Inode stored by value in this file; newRegularFile
+ // initializes it and returns a pointer to it.
+ inode Inode
+
+ // mu guards data: PRead holds it for reading, PWrite holds it for writing.
+ mu sync.RWMutex
+ data []byte
+ // dataLen is len(data), but accessed using atomic memory operations to
+ // avoid locking in Inode.stat().
+ dataLen int64
+}
+
+// newRegularFile allocates an empty regularFile, initializes its inode with
+// the given credentials and mode, and returns that inode with a link count of
+// one.
+func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+	f := new(regularFile)
+	ino := &f.inode
+	ino.init(f, fs, creds, mode)
+	ino.nlink = 1 // from parent directory
+	return ino
+}
+
+// regularFileFD is the file description implementation for memfs regular
+// files. It combines the shared fileDescription state with a per-description
+// file offset.
+type regularFileFD struct {
+ fileDescription
+ vfs.FileDescriptionDefaultImpl
+
+ // These are immutable.
+ // readable and writable gate PRead and PWrite respectively; writable also
+ // decides whether Release calls Mount.EndWrite.
+ readable bool
+ writable bool
+
+ // off is the file offset. offMu serializes operations that may mutate off.
+ // NOTE(review): Read, Write and Seek access off with plain loads and
+ // stores while holding offMu, not with atomic operations.
+ off int64
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+//
+// For a writable description it calls EndWrite on the mount through which the
+// file was opened; read-only descriptions have nothing to release.
+func (fd *regularFileFD) Release() {
+	if !fd.writable {
+		return
+	}
+	fd.vfsfd.VirtualDentry().Mount().EndWrite()
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+//
+// It copies file contents starting at offset into dst and returns the number
+// of bytes copied together with any error from the copy. It returns EINVAL if
+// fd was not opened for reading or if offset is negative, and io.EOF if offset
+// is at or beyond the end of the file. The file offset of fd is not modified.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		// Reject negative offsets up front, as PWrite does; otherwise the
+		// slice expression below would panic.
+		return 0, syserror.EINVAL
+	}
+	f := fd.inode().impl.(*regularFile)
+	// Hold the read lock for the whole copy so a concurrent PWrite cannot
+	// reallocate f.data underneath it.
+	f.mu.RLock()
+	defer f.mu.RUnlock()
+	if offset >= int64(len(f.data)) {
+		return 0, io.EOF
+	}
+	n, err := dst.CopyOut(ctx, f.data[offset:])
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+//
+// It performs a PRead at the current file offset and advances the offset by
+// the number of bytes read, all while holding offMu.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+//
+// It copies src into the file at offset, zero-extending the file first if the
+// write ends past the current end of data. It returns EINVAL if fd is not
+// writable or offset is negative, and EFBIG if offset plus the source length
+// overflows int64. A zero-length write returns immediately.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable || offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	srclen := src.NumBytes()
+	if srclen == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	end := offset + srclen
+	if end < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	if grow := end - f.dataLen; grow > 0 {
+		// Extend with zero bytes so the destination range exists; this also
+		// fills any gap between the old end of file and offset.
+		f.data = append(f.data, make([]byte, grow)...)
+		atomic.StoreInt64(&f.dataLen, end)
+	}
+	n, err := src.CopyIn(ctx, f.data[offset:end])
+	return int64(n), err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+//
+// It performs a PWrite at the current file offset and advances the offset by
+// the number of bytes written, all while holding offMu.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // use offset as specified
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+//
+// It does nothing and always returns nil.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
new file mode 100644
index 000000000..e002d1727
--- /dev/null
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package memfs
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// symlink is the inode implementation for memfs symbolic links.
+type symlink struct {
+ // inode is the Inode stored by value in this symlink; newSymlink
+ // initializes it and returns a pointer to it.
+ inode Inode
+ // target is the link target string given to newSymlink.
+ target string // immutable
+}
+
+// newSymlink allocates a symlink pointing at target, initializes its inode
+// with the given credentials and mode 0777, and returns that inode with a link
+// count of one.
+func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+	link := new(symlink)
+	link.target = target
+	ino := &link.inode
+	ino.init(link, fs, creds, 0777)
+	ino.nlink = 1 // from parent directory
+	return ino
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
new file mode 100644
index 000000000..4de6c41cf
--- /dev/null
+++ b/pkg/sentry/vfs/BUILD
@@ -0,0 +1,46 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "vfs",
+ srcs = [
+ "context.go",
+ "debug.go",
+ "dentry.go",
+ "file_description.go",
+ "file_description_impl_util.go",
+ "filesystem.go",
+ "filesystem_type.go",
+ "mount.go",
+ "mount_unsafe.go",
+ "options.go",
+ "permissions.go",
+ "resolving_path.go",
+ "syscalls.go",
+ "vfs.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/fspath",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/usermem",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ "//third_party/gvsync",
+ ],
+)
+
+go_test(
+ name = "vfs_test",
+ size = "small",
+ srcs = [
+ "mount_test.go",
+ ],
+ embed = [":vfs"],
+)
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
new file mode 100644
index 000000000..7847854bc
--- /dev/null
+++ b/pkg/sentry/vfs/README.md
@@ -0,0 +1,197 @@
+# The gVisor Virtual Filesystem
+
+THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION
+USE. For the filesystem implementation currently used by gVisor, see the `fs`
+package.
+
+## Implementation Notes
+
+### Reference Counting
+
+Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all
+reference-counted. Mount and MountNamespace are exclusively VFS-managed; when
+their reference count reaches zero, VFS releases their resources. Filesystem and
+FileDescription management is shared between VFS and filesystem implementations;
+when their reference count reaches zero, VFS notifies the implementation by
+calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()`
+respectively and then releases VFS-owned resources. Dentries are exclusively
+managed by filesystem implementations; reference count changes are abstracted
+through DentryImpl, which should release resources when reference count reaches
+zero.
+
+Filesystem references are held by:
+
+- Mount: Each referenced Mount holds a reference on the mounted Filesystem.
+
+Dentry references are held by:
+
+- FileDescription: Each referenced FileDescription holds a reference on the
+ Dentry through which it was opened, via `FileDescription.vd.dentry`.
+
+- Mount: Each referenced Mount holds a reference on its mount point and on the
+ mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`).
+
+Mount references are held by:
+
+- FileDescription: Each referenced FileDescription holds a reference on the
+ Mount on which it was opened, via `FileDescription.vd.mount`.
+
+- Mount: Each referenced Mount holds a reference on its parent, which is the
+ mount containing its mount point.
+
+- VirtualFilesystem: A reference is held on all Mounts that are attached
+ (reachable by Mount traversal).
+
+MountNamespace and FileDescription references are held by users of VFS. The
+expectation is that each `kernel.Task` holds a reference on its corresponding
+MountNamespace, and each file descriptor holds a reference on its represented
+FileDescription.
+
+Notes:
+
+- Dentries do not hold a reference on their owning Filesystem. Instead, all
+ uses of a Dentry occur in the context of a Mount, which holds a reference on
+ the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary,
+ when releasing references on both a Dentry and its corresponding Mount, the
+ Dentry's reference must be released first (because releasing the Mount's
+ reference may release the last reference on the Filesystem, whose state may
+ be required to release the Dentry reference).
+
+### The Inheritance Pattern
+
+Filesystem, Dentry, and FileDescription are all concepts featuring both state
+that must be shared between VFS and filesystem implementations, and operations
+that are implementation-defined. To facilitate this, each of these three
+concepts follows the same pattern, shown below for Dentry:
+
+```go
+// Dentry represents a node in a filesystem tree.
+type Dentry struct {
+ // VFS-required dentry state.
+ parent *Dentry
+ // ...
+
+ // impl is the DentryImpl associated with this Dentry. impl is immutable.
+ // This should be the last field in Dentry.
+ impl DentryImpl
+}
+
+// Init must be called before first use of d.
+func (d *Dentry) Init(impl DentryImpl) {
+ d.impl = impl
+}
+
+// Impl returns the DentryImpl associated with d.
+func (d *Dentry) Impl() DentryImpl {
+ return d.impl
+}
+
+// DentryImpl contains implementation-specific details of a Dentry.
+// Implementations of DentryImpl should contain their associated Dentry by
+// value as their first field.
+type DentryImpl interface {
+ // VFS-required implementation-defined dentry operations.
+ IncRef()
+ // ...
+}
+```
+
+This construction, which is essentially a type-safe analogue to Linux's
+`container_of` pattern, has the following properties:
+
+- VFS works almost exclusively with pointers to Dentry rather than DentryImpl
+ interface objects, such as in the type of `Dentry.parent`. This avoids
+ interface method calls (which are somewhat expensive to perform, and defeat
+ inlining and escape analysis), reduces the size of VFS types (since an
+ interface object is two pointers in size), and allows pointers to be loaded
+ and stored atomically using `sync/atomic`. Implementation-defined behavior
+ is accessed via `Dentry.impl` when required.
+
+- Filesystem implementations can access the implementation-defined state
+ associated with objects of VFS types by type-asserting or type-switching
+ (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type
+ require only an equality comparison of the interface object's type pointer
+ to a static constant, and are consequently very fast.
+
+- Filesystem implementations can access the VFS state associated with objects
+ of implementation-defined types directly.
+
+- VFS and implementation-defined state for a given type occupy the same
+ object, minimizing memory allocations and maximizing memory locality. `impl`
+ is the last field in `Dentry`, and `Dentry` is the first field in
+ `DentryImpl` implementations, for similar reasons: this tends to cause
+ fetching of the `Dentry.impl` interface object to also fetch `DentryImpl`
+ fields, either because they are in the same cache line or via next-line
+ prefetching.
+
+## Future Work
+
+- Most `mount(2)` features, and unmounting, are incomplete.
+
+- VFS1 filesystems are not directly compatible with VFS2. It may be possible
+ to implement shims that implement `vfs.FilesystemImpl` for
+ `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and
+ `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for
+ filesystems that are not performance-critical (e.g. sysfs); however, it is
+ not clear that this will be less effort than simply porting the filesystems
+ in question. Practically speaking, the following filesystems will probably
+ need to be ported or made compatible through a shim to evaluate filesystem
+ performance on realistic workloads:
+
+ - devfs/procfs/sysfs, which will realistically be necessary to execute
+ most applications. (Note that procfs and sysfs do not support hard
+ links, so they do not require the complexity of separate inode objects.
+ Also note that Linux's /dev is actually a variant of tmpfs called
+ devtmpfs.)
+
+ - tmpfs. This should be relatively straightforward: copy/paste memfs,
+ store regular file contents in pgalloc-allocated memory instead of
+ `[]byte`, and add support for file timestamps. (In fact, it probably
+ makes more sense to convert memfs to tmpfs and not keep the former.)
+
+ - A remote filesystem, either lisafs (if it is ready by the time that
+ other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers).
+
+ - epoll files.
+
+ Filesystems that will need to be ported before switching to VFS2, but can
+ probably be skipped for early testing:
+
+ - overlayfs, which is needed for (at least) synthetic mount points.
+
+ - Support for host ttys.
+
+ - timerfd files.
+
+ Filesystems that can probably be dropped:
+
+ - ashmem, which is far too incomplete to use.
+
+ - binder, which is similarly far too incomplete to use.
+
+ - whitelistfs, which we are already actively attempting to remove.
+
+- Save/restore. For instance, it is unclear if the current implementation of
+ the `state` package supports the inheritance pattern described above.
+
+- Many features that were previously implemented by VFS must now be
+ implemented by individual filesystems (though, in most cases, this should
+ consist of calls to hooks or libraries provided by `vfs` or other packages).
+ This includes, but is not necessarily limited to:
+
+ - Block and character device special files
+
+ - Inotify
+
+ - File locking
+
+ - `O_ASYNC`
+
+- Reference counts in the `vfs` package do not use the `refs` package since
+ `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference
+ count, resulting in considerable cache bloat. 24 bytes of this overhead are
+ for weak reference support, which has poor performance and will not be used
+ by VFS2. The remaining 40 bytes store a descriptive string and stack
+ trace for reference leak checking; we can support reference leak checking
+ without incurring this space overhead by including the applicable
+ information directly in finalizers for applicable types.
diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go
new file mode 100644
index 000000000..32cf9151b
--- /dev/null
+++ b/pkg/sentry/vfs/context.go
@@ -0,0 +1,37 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/context"
+)
+
+// contextID is this package's type for context.Context.Value keys.
+type contextID int
+
+const (
+ // CtxMountNamespace is a Context.Value key for a MountNamespace.
+ CtxMountNamespace contextID = iota
+)
+
+// MountNamespaceFromContext returns the MountNamespace stored in ctx under the
+// CtxMountNamespace key, or nil if ctx has no value for that key. No reference
+// is taken on the returned MountNamespace.
+func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
+	v := ctx.Value(CtxMountNamespace)
+	if v == nil {
+		return nil
+	}
+	return v.(*MountNamespace)
+}
diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go
new file mode 100644
index 000000000..0ed20f249
--- /dev/null
+++ b/pkg/sentry/vfs/debug.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+const (
+ // If checkInvariants is true, perform runtime checks for invariants
+ // expected by the vfs package. This is normally disabled since VFS is
+ // often a hot path.
+ checkInvariants = false
+)
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
new file mode 100644
index 000000000..45912fc58
--- /dev/null
+++ b/pkg/sentry/vfs/dentry.go
@@ -0,0 +1,347 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Dentry represents a node in a Filesystem tree which may represent a file.
+//
+// Dentries are reference-counted. Unless otherwise specified, all Dentry
+// methods require that a reference is held.
+//
+// A Dentry transitions through up to 3 different states through its lifetime:
+//
+// - Dentries are initially "independent". Independent Dentries have no parent,
+// and consequently no name.
+//
+// - Dentry.InsertChild() causes an independent Dentry to become a "child" of
+// another Dentry. A child node has a parent node, and a name in that parent,
+// both of which are mutable by DentryMoveChild(). Each child Dentry's name is
+// unique within its parent.
+//
+// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A
+// disowned Dentry can still refer to its former parent and its former name in
+// said parent, but the disowned Dentry is no longer reachable from its parent,
+// and a new Dentry with the same name may become a child of the parent. (This
+// is analogous to a struct dentry being "unhashed" in Linux.)
+//
+// Dentry is loosely analogous to Linux's struct dentry, but:
+//
+// - VFS does not associate Dentries with inodes. gVisor interacts primarily
+// with filesystems that are accessed through filesystem APIs (as opposed to
+// raw block devices); many such APIs support only paths and file descriptors,
+// and not inodes. Furthermore, when parties outside the scope of VFS can
+// rename inodes on such filesystems, VFS generally cannot "follow" the rename,
+// both due to synchronization issues and because it may not even be able to
+// name the destination path; this implies that it would in fact be *incorrect*
+// for Dentries to be associated with inodes on such filesystems. Consequently,
+// operations that are inode operations in Linux are FilesystemImpl methods
+// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
+// support inodes may store appropriate state in implementations of DentryImpl.
+//
+// - VFS does not provide synchronization for mutable Dentry fields, other than
+// mount-related ones.
+//
+// - VFS does not require that Dentries are instantiated for all paths accessed
+// through VFS, only those that are tracked beyond the scope of a single
+// Filesystem operation. This includes file descriptions, mount points, mount
+// roots, process working directories, and chroots. This avoids instantiation
+// of Dentries for operations on mutable remote filesystems that can't actually
+// cache any state in the Dentry.
+//
+// - For the reasons above, VFS is not directly responsible for managing Dentry
+// lifetime. Dentry reference counts only indicate the extent to which VFS
+// requires Dentries to exist; Filesystems may elect to cache or discard
+// Dentries with zero references.
+type Dentry struct {
+ // parent is this Dentry's parent in this Filesystem. If this Dentry is
+ // independent, parent is nil.
+ parent *Dentry
+
+ // name is this Dentry's name in parent.
+ name string
+
+ // flags is a bitmask of dflags* constants (dflagsDisownedMask marks a
+ // disowned Dentry). IsDisowned and setDisowned access flags using atomic
+ // memory operations.
+ flags uint32
+
+ // mounts is the number of Mounts for which this Dentry is Mount.point.
+ // mounts is accessed using atomic memory operations.
+ mounts uint32
+
+ // children are child Dentries.
+ // The map is keyed by child name and is allocated lazily, so it is nil
+ // until the first child is inserted.
+ children map[string]*Dentry
+
+ // impl is the DentryImpl associated with this Dentry. impl is immutable.
+ // This should be the last field in Dentry.
+ impl DentryImpl
+}
+
+const (
+ // dflagsDisownedMask is set in Dentry.flags if the Dentry has been
+ // disowned.
+ dflagsDisownedMask = 1 << iota
+)
+
+// Init must be called before first use of d.
+//
+// It records impl as the DentryImpl returned by Impl and used by the
+// reference-counting helpers; no other field is touched.
+func (d *Dentry) Init(impl DentryImpl) {
+ d.impl = impl
+}
+
+// Impl returns the DentryImpl associated with d, i.e. the value passed to
+// Init. It returns nil if Init has not been called.
+func (d *Dentry) Impl() DentryImpl {
+ return d.impl
+}
+
+// DentryImpl contains implementation details for a Dentry. Implementations of
+// DentryImpl should contain their associated Dentry by value as their first
+// field.
+//
+// Each method receives the *Filesystem that the vfs package passes to the
+// corresponding Dentry.incRef, Dentry.tryIncRef or Dentry.decRef helper.
+// NOTE(review): presumably this is the Filesystem that owns the Dentry; confirm
+// against callers.
+type DentryImpl interface {
+ // IncRef increments the Dentry's reference count. A Dentry with a non-zero
+ // reference count must remain coherent with the state of the filesystem.
+ IncRef(fs *Filesystem)
+
+ // TryIncRef increments the Dentry's reference count and returns true. If
+ // the Dentry's reference count is zero, TryIncRef may do nothing and
+ // return false. (It is also permitted to succeed if it can restore the
+ // guarantee that the Dentry is coherent with the state of the filesystem.)
+ //
+ // TryIncRef does not require that a reference is held on the Dentry.
+ TryIncRef(fs *Filesystem) bool
+
+ // DecRef decrements the Dentry's reference count.
+ DecRef(fs *Filesystem)
+}
+
+// IsDisowned reports whether the disowned bit is set in d.flags. The flags
+// word is loaded atomically.
+func (d *Dentry) IsDisowned() bool {
+	flags := atomic.LoadUint32(&d.flags)
+	return flags&dflagsDisownedMask != 0
+}
+
+// setDisowned marks d as disowned by setting dflagsDisownedMask in d.flags.
+//
+// Preconditions: !d.IsDisowned().
+func (d *Dentry) setDisowned() {
+ // The bit is set with an atomic add rather than an atomic OR, which is
+ // why the precondition matters: adding the mask to a word that already
+ // has the bit set would clear it and carry into the next bit.
+ atomic.AddUint32(&d.flags, dflagsDisownedMask)
+}
+
+// isMounted reports whether d.mounts, loaded atomically, is non-zero, i.e.
+// whether d is the mount point of at least one Mount.
+func (d *Dentry) isMounted() bool {
+	n := atomic.LoadUint32(&d.mounts)
+	return n > 0
+}
+
+// incRef forwards to d.impl.IncRef(fs); the reference count itself is
+// maintained by the DentryImpl.
+func (d *Dentry) incRef(fs *Filesystem) {
+ d.impl.IncRef(fs)
+}
+
+// tryIncRef forwards to d.impl.TryIncRef(fs) and returns its result.
+func (d *Dentry) tryIncRef(fs *Filesystem) bool {
+ return d.impl.TryIncRef(fs)
+}
+
+// decRef forwards to d.impl.DecRef(fs); the reference count itself is
+// maintained by the DentryImpl.
+func (d *Dentry) decRef(fs *Filesystem) {
+ d.impl.DecRef(fs)
+}
+
+// These functions are exported so that filesystem implementations can use
+// them. The vfs package, and users of VFS, should not call these functions.
+// Unless otherwise specified, these methods require that there are no
+// concurrent mutators of d.
+
+// Name returns d's name in its parent in its owning Filesystem. If d is
+// independent, Name returns an empty string.
+//
+// Name reads d.name without any synchronization.
+func (d *Dentry) Name() string {
+ return d.name
+}
+
+// Parent returns d's parent in its owning Filesystem. It does not take a
+// reference on the returned Dentry. If d is independent, Parent returns nil.
+//
+// Parent reads d.parent without any synchronization.
+func (d *Dentry) Parent() *Dentry {
+ return d.parent
+}
+
+// ParentOrSelf returns d's parent when it has one and d itself otherwise, so
+// the result is never nil for a non-nil d.
+func (d *Dentry) ParentOrSelf() *Dentry {
+	if p := d.parent; p != nil {
+		return p
+	}
+	return d
+}
+
+// Child returns d's child with the given name in its owning Filesystem. It
+// does not take a reference on the returned Dentry. If no such child exists,
+// Child returns nil.
+//
+// This is a plain map lookup in d.children, so it is also safe on a Dentry
+// whose children map has not been allocated yet.
+func (d *Dentry) Child(name string) *Dentry {
+ return d.children[name]
+}
+
+// HasChildren reports whether d's children map holds at least one entry.
+func (d *Dentry) HasChildren() bool {
+	return len(d.children) > 0
+}
+
+// InsertChild links child under d with the given name: it records d and name
+// in child and adds child to d's children map, allocating the map on first
+// use.
+//
+// InsertChild is a mutator of d and child.
+//
+// Preconditions: child must be an independent Dentry. d and child must be from
+// the same Filesystem. d must not already have a child with the given name.
+// The name-collision and independence preconditions are verified only when
+// checkInvariants is enabled.
+func (d *Dentry) InsertChild(child *Dentry, name string) {
+	if checkInvariants {
+		if _, exists := d.children[name]; exists {
+			panic(fmt.Sprintf("parent already contains a child named %q", name))
+		}
+		if !(child.parent == nil && child.name == "") {
+			panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name))
+		}
+	}
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*Dentry)
+	}
+	d.children[name] = child
+}
+
+// PrepareDeleteDentry must be called before attempting to delete the file
+// represented by d. If PrepareDeleteDentry succeeds, the caller must call
+// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
+//
+// It returns EBUSY, with no lock held, if d is a key in mntns.mountpoints.
+// On success it returns nil with vfs.mountMu still read-locked.
+//
+// Preconditions: d is a child Dentry.
+func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error {
+ if checkInvariants {
+ if d.parent == nil {
+ panic("d is independent")
+ }
+ if d.IsDisowned() {
+ panic("d is already disowned")
+ }
+ }
+ vfs.mountMu.RLock()
+ if _, ok := mntns.mountpoints[d]; ok {
+ // Failure path: drop the lock here because the caller will not call
+ // AbortDeleteDentry or CommitDeleteDentry after an error.
+ vfs.mountMu.RUnlock()
+ return syserror.EBUSY
+ }
+ // Return with vfs.mountMu locked, which will be unlocked by
+ // AbortDeleteDentry or CommitDeleteDentry.
+ return nil
+}
+
+// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion
+// fails.
+//
+// It only releases the read lock on vfs.mountMu that a successful
+// PrepareDeleteDentry left held.
+func (vfs *VirtualFilesystem) AbortDeleteDentry() {
+ vfs.mountMu.RUnlock()
+}
+
+// CommitDeleteDentry must be called after the file represented by d is
+// deleted, and causes d to become disowned.
+//
+// d keeps its parent and name fields; only the parent's children map and d's
+// flags are updated.
+//
+// Preconditions: PrepareDeleteDentry was previously called on d.
+func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
+ // Make d unreachable from its parent before flagging it as disowned.
+ delete(d.parent.children, d.name)
+ d.setDisowned()
+ // TODO: lazily unmount mounts at d
+ // Release the read lock left held by PrepareDeleteDentry.
+ vfs.mountMu.RUnlock()
+}
+
+// DeleteDentry runs PrepareDeleteDentry and, if that succeeds, immediately
+// CommitDeleteDentry. It suits in-memory filesystems, which have no external
+// state change that could fail between the two steps. Any error from the
+// prepare step is returned unchanged.
+func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
+	err := vfs.PrepareDeleteDentry(mntns, d)
+	if err == nil {
+		vfs.CommitDeleteDentry(d)
+	}
+	return err
+}
+
+// PrepareRenameDentry must be called before attempting to rename the file
+// represented by from. If to is not nil, it represents the file that will be
+// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the
+// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or
+// CommitRenameExchangeDentry depending on the rename's outcome.
+//
+// It returns EBUSY, with no lock held, if from or (when non-nil) to is a key
+// in mntns.mountpoints. On success it returns nil with vfs.mountMu still
+// read-locked.
+//
+// Preconditions: from is a child Dentry. If to is not nil, it must be a child
+// Dentry from the same Filesystem.
+func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
+ if checkInvariants {
+ if from.parent == nil {
+ panic("from is independent")
+ }
+ if from.IsDisowned() {
+ panic("from is already disowned")
+ }
+ if to != nil {
+ if to.parent == nil {
+ panic("to is independent")
+ }
+ if to.IsDisowned() {
+ panic("to is already disowned")
+ }
+ }
+ }
+ vfs.mountMu.RLock()
+ // Failure paths drop the lock themselves because the caller will not call
+ // an Abort or Commit function after an error.
+ if _, ok := mntns.mountpoints[from]; ok {
+ vfs.mountMu.RUnlock()
+ return syserror.EBUSY
+ }
+ if to != nil {
+ if _, ok := mntns.mountpoints[to]; ok {
+ vfs.mountMu.RUnlock()
+ return syserror.EBUSY
+ }
+ }
+ // Return with vfs.mountMu locked, which will be unlocked by
+ // AbortRenameDentry, CommitRenameReplaceDentry, or
+ // CommitRenameExchangeDentry.
+ return nil
+}
+
+// AbortRenameDentry must be called after PrepareRenameDentry if the rename
+// fails.
+//
+// It only releases the read lock on vfs.mountMu that a successful
+// PrepareRenameDentry left held.
+func (vfs *VirtualFilesystem) AbortRenameDentry() {
+ vfs.mountMu.RUnlock()
+}
+
+// CommitRenameReplaceDentry must be called after the file represented by from
+// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
+// that was replaced by from.
+//
+// After it returns, from is no longer a child of its old parent under its old
+// name; it is a child of newParent named newName, and to (if any) is disowned.
+// The read lock on vfs.mountMu left held by PrepareRenameDentry is released.
+//
+// Preconditions: PrepareRenameDentry was previously called on from and to.
+// newParent.Child(newName) == to.
+func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
+	if to != nil {
+		to.setDisowned()
+		// TODO: lazily unmount mounts at to
+	}
+	// Unlink from from its old location first; otherwise the old parent would
+	// keep resolving the old name to from. Doing this before the insertion
+	// below also keeps a rename onto the same parent and name correct.
+	delete(from.parent.children, from.name)
+	if newParent.children == nil {
+		newParent.children = make(map[string]*Dentry)
+	}
+	// This overwrites the entry for to, if there was one.
+	newParent.children[newName] = from
+	from.parent = newParent
+	from.name = newName
+	vfs.mountMu.RUnlock()
+}
+
+// CommitRenameExchangeDentry must be called after the files represented by
+// from and to are exchanged by rename(RENAME_EXCHANGE).
+//
+// Preconditions: PrepareRenameDentry was previously called on from and to.
+func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
+ // Swap the parent and name fields first, so that the map updates below
+ // index each parent's children by the post-exchange name.
+ from.parent, to.parent = to.parent, from.parent
+ from.name, to.name = to.name, from.name
+ // Each assignment overwrites the entry that previously pointed at the
+ // other Dentry, so no explicit delete is needed.
+ from.parent.children[from.name] = from
+ to.parent.children[to.name] = to
+ // Release the read lock left held by PrepareRenameDentry.
+ vfs.mountMu.RUnlock()
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
new file mode 100644
index 000000000..86bde7fb3
--- /dev/null
+++ b/pkg/sentry/vfs/file_description.go
@@ -0,0 +1,213 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// A FileDescription represents an open file description, which is the entity
+// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
+// Description").
+//
+// FileDescriptions are reference-counted. Unless otherwise specified, all
+// FileDescription methods require that a reference is held.
+//
+// FileDescription is analogous to Linux's struct file.
+type FileDescription struct {
+ // refs is the reference count. refs is accessed using atomic memory
+ // operations.
+ refs int64
+
+ // vd is the filesystem location at which this FileDescription was opened.
+ // A reference is held on vd. vd is immutable.
+ vd VirtualDentry
+
+ // impl is the FileDescriptionImpl associated with this FileDescription.
+ // impl is immutable. This should be the last field in FileDescription.
+ impl FileDescriptionImpl
+}
+
+// Init must be called before first use of fd. It sets fd's reference count to
+// 1, records impl as fd's implementation, and takes references on mnt and d.
+func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+ fd.refs = 1
+ fd.vd = VirtualDentry{
+ mount: mnt,
+ dentry: d,
+ }
+ // This is where the references on mnt and d are taken.
+ fd.vd.IncRef()
+ fd.impl = impl
+}
+
+// Impl returns the FileDescriptionImpl associated with fd, i.e. the impl that
+// was passed to Init.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+ return fd.impl
+}
+
+// VirtualDentry returns the location at which fd was opened (the Mount and
+// Dentry that were passed to Init). It does not take a reference on the
+// returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+ return fd.vd
+}
+
+// IncRef increments fd's reference count. It does not check that the count
+// was positive beforehand.
+func (fd *FileDescription) IncRef() {
+ atomic.AddInt64(&fd.refs, 1)
+}
+
+// DecRef decrements fd's reference count. When the count reaches zero, it
+// calls fd.impl.Release() and then drops the reference held on fd.vd. It
+// panics if the count becomes negative.
+func (fd *FileDescription) DecRef() {
+ if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
+ fd.impl.Release()
+ fd.vd.DecRef()
+ } else if refs < 0 {
+ panic("FileDescription.DecRef() called without holding a reference")
+ }
+}
+
+// FileDescriptionImpl contains implementation details for a FileDescription.
+// Implementations of FileDescriptionImpl should contain their associated
+// FileDescription by value as their first field.
+//
+// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
+// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
+// auth.KGID respectively).
+//
+// FileDescriptionImpl is analogous to Linux's struct file_operations.
+type FileDescriptionImpl interface {
+ // Release is called when the associated FileDescription reaches zero
+ // references.
+ Release()
+
+ // OnClose is called when a file descriptor representing the
+ // FileDescription is closed. Note that returning a non-nil error does not
+ // prevent the file descriptor from being closed.
+ OnClose() error
+
+ // StatusFlags returns file description status flags, as for
+ // fcntl(F_GETFL).
+ StatusFlags(ctx context.Context) (uint32, error)
+
+ // SetStatusFlags sets file description status flags, as for
+ // fcntl(F_SETFL).
+ SetStatusFlags(ctx context.Context, flags uint32) error
+
+ // Stat returns metadata for the file represented by the FileDescription.
+ Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
+
+ // SetStat updates metadata for the file represented by the
+ // FileDescription.
+ SetStat(ctx context.Context, opts SetStatOptions) error
+
+ // StatFS returns metadata for the filesystem containing the file
+ // represented by the FileDescription.
+ StatFS(ctx context.Context) (linux.Statfs, error)
+
+ // waiter.Waitable methods may be used to poll for I/O events.
+ waiter.Waitable
+
+ // PRead reads from the file into dst, starting at the given offset, and
+ // returns the number of bytes read. PRead is permitted to return partial
+ // reads with a nil error.
+ PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
+
+ // Read is similar to PRead, but does not specify an offset.
+ //
+ // For files with an implicit FileDescription offset (e.g. regular files),
+ // Read begins at the FileDescription offset, and advances the offset by
+ // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
+ // with Regular File Operations" requires that all operations that may
+ // mutate the FileDescription offset are serialized.
+ Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
+
+ // PWrite writes src to the file, starting at the given offset, and returns
+ // the number of bytes written. PWrite is permitted to return partial
+ // writes with a nil error.
+ //
+ // As in Linux (but not POSIX), if O_APPEND is in effect for the
+ // FileDescription, PWrite should ignore the offset and append data to the
+ // end of the file.
+ PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
+
+ // Write is similar to PWrite, but does not specify an offset, which is
+ // implied as for Read.
+ //
+ // Write is a FileDescriptionImpl method, instead of a wrapper around
+ // PWrite that uses a FileDescription offset, to make it possible for
+ // remote filesystems to implement O_APPEND correctly (i.e. atomically with
+ // respect to writers outside the scope of VFS).
+ Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
+
+ // IterDirents invokes cb on each entry in the directory represented by the
+ // FileDescription. If IterDirents has been called since the last call to
+ // Seek, it continues iteration from the end of the last call.
+ IterDirents(ctx context.Context, cb IterDirentsCallback) error
+
+ // Seek changes the FileDescription offset (assuming one exists) and
+ // returns its new value.
+ //
+ // For directories, if whence == SEEK_SET and offset == 0, the caller is
+ // rewinddir(), such that Seek "shall also cause the directory stream to
+ // refer to the current state of the corresponding directory" -
+ // POSIX.1-2017.
+ Seek(ctx context.Context, offset int64, whence int32) (int64, error)
+
+ // Sync requests that cached state associated with the file represented by
+ // the FileDescription is synchronized with persistent storage, and blocks
+ // until this is complete.
+ Sync(ctx context.Context) error
+
+ // ConfigureMMap mutates opts to implement mmap(2) for the file. Most
+ // implementations that support memory mapping can call
+ // GenericConfigureMMap with the appropriate memmap.Mappable.
+ //
+ // NOTE(review): opts is passed by value here, so mutations made by an
+ // implementation are not visible to the caller unless memmap.MMapOpts
+ // shares state internally - confirm whether a pointer was intended.
+ ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error
+
+ // Ioctl implements the ioctl(2) syscall.
+ Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
+
+ // TODO: extended attributes; file locking
+}
+
+// Dirent holds the information contained in struct linux_dirent64. Dirents are
+// passed by value to IterDirentsCallback.Handle.
+type Dirent struct {
+ // Name is the filename.
+ Name string
+
+ // Type is the file type, a linux.DT_* constant.
+ Type uint8
+
+ // Ino is the inode number.
+ Ino uint64
+
+ // Off is this Dirent's offset.
+ Off int64
+}
+
+// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
+type IterDirentsCallback interface {
+ // Handle handles the given iterated Dirent. It returns true if iteration
+ // should continue, and false if FileDescriptionImpl.IterDirents should
+ // terminate now and restart with the same Dirent the next time it is
+ // called.
+ Handle(dirent Dirent) bool
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
new file mode 100644
index 000000000..486893e70
--- /dev/null
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -0,0 +1,142 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+// FileDescriptionDefaultImpl may be embedded by implementations of
+// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
+// methods with default behavior analogous to Linux's.
+//
+// It does not provide Release, StatusFlags, SetStatusFlags, Stat, or SetStat;
+// embedders must implement those themselves.
+type FileDescriptionDefaultImpl struct{}
+
+// OnClose implements FileDescriptionImpl.OnClose analogously to
+// file_operations::flush == NULL in Linux. It does nothing and returns nil.
+func (FileDescriptionDefaultImpl) OnClose() error {
+ return nil
+}
+
+// StatFS implements FileDescriptionImpl.StatFS analogously to
+// super_operations::statfs == NULL in Linux. It always returns a zero
+// linux.Statfs and ENOSYS.
+func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) {
+ return linux.Statfs{}, syserror.ENOSYS
+}
+
+// Readiness implements waiter.Waitable.Readiness analogously to
+// file_operations::poll == NULL in Linux. It ignores mask and always reports
+// the file as both readable and writable.
+func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask {
+ // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK
+ return waiter.EventIn | waiter.EventOut
+}
+
+// EventRegister implements waiter.Waitable.EventRegister analogously to
+// file_operations::poll == NULL in Linux. It is a no-op.
+func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister analogously to
+// file_operations::poll == NULL in Linux. It is a no-op.
+func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) {
+}
+
+// PRead implements FileDescriptionImpl.PRead analogously to
+// file_operations::read == file_operations::read_iter == NULL in Linux. It
+// always returns EINVAL.
+func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.EINVAL
+}
+
+// Read implements FileDescriptionImpl.Read analogously to
+// file_operations::read == file_operations::read_iter == NULL in Linux. It
+// always returns EINVAL.
+func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ return 0, syserror.EINVAL
+}
+
+// PWrite implements FileDescriptionImpl.PWrite analogously to
+// file_operations::write == file_operations::write_iter == NULL in Linux. It
+// always returns EINVAL.
+func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.EINVAL
+}
+
+// Write implements FileDescriptionImpl.Write analogously to
+// file_operations::write == file_operations::write_iter == NULL in Linux. It
+// always returns EINVAL.
+func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EINVAL
+}
+
+// IterDirents implements FileDescriptionImpl.IterDirents analogously to
+// file_operations::iterate == file_operations::iterate_shared == NULL in
+// Linux. It never invokes cb and always returns ENOTDIR.
+func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements FileDescriptionImpl.Seek analogously to
+// file_operations::llseek == NULL in Linux. It always returns ESPIPE.
+func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// Sync implements FileDescriptionImpl.Sync analogously to
+// file_operations::fsync == NULL in Linux. It always returns EINVAL.
+func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error {
+ return syserror.EINVAL
+}
+
+// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to
+// file_operations::mmap == NULL in Linux. It leaves opts untouched and always
+// returns ENODEV.
+//
+// NOTE(review): opts is received by value, matching the interface method -
+// confirm whether a pointer was intended there.
+func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ return syserror.ENODEV
+}
+
+// Ioctl implements FileDescriptionImpl.Ioctl analogously to
+// file_operations::unlocked_ioctl == NULL in Linux. It always returns ENOTTY.
+func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ return 0, syserror.ENOTTY
+}
+
+// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
+// FileDescriptionImpl that always represent directories to obtain
+// implementations of non-directory I/O methods that return EISDIR, and
+// implementations of other methods consistent with FileDescriptionDefaultImpl.
+//
+// The overridden methods are PRead, Read, PWrite, and Write; everything else
+// is inherited from the embedded FileDescriptionDefaultImpl.
+type DirectoryFileDescriptionDefaultImpl struct {
+ FileDescriptionDefaultImpl
+}
+
+// PRead implements FileDescriptionImpl.PRead. It always returns EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Read implements FileDescriptionImpl.Read. It always returns EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// PWrite implements FileDescriptionImpl.PWrite. It always returns EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Write implements FileDescriptionImpl.Write. It always returns EISDIR.
+func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+ return 0, syserror.EISDIR
+}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
new file mode 100644
index 000000000..7a074b718
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem.go
@@ -0,0 +1,155 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+)
+
+// A Filesystem is a tree of nodes represented by Dentries, which forms part of
+// a VirtualFilesystem.
+//
+// Filesystems are reference-counted. Unless otherwise specified, all
+// Filesystem methods require that a reference is held.
+//
+// Filesystem is analogous to Linux's struct super_block.
+type Filesystem struct {
+ // refs is the reference count. refs is accessed using atomic memory
+ // operations.
+ refs int64
+
+ // impl is the FilesystemImpl associated with this Filesystem. impl is
+ // immutable. This should be the last field in Filesystem.
+ impl FilesystemImpl
+}
+
+// Init must be called before first use of fs. It sets fs's reference count to
+// 1 and records impl as fs's implementation.
+func (fs *Filesystem) Init(impl FilesystemImpl) {
+ fs.refs = 1
+ fs.impl = impl
+}
+
+// Impl returns the FilesystemImpl associated with fs, i.e. the impl that was
+// passed to Init.
+func (fs *Filesystem) Impl() FilesystemImpl {
+ return fs.impl
+}
+
+// incRef increments fs's reference count. It panics if the count was not
+// already positive, i.e. if the caller cannot be holding a reference.
+func (fs *Filesystem) incRef() {
+ // A post-increment value <= 1 means the count was <= 0 beforehand.
+ if atomic.AddInt64(&fs.refs, 1) <= 1 {
+ panic("Filesystem.incRef() called without holding a reference")
+ }
+}
+
+// decRef decrements fs's reference count. When the count reaches zero, it
+// calls fs.impl.Release(). It panics if the count becomes negative.
+func (fs *Filesystem) decRef() {
+ if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+ fs.impl.Release()
+ } else if refs < 0 {
+ panic("Filesystem.decRef() called without holding a reference")
+ }
+}
+
+// FilesystemImpl contains implementation details for a Filesystem.
+// Implementations of FilesystemImpl should contain their associated Filesystem
+// by value as their first field.
+//
+// All methods that take a ResolvingPath must resolve the path before
+// performing any other checks, including rejection of the operation if not
+// supported by the FilesystemImpl. This is because the final FilesystemImpl
+// (responsible for actually implementing the operation) isn't known until path
+// resolution is complete.
+//
+// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
+// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
+// and auth.KGID respectively).
+//
+// FilesystemImpl combines elements of Linux's struct super_operations and
+// struct inode_operations, for reasons described in the documentation for
+// Dentry.
+type FilesystemImpl interface {
+ // Release is called when the associated Filesystem reaches zero
+ // references.
+ Release()
+
+ // Sync "causes all pending modifications to filesystem metadata and cached
+ // file data to be written to the underlying [filesystem]", as by syncfs(2).
+ Sync(ctx context.Context) error
+
+ // GetDentryAt returns a Dentry representing the file at rp. A reference is
+ // taken on the returned Dentry.
+ //
+ // GetDentryAt does not correspond directly to a Linux syscall; it is used
+ // in the implementation of:
+ //
+ // - Syscalls that need to resolve two paths: rename(), renameat(),
+ // renameat2(), link(), linkat().
+ //
+ // - Syscalls that need to refer to a filesystem position outside the
+ // context of a file description: chdir(), fchdir(), chroot(), mount(),
+ // umount().
+ GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
+
+ // LinkAt creates a hard link at rp representing the same file as vd. It
+ // does not take ownership of references on vd.
+ //
+ // The implementation is responsible for checking that vd.Mount() ==
+ // rp.Mount(), and that vd does not represent a directory.
+ LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
+
+ // MkdirAt creates a directory at rp.
+ MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
+
+ // MknodAt creates a regular file, device special file, or named pipe at
+ // rp.
+ MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
+
+ // OpenAt returns a FileDescription providing access to the file at rp. A
+ // reference is taken on the returned FileDescription.
+ OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
+
+ // ReadlinkAt returns the target of the symbolic link at rp.
+ ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
+
+ // RenameAt renames the Dentry represented by vd to rp. It does not take
+ // ownership of references on vd.
+ //
+ // The implementation is responsible for checking that vd.Mount() ==
+ // rp.Mount().
+ RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+
+ // RmdirAt removes the directory at rp.
+ RmdirAt(ctx context.Context, rp *ResolvingPath) error
+
+ // SetStatAt updates metadata for the file at rp.
+ SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
+
+ // StatAt returns metadata for the file at rp.
+ StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error)
+
+ // StatFSAt returns metadata for the filesystem containing the file at rp.
+ // (This method takes a path because a FilesystemImpl may consist of any
+ // number of constituent filesystems.)
+ StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
+
+ // SymlinkAt creates a symbolic link at rp referring to the given target.
+ SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
+
+ // UnlinkAt removes the non-directory file at rp.
+ UnlinkAt(ctx context.Context, rp *ResolvingPath) error
+
+ // TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+}
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
new file mode 100644
index 000000000..f401ad7f3
--- /dev/null
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -0,0 +1,70 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// A FilesystemType constructs filesystems. FilesystemTypes are registered by
+// name with VirtualFilesystem.RegisterFilesystemType.
+//
+// FilesystemType is analogous to Linux's struct file_system_type.
+type FilesystemType interface {
+ // NewFilesystem returns a Filesystem configured by the given options,
+ // along with its mount root. A reference is taken on the returned
+ // Filesystem and Dentry.
+ //
+ // NewFilesystem is called by VirtualFilesystem.NewMountNamespace and
+ // VirtualFilesystem.NewMount.
+ NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error)
+}
+
+// NewFilesystemOptions contains options to FilesystemType.NewFilesystem, which
+// receives it by value.
+type NewFilesystemOptions struct {
+ // Data is the string passed as the 5th argument to mount(2), which is
+ // usually a comma-separated list of filesystem-specific mount options.
+ Data string
+
+ // InternalData holds opaque FilesystemType-specific data. There is
+ // intentionally no way for applications to specify InternalData; if it is
+ // not nil, the call to NewFilesystem originates from within the sentry.
+ InternalData interface{}
+}
+
+// RegisterFilesystemType registers the given FilesystemType in vfs with the
+// given name. It returns an error, and leaves the existing registration in
+// place, if name is already registered.
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+ // The lock is held across the lookup and the insert so that the
+ // check-then-register sequence is atomic.
+ vfs.fsTypesMu.Lock()
+ defer vfs.fsTypesMu.Unlock()
+ if existing, ok := vfs.fsTypes[name]; ok {
+ return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+ }
+ vfs.fsTypes[name] = fsType
+ return nil
+}
+
+// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
+// panics on failure. The panic message includes fsType's dynamic type and the
+// underlying error.
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
+ if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+ panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
+ }
+}
+
+// getFilesystemType returns the FilesystemType registered in vfs under name.
+// If there is no such registration, the map lookup yields the zero value,
+// which callers in this package test against nil.
+func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+ vfs.fsTypesMu.RLock()
+ defer vfs.fsTypesMu.RUnlock()
+ return vfs.fsTypes[name]
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
new file mode 100644
index 000000000..11702f720
--- /dev/null
+++ b/pkg/sentry/vfs/mount.go
@@ -0,0 +1,411 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "math"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
+// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
+// (Mount.fs), which applies to path resolution in the context of a particular
+// Mount (Mount.key.parent).
+//
+// Mounts are reference-counted. Unless otherwise specified, all Mount methods
+// require that a reference is held.
+//
+// Mount and Filesystem are distinct types because it's possible for a single
+// Filesystem to be mounted at multiple locations and/or in multiple mount
+// namespaces.
+//
+// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
+// between struct mount and struct vfsmount.)
+type Mount struct {
+ // The lower 63 bits of refs are a reference count. The MSB of refs is set
+ // if the Mount has been eagerly unmounted, as by umount(2) without the
+ // MNT_DETACH flag. refs is accessed using atomic memory operations.
+ refs int64
+
+ // The lower 63 bits of writers is the number of calls to
+ // Mount.CheckBeginWrite() that have not yet been paired with a call to
+ // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+ // writers is accessed using atomic memory operations.
+ writers int64
+
+ // key is protected by VirtualFilesystem.mountMu and
+ // VirtualFilesystem.mounts.seq, and may be nil. References are held on
+ // key.parent and key.point if they are not nil.
+ //
+ // Invariant: key.parent != nil iff key.point != nil. key.point belongs to
+ // key.parent.fs.
+ key mountKey
+
+ // fs, root, and ns are immutable. References are held on fs and root (but
+ // not ns).
+ //
+ // Invariant: root belongs to fs.
+ fs *Filesystem
+ root *Dentry
+ ns *MountNamespace
+}
+
+// A MountNamespace is a collection of Mounts.
+//
+// MountNamespaces are reference-counted. Unless otherwise specified, all
+// MountNamespace methods require that a reference is held.
+//
+// MountNamespace is analogous to Linux's struct mnt_namespace.
+type MountNamespace struct {
+ refs int64 // accessed using atomic memory operations
+
+ // root is the MountNamespace's root mount. root is immutable.
+ root *Mount
+
+ // mountpoints contains all Dentries which are mount points in this
+ // namespace. mountpoints is protected by VirtualFilesystem.mountMu.
+ //
+ // mountpoints is used to determine if a Dentry can be moved or removed
+ // (which requires that the Dentry is not a mount point in the calling
+ // namespace).
+ //
+ // mountpoints is maintained even if there are no references held on the
+ // MountNamespace; this is required to ensure that
+ // VFS.PrepareDeleteDentry() and VFS.PrepareRenameDentry() operate
+ // correctly on unreferenced MountNamespaces.
+ mountpoints map[*Dentry]struct{}
+}
+
+// NewMountNamespace returns a new mount namespace with a root filesystem
+// configured by the given arguments. A reference is taken on the returned
+// MountNamespace.
+//
+// It returns ENODEV if no FilesystemType is registered under fsTypeName, and
+// otherwise propagates any error from FilesystemType.NewFilesystem. opts must
+// not be nil: it is dereferenced unconditionally.
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
+ fsType := vfs.getFilesystemType(fsTypeName)
+ if fsType == nil {
+ return nil, syserror.ENODEV
+ }
+ fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ if err != nil {
+ return nil, err
+ }
+ mntns := &MountNamespace{
+ refs: 1,
+ mountpoints: make(map[*Dentry]struct{}),
+ }
+ // The root Mount has a zero key (no parent and no mount point) and takes
+ // over the references on fs and root returned by NewFilesystem.
+ mntns.root = &Mount{
+ fs: fs,
+ root: root,
+ ns: mntns,
+ refs: 1,
+ }
+ return mntns, nil
+}
+
+// NewMount creates a new Filesystem of the type registered as fsTypeName and
+// mounts it at target, on top of any Mounts already stacked there.
+//
+// It returns ENODEV if no FilesystemType is registered under fsTypeName,
+// ENOENT if the resolved mount point has been disowned, and otherwise
+// propagates errors from FilesystemType.NewFilesystem and vfs.GetDentryAt.
+// opts must not be nil: it is dereferenced unconditionally.
+func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
+ fsType := vfs.getFilesystemType(fsTypeName)
+ if fsType == nil {
+ return syserror.ENODEV
+ }
+ fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+ if err != nil {
+ return err
+ }
+ // We can't hold vfs.mountMu while calling FilesystemImpl methods due to
+ // lock ordering.
+ vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
+ if err != nil {
+ // Drop the references returned by NewFilesystem.
+ root.decRef(fs)
+ fs.decRef()
+ return err
+ }
+ vfs.mountMu.Lock()
+ for {
+ if vd.dentry.IsDisowned() {
+ vfs.mountMu.Unlock()
+ vd.DecRef()
+ root.decRef(fs)
+ fs.decRef()
+ return syserror.ENOENT
+ }
+ // vd might have been mounted over between vfs.GetDentryAt() and
+ // vfs.mountMu.Lock().
+ if !vd.dentry.isMounted() {
+ break
+ }
+ nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
+ if nextmnt == nil {
+ break
+ }
+ // Move vd to the root of the Mount stacked on top of it, taking
+ // references on the new location before dropping the old ones.
+ nextmnt.incRef()
+ nextmnt.root.incRef(nextmnt.fs)
+ vd.DecRef()
+ vd = VirtualDentry{
+ mount: nextmnt,
+ dentry: nextmnt.root,
+ }
+ }
+ // TODO: Linux requires that either both the mount point and the mount root
+ // are directories, or neither are, and returns ENOTDIR if this is not the
+ // case.
+ mntns := vd.mount.ns
+ mnt := &Mount{
+ fs: fs,
+ root: root,
+ ns: mntns,
+ refs: 1,
+ }
+ // NOTE(review): the references held on vd are not dropped on this path;
+ // presumably they become the references that mnt.key holds on its parent
+ // and mount point - confirm against storeKey.
+ mnt.storeKey(vd.mount, vd.dentry)
+ atomic.AddUint32(&vd.dentry.mounts, 1)
+ mntns.mountpoints[vd.dentry] = struct{}{}
+ vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
+ if !ok {
+ vfsmpmounts = make(map[*Mount]struct{})
+ vfs.mountpoints[vd.dentry] = vfsmpmounts
+ }
+ vfsmpmounts[mnt] = struct{}{}
+ vfs.mounts.Insert(mnt)
+ vfs.mountMu.Unlock()
+ return nil
+}
+
+// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
+// a reference on the returned Mount. If (mnt, d) is not a mount point,
+// getMountAt returns nil.
+//
+// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
+//
+// Preconditions: References are held on mnt and d.
+func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount {
+ // The first mount is special-cased:
+ //
+ // - The caller is assumed to have checked d.isMounted() already. (This
+ // isn't a precondition because it doesn't matter for correctness.)
+ //
+ // - We return nil, instead of mnt, if there is no mount at (mnt, d).
+ //
+ // - We don't drop the caller's references on mnt and d.
+retryFirst:
+ next := vfs.mounts.Lookup(mnt, d)
+ if next == nil {
+ return nil
+ }
+ if !next.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryFirst
+ }
+ mnt = next
+ d = next.root
+ // We don't need to take Dentry refs anywhere in this function because
+ // Mounts hold references on Mount.root, which is immutable.
+ for d.isMounted() {
+ next := vfs.mounts.Lookup(mnt, d)
+ if next == nil {
+ break
+ }
+ if !next.tryIncMountedRef() {
+ // Raced with umount.
+ continue
+ }
+ // Drop the reference this function took on the Mount one level down
+ // the stack now that a reference is held on next.
+ mnt.decRef()
+ mnt = next
+ d = next.root
+ }
+ return mnt
+}
+
+// getMountpointAt returns the mount point for the stack of Mounts including
+// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
+// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
+//
+// Preconditions: References are held on mnt and vfsroot. vfsroot is not (mnt,
+// mnt.root).
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
+ // The first mount is special-cased:
+ //
+ // - The caller must have already checked mnt against vfsroot.
+ //
+ // - We return nil, instead of mnt, if there is no mount point for mnt.
+ //
+ // - We don't drop the caller's reference on mnt.
+retryFirst:
+ epoch := vfs.mounts.seq.BeginRead()
+ parent, point := mnt.loadKey()
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ goto retryFirst
+ }
+ if parent == nil {
+ return nil, nil
+ }
+ if !parent.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryFirst
+ }
+ if !point.tryIncRef(parent.fs) {
+ // Since Mount holds a reference on Mount.key.point, this can only
+ // happen due to a racing change to Mount.key.
+ parent.decRef()
+ goto retryFirst
+ }
+ // NOTE(review): unlike the loop below, this path does not re-check
+ // vfs.mounts.seq after taking the references - confirm this is intended.
+ mnt = parent
+ d := point
+ for {
+ // Stop at the VFS root, or once d is no longer the root of mnt (i.e.
+ // (mnt, d) is not itself the top of another mount).
+ if mnt == vfsroot.mount && d == vfsroot.dentry {
+ break
+ }
+ if d != mnt.root {
+ break
+ }
+ retryNotFirst:
+ epoch := vfs.mounts.seq.BeginRead()
+ parent, point := mnt.loadKey()
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ goto retryNotFirst
+ }
+ if parent == nil {
+ break
+ }
+ if !parent.tryIncMountedRef() {
+ // Raced with umount.
+ goto retryNotFirst
+ }
+ if !point.tryIncRef(parent.fs) {
+ // Since Mount holds a reference on Mount.key.point, this can
+ // only happen due to a racing change to Mount.key.
+ parent.decRef()
+ goto retryNotFirst
+ }
+ if !vfs.mounts.seq.ReadOk(epoch) {
+ point.decRef(parent.fs)
+ parent.decRef()
+ goto retryNotFirst
+ }
+ // Drop the references this function took on the previous (mnt, d)
+ // before moving up to (parent, point).
+ d.decRef(mnt.fs)
+ mnt.decRef()
+ mnt = parent
+ d = point
+ }
+ return mnt, d
+}
+
+// tryIncMountedRef attempts to take a reference on mnt. It succeeds, returning
+// true, only if mnt's reference count is currently positive; if the count is
+// zero, or negative because mnt has been eagerly unmounted, it leaves the
+// count untouched and returns false.
+//
+// The caller does not need to hold a reference on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+	// The eager-unmount bit is the MSB of refs, so an eagerly unmounted Mount
+	// always observes a negative count here.
+	for cur := atomic.LoadInt64(&mnt.refs); cur > 0; cur = atomic.LoadInt64(&mnt.refs) {
+		if atomic.CompareAndSwapInt64(&mnt.refs, cur, cur+1) {
+			return true
+		}
+		// Lost a race with another update of refs; reload and try again.
+	}
+	return false
+}
+
+// incRef unconditionally increments mnt's reference count. Unlike
+// tryIncMountedRef, it does not inspect the current count or the eager-unmount
+// bit before doing so.
+func (mnt *Mount) incRef() {
+ // In general, negative values for mnt.refs are valid because the MSB is
+ // the eager-unmount bit.
+ atomic.AddInt64(&mnt.refs, 1)
+}
+
+// decRef drops a reference on mnt. When the count, ignoring the eager-unmount
+// bit, reaches zero, it also drops the references on mnt's mount point and
+// parent (if mnt has a mount point), on mnt.root, and on mnt.fs.
+func (mnt *Mount) decRef() {
+	// The MSB of refs is a flag rather than part of the count, so mask it out
+	// before comparing against zero.
+	if remaining := atomic.AddInt64(&mnt.refs, -1); remaining&^math.MinInt64 != 0 {
+		return
+	}
+	if parent, point := mnt.loadKey(); point != nil {
+		point.decRef(parent.fs)
+		parent.decRef()
+	}
+	mnt.root.decRef(mnt.fs)
+	mnt.fs.decRef()
+}
+
+// CheckBeginWrite registers the start of a write operation on mnt by
+// incrementing mnt's count of in-progress writers. If mnt is mounted
+// MS_RDONLY, the count is left as it was and EROFS is returned.
+//
+// Every successful call to CheckBeginWrite must be paired with a call to
+// EndWrite once the write operation has finished.
+func (mnt *Mount) CheckBeginWrite() error {
+	if writers := atomic.AddInt64(&mnt.writers, 1); writers >= 0 {
+		return nil
+	}
+	// A negative count means that the read-only bit (the MSB) is set; undo
+	// the temporary increment before failing.
+	atomic.AddInt64(&mnt.writers, -1)
+	return syserror.EROFS
+}
+
+// EndWrite indicates that a write operation signaled by a previous successful
+// call to CheckBeginWrite has finished.
+func (mnt *Mount) EndWrite() {
+ // Undo the increment of mnt.writers made by the matching CheckBeginWrite.
+ atomic.AddInt64(&mnt.writers, -1)
+}
+
+// setReadOnlyLocked makes mnt read-only (ro == true) or writable (ro ==
+// false). The read-only state is the MSB of mnt.writers. Making mnt read-only
+// fails with EBUSY unless mnt.writers is exactly zero at that moment.
+//
+// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
+func (mnt *Mount) setReadOnlyLocked(ro bool) error {
+	wasRO := atomic.LoadInt64(&mnt.writers) < 0
+	switch {
+	case wasRO == ro:
+		// Already in the requested state.
+		return nil
+	case ro:
+		if atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
+			return nil
+		}
+		return syserror.EBUSY
+	default:
+		// Unset MSB without dropping any temporary increments from failed
+		// calls to mnt.CheckBeginWrite().
+		atomic.AddInt64(&mnt.writers, math.MinInt64)
+		return nil
+	}
+}
+
+// Filesystem returns the Filesystem mounted by mnt (mnt.fs). It does not take
+// a reference on the returned Filesystem.
+func (mnt *Mount) Filesystem() *Filesystem {
+ return mnt.fs
+}
+
+// IncRef takes an additional reference on mntns. It panics if the reference
+// count was not positive beforehand, since that means the caller did not hold
+// a reference of its own.
+func (mntns *MountNamespace) IncRef() {
+	refs := atomic.AddInt64(&mntns.refs, 1)
+	if refs > 1 {
+		return
+	}
+	panic("MountNamespace.IncRef() called without holding a reference")
+}
+
+// DecRef decrements mntns' reference count. It panics if the count becomes
+// negative, which means that DecRef was called without a reference being
+// held.
+func (mntns *MountNamespace) DecRef() {
+	// The delta must be -1: adding 0 would only read the counter, so no
+	// reference would ever be released.
+	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+		// TODO: unmount mntns.root
+	} else if refs < 0 {
+		panic("MountNamespace.DecRef() called without holding a reference")
+	}
+}
+
+// Root returns the VirtualDentry for the root of mntns, i.e. the root Dentry
+// of mntns' root Mount. The returned VirtualDentry has had a reference taken
+// on it via VirtualDentry.IncRef.
+func (mntns *MountNamespace) Root() VirtualDentry {
+	rootMount := mntns.root
+	vd := VirtualDentry{mount: rootMount, dentry: rootMount.root}
+	vd.IncRef()
+	return vd
+}
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
new file mode 100644
index 000000000..f394d7483
--- /dev/null
+++ b/pkg/sentry/vfs/mount_test.go
@@ -0,0 +1,465 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+ "runtime"
+ "sync"
+ "testing"
+)
+
+// TestMountTableLookupEmpty checks that looking up an arbitrary key in a
+// freshly initialized mountTable finds nothing.
+func TestMountTableLookupEmpty(t *testing.T) {
+	var mt mountTable
+	mt.Init()
+
+	got := mt.Lookup(&Mount{}, &Dentry{})
+	if got != nil {
+		t.Errorf("empty mountTable lookup: got %p, wanted nil", got)
+	}
+}
+
+// TestMountTableInsertLookup checks that an inserted Mount is found under its
+// own (parent, point) key, and that keys differing in either component are not
+// found.
+func TestMountTableInsertLookup(t *testing.T) {
+	var mt mountTable
+	mt.Init()
+
+	parent, point := &Mount{}, &Dentry{}
+	mount := &Mount{}
+	mount.storeKey(parent, point)
+	mt.Insert(mount)
+
+	if m := mt.Lookup(parent, point); m != mount {
+		t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount)
+	}
+
+	// Right mount point, wrong parent.
+	if m := mt.Lookup(&Mount{}, point); m != nil {
+		t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m)
+	}
+	// Right parent, wrong mount point.
+	if m := mt.Lookup(parent, &Dentry{}); m != nil {
+		t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m)
+	}
+}
+
+// TODO: concurrent lookup/insertion/removal
+
+// benchNumMounts lists the table populations used by the benchmarks below.
+// Every entry must be a power of 2, because the benchmarks select a key with
+// i&(numMounts-1).
+var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8}
+
+// For all of the following:
+//
+// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable.
+//
+// - BenchmarkMountMapFoo tests usage pattern "Foo" for a
+// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since
+// mountTable also requires external synchronization between mutators.)
+//
+// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map.
+//
+// ParallelLookup is by far the most common and performance-sensitive operation
+// for this application. NegativeLookup is also important, but less so (only
+// relevant with multiple mount namespaces and significant differences in
+// mounts between them). Insertion and removal are benchmarked for
+// completeness.
+//
+// enableComparativeBenchmarks gates the BenchmarkMountMap* and
+// BenchmarkMountSyncMap* benchmarks: while it is false, they skip themselves.
+const enableComparativeBenchmarks = false
+
+// newBenchMount returns a new Mount whose key is a freshly allocated, and
+// therefore unique, (parent, point) pair.
+func newBenchMount() *Mount {
+	m := new(Mount)
+	m.storeKey(new(Mount), new(Dentry))
+	return m
+}
+
+func vdkey(mnt *Mount) VirtualDentry {
+ parent, point := mnt.loadKey()
+ return VirtualDentry{
+ mount: parent,
+ dentry: point,
+ }
+}
+
+// BenchmarkMountTableParallelLookup measures successful mountTable lookups
+// performed concurrently by numG goroutines, for numG = 1, 2, 4, ... up to
+// GOMAXPROCS and for each table population in benchNumMounts. Each goroutine
+// performs b.N lookups.
+func BenchmarkMountTableParallelLookup(b *testing.B) {
+	// numG >= 0 guards against numG wrapping around when doubled.
+	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
+		for _, numMounts := range benchNumMounts {
+			desc := fmt.Sprintf("%dx%d", numG, numMounts)
+			b.Run(desc, func(b *testing.B) {
+				var mt mountTable
+				mt.Init()
+				keys := make([]VirtualDentry, 0, numMounts)
+				for i := 0; i < numMounts; i++ {
+					mount := newBenchMount()
+					mt.Insert(mount)
+					keys = append(keys, vdkey(mount))
+				}
+
+				var ready sync.WaitGroup
+				begin := make(chan struct{})
+				var end sync.WaitGroup
+				for g := 0; g < numG; g++ {
+					ready.Add(1)
+					end.Add(1)
+					go func() {
+						defer end.Done()
+						ready.Done()
+						<-begin
+						// Failures are reported with Errorf followed by
+						// return: FailNow (and hence Fatalf) may only be
+						// called from the goroutine running the benchmark
+						// function.
+						for i := 0; i < b.N; i++ {
+							k := keys[i&(numMounts-1)]
+							m := mt.Lookup(k.mount, k.dentry)
+							if m == nil {
+								b.Errorf("lookup failed")
+								return
+							}
+							if parent := m.parent(); parent != k.mount {
+								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+								return
+							}
+							if point := m.point(); point != k.dentry {
+								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+								return
+							}
+						}
+					}()
+				}
+
+				// Start timing only once every goroutine is ready to run.
+				ready.Wait()
+				b.ResetTimer()
+				close(begin)
+				end.Wait()
+			})
+		}
+	}
+}
+
+// BenchmarkMountMapParallelLookup is the counterpart of
+// BenchmarkMountTableParallelLookup for a sync.RWMutex-protected map. It is
+// skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountMapParallelLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// numG >= 0 guards against numG wrapping around when doubled.
+	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
+		for _, numMounts := range benchNumMounts {
+			desc := fmt.Sprintf("%dx%d", numG, numMounts)
+			b.Run(desc, func(b *testing.B) {
+				var mu sync.RWMutex
+				ms := make(map[VirtualDentry]*Mount)
+				keys := make([]VirtualDentry, 0, numMounts)
+				for i := 0; i < numMounts; i++ {
+					mount := newBenchMount()
+					key := vdkey(mount)
+					ms[key] = mount
+					keys = append(keys, key)
+				}
+
+				var ready sync.WaitGroup
+				begin := make(chan struct{})
+				var end sync.WaitGroup
+				for g := 0; g < numG; g++ {
+					ready.Add(1)
+					end.Add(1)
+					go func() {
+						defer end.Done()
+						ready.Done()
+						<-begin
+						// Failures are reported with Errorf followed by
+						// return: FailNow (and hence Fatalf) may only be
+						// called from the goroutine running the benchmark
+						// function.
+						for i := 0; i < b.N; i++ {
+							k := keys[i&(numMounts-1)]
+							mu.RLock()
+							m := ms[k]
+							mu.RUnlock()
+							if m == nil {
+								b.Errorf("lookup failed")
+								return
+							}
+							if parent := m.parent(); parent != k.mount {
+								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+								return
+							}
+							if point := m.point(); point != k.dentry {
+								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+								return
+							}
+						}
+					}()
+				}
+
+				// Start timing only once every goroutine is ready to run.
+				ready.Wait()
+				b.ResetTimer()
+				close(begin)
+				end.Wait()
+			})
+		}
+	}
+}
+
+// BenchmarkMountSyncMapParallelLookup is the counterpart of
+// BenchmarkMountTableParallelLookup for a sync.Map. It is skipped unless
+// enableComparativeBenchmarks is true.
+func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// numG >= 0 guards against numG wrapping around when doubled.
+	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
+		for _, numMounts := range benchNumMounts {
+			desc := fmt.Sprintf("%dx%d", numG, numMounts)
+			b.Run(desc, func(b *testing.B) {
+				var ms sync.Map
+				keys := make([]VirtualDentry, 0, numMounts)
+				for i := 0; i < numMounts; i++ {
+					mount := newBenchMount()
+					key := vdkey(mount)
+					ms.Store(key, mount)
+					keys = append(keys, key)
+				}
+
+				var ready sync.WaitGroup
+				begin := make(chan struct{})
+				var end sync.WaitGroup
+				for g := 0; g < numG; g++ {
+					ready.Add(1)
+					end.Add(1)
+					go func() {
+						defer end.Done()
+						ready.Done()
+						<-begin
+						// Failures are reported with Errorf followed by
+						// return: FailNow (and hence Fatalf) may only be
+						// called from the goroutine running the benchmark
+						// function.
+						for i := 0; i < b.N; i++ {
+							k := keys[i&(numMounts-1)]
+							mi, ok := ms.Load(k)
+							if !ok {
+								b.Errorf("lookup failed")
+								return
+							}
+							m := mi.(*Mount)
+							if parent := m.parent(); parent != k.mount {
+								b.Errorf("lookup returned mount with parent %p, wanted %p", parent, k.mount)
+								return
+							}
+							if point := m.point(); point != k.dentry {
+								b.Errorf("lookup returned mount with point %p, wanted %p", point, k.dentry)
+								return
+							}
+						}
+					}()
+				}
+
+				// Start timing only once every goroutine is ready to run.
+				ready.Wait()
+				b.ResetTimer()
+				close(begin)
+				end.Wait()
+			})
+		}
+	}
+}
+
+// BenchmarkMountTableNegativeLookup measures mountTable lookups of keys that
+// are not in the table, for each table population in benchNumMounts.
+func BenchmarkMountTableNegativeLookup(b *testing.B) {
+	for _, numMounts := range benchNumMounts {
+		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
+			var mt mountTable
+			mt.Init()
+			for n := numMounts; n > 0; n-- {
+				mt.Insert(newBenchMount())
+			}
+			// Freshly allocated (mount, dentry) pairs cannot match any key
+			// stored in the table.
+			negkeys := make([]VirtualDentry, numMounts)
+			for i := range negkeys {
+				negkeys[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				k := negkeys[i&(numMounts-1)]
+				if m := mt.Lookup(k.mount, k.dentry); m != nil {
+					b.Fatalf("lookup got %p, wanted nil", m)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkMountMapNegativeLookup is the counterpart of
+// BenchmarkMountTableNegativeLookup for a sync.RWMutex-protected map. It is
+// skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountMapNegativeLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for _, numMounts := range benchNumMounts {
+		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
+			var mu sync.RWMutex
+			ms := make(map[VirtualDentry]*Mount)
+			for n := numMounts; n > 0; n-- {
+				mount := newBenchMount()
+				ms[vdkey(mount)] = mount
+			}
+			// Freshly allocated (mount, dentry) pairs cannot match any key
+			// stored in the map.
+			negkeys := make([]VirtualDentry, numMounts)
+			for i := range negkeys {
+				negkeys[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				k := negkeys[i&(numMounts-1)]
+				mu.RLock()
+				m := ms[k]
+				mu.RUnlock()
+				if m != nil {
+					b.Fatalf("lookup got %p, wanted nil", m)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkMountSyncMapNegativeLookup is the counterpart of
+// BenchmarkMountTableNegativeLookup for a sync.Map. It is skipped unless
+// enableComparativeBenchmarks is true.
+func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	for _, numMounts := range benchNumMounts {
+		b.Run(fmt.Sprintf("%d", numMounts), func(b *testing.B) {
+			var ms sync.Map
+			for n := numMounts; n > 0; n-- {
+				mount := newBenchMount()
+				ms.Store(vdkey(mount), mount)
+			}
+			// Freshly allocated (mount, dentry) pairs cannot match any key
+			// stored in the map.
+			negkeys := make([]VirtualDentry, numMounts)
+			for i := range negkeys {
+				negkeys[i] = VirtualDentry{mount: &Mount{}, dentry: &Dentry{}}
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				k := negkeys[i&(numMounts-1)]
+				if m, _ := ms.Load(k); m != nil {
+					b.Fatalf("lookup got %p, wanted nil", m)
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkMountTableInsert measures insertion of b.N Mounts into an
+// initially empty mountTable.
+func BenchmarkMountTableInsert(b *testing.B) {
+	// Allocate every Mount up front so that allocation cost stays out of the
+	// timed region.
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+
+	var mt mountTable
+	mt.Init()
+	b.ResetTimer()
+	for _, mount := range mounts {
+		mt.Insert(mount)
+	}
+}
+
+// BenchmarkMountMapInsert is the counterpart of BenchmarkMountTableInsert for
+// a plain map. It is skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountMapInsert(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// Allocate every Mount up front so that allocation cost stays out of the
+	// timed region.
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+
+	ms := make(map[VirtualDentry]*Mount)
+	b.ResetTimer()
+	for _, mount := range mounts {
+		ms[vdkey(mount)] = mount
+	}
+}
+
+// BenchmarkMountSyncMapInsert is the counterpart of BenchmarkMountTableInsert
+// for a sync.Map. It is skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountSyncMapInsert(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	// Allocate every Mount up front so that allocation cost stays out of the
+	// timed region.
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+
+	var ms sync.Map
+	b.ResetTimer()
+	for _, mount := range mounts {
+		ms.Store(vdkey(mount), mount)
+	}
+}
+
+// BenchmarkMountTableRemove measures removal of b.N Mounts from a mountTable,
+// in the order in which they were inserted. Only the removals are timed.
+func BenchmarkMountTableRemove(b *testing.B) {
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+	var mt mountTable
+	mt.Init()
+	for _, mount := range mounts {
+		mt.Insert(mount)
+	}
+
+	b.ResetTimer()
+	for _, mount := range mounts {
+		mt.Remove(mount)
+	}
+}
+
+// BenchmarkMountMapRemove is the counterpart of BenchmarkMountTableRemove for
+// a plain map. It is skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountMapRemove(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+	ms := make(map[VirtualDentry]*Mount)
+	for _, mount := range mounts {
+		ms[vdkey(mount)] = mount
+	}
+
+	b.ResetTimer()
+	for _, mount := range mounts {
+		delete(ms, vdkey(mount))
+	}
+}
+
+// BenchmarkMountSyncMapRemove is the counterpart of BenchmarkMountTableRemove
+// for a sync.Map. It is skipped unless enableComparativeBenchmarks is true.
+func BenchmarkMountSyncMapRemove(b *testing.B) {
+	if !enableComparativeBenchmarks {
+		b.Skipf("comparative benchmarks are disabled")
+	}
+
+	mounts := make([]*Mount, b.N)
+	for i := range mounts {
+		mounts[i] = newBenchMount()
+	}
+	var ms sync.Map
+	for _, mount := range mounts {
+		ms.Store(vdkey(mount), mount)
+	}
+
+	b.ResetTimer()
+	for _, mount := range mounts {
+		ms.Delete(vdkey(mount))
+	}
+}
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
new file mode 100644
index 000000000..b0511aa40
--- /dev/null
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -0,0 +1,356 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.12
+// +build !go1.14
+
+// Check go:linkname function signatures when updating Go version.
+
+package vfs
+
+import (
+ "fmt"
+ "math/bits"
+ "reflect"
+ "sync/atomic"
+ "unsafe"
+
+ "gvisor.dev/gvisor/third_party/gvsync"
+)
+
+// mountKey represents the location at which a Mount is mounted. It is
+// structurally identical to VirtualDentry, but stores its fields as
+// unsafe.Pointer since mutators synchronize with VFS path traversal using
+// seqcounts.
+//
+// The fields are read and written with sync/atomic pointer operations; see
+// Mount.storeKey and Mount.loadKey.
+type mountKey struct {
+ parent unsafe.Pointer // *Mount
+ point unsafe.Pointer // *Dentry
+}
+
+// storeKey sets mnt's key, i.e. the (parent, point) location at which mnt is
+// mounted. Each field is stored atomically, but the two stores together do
+// not form a single atomic operation.
+//
+// Invariant: mnt.key's fields are nil. parent and point are non-nil.
+func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
+ atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
+ atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
+}
+
+// loadKey returns mnt's key as (parent, point). Each field is loaded
+// atomically, parent first; the pair as a whole is not loaded atomically.
+func (mnt *Mount) loadKey() (*Mount, *Dentry) {
+	parent := (*Mount)(atomic.LoadPointer(&mnt.key.parent))
+	point := (*Dentry)(atomic.LoadPointer(&mnt.key.point))
+	return parent, point
+}
+
+// parent atomically loads and returns the parent component of mnt's key.
+func (mnt *Mount) parent() *Mount {
+	p := atomic.LoadPointer(&mnt.key.parent)
+	return (*Mount)(p)
+}
+
+// point atomically loads and returns the mount point component of mnt's key.
+func (mnt *Mount) point() *Dentry {
+	p := atomic.LoadPointer(&mnt.key.point)
+	return (*Dentry)(p)
+}
+
+// mountTable maps (mount parent, mount point) pairs to mounts. It supports
+// efficient concurrent lookup, even in the presence of concurrent mutators
+// (provided mutation is sufficiently uncommon).
+//
+// mountTable.Init() must be called on new mountTables before use.
+type mountTable struct {
+ // mountTable is implemented as a seqcount-protected hash table that
+ // resolves collisions with linear probing, featuring Robin Hood insertion
+ // and backward shift deletion. These minimize probe length variance,
+ // significantly improving the performance of linear probing at high load
+ // factors. (mountTable doesn't use bucketing, which is the other major
+ // technique commonly used in high-performance hash tables; the efficiency
+ // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD
+ // intrinsics and inline assembly, limiting the performance of this
+ // approach.)
+
+ // seq is the seqcount that readers use to detect concurrent mutation.
+ // Mutators bracket changes visible to readers with seq.BeginWrite() and
+ // seq.EndWrite().
+ seq gvsync.SeqCount
+ seed uint32 // for hashing keys
+
+ // size holds both length (number of elements) and capacity (number of
+ // slots): capacity is stored as its base-2 log (referred to as order) in
+ // the least significant bits of size, and length is stored in the
+ // remaining bits. Go defines bit shifts >= width of shifted unsigned
+ // operand as shifting to 0, which differs from x86's SHL, so the Go
+ // compiler inserts a bounds check for each bit shift unless we mask order
+ // anyway (cf. runtime.bucketShift()), and length isn't used by lookup;
+ // thus this bit packing gets us more bits for the length (vs. storing
+ // length and cap in separate uint32s) for ~free.
+ size uint64
+
+ slots unsafe.Pointer // []mountSlot; never nil after Init
+}
+
+// mountSlot is a single slot of a mountTable. A slot whose value is nil is
+// empty.
+type mountSlot struct {
+ // We don't store keys in slots; instead, we just check Mount.parent and
+ // Mount.point directly. Any practical use of lookup will need to touch
+ // Mounts anyway, and comparing hashes means that false positives are
+ // extremely rare, so this isn't an extra cache line touch overall.
+ value unsafe.Pointer // *Mount
+ hash uintptr // hash of value's key, as computed by the table's mutators
+}
+
+// Constants describing the bit layout of mountTable.size, the sizes of the
+// table's element types, and the table's tuning parameters.
+const (
+ mtSizeOrderBits = 6 // log2 of pointer size in bits
+ mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 // selects the order bits of size
+ mtSizeOrderOne = 1 // added to size to increment order by one
+ mtSizeLenLSB = mtSizeOrderBits // position of the length's lowest bit in size
+ mtSizeLenOne = 1 << mtSizeLenLSB // added to size to increment length by one
+ mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB
+
+ mountSlotBytes = unsafe.Sizeof(mountSlot{})
+ mountKeyBytes = unsafe.Sizeof(mountKey{})
+
+ // Tuning parameters.
+ //
+ // Essentially every mountTable will contain at least /proc, /sys, and
+ // /dev/shm, so there is ~no reason for mtInitCap to be < 4.
+ mtInitOrder = 2
+ mtInitCap = 1 << mtInitOrder
+ // The maximum load factor is mtMaxLoadNum/mtMaxLoadDen; an insertion that
+ // would exceed it grows the table instead.
+ mtMaxLoadNum = 13
+ mtMaxLoadDen = 16
+)
+
+// init panics if the constants above are inconsistent with each other or with
+// the platform's pointer size.
+func init() {
+	// We can't just define mtSizeOrderBits as follows because Go doesn't have
+	// constexpr.
+	ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8)
+	if wantOrderBits := bits.TrailingZeros(ptrBits); mtSizeOrderBits != wantOrderBits {
+		panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, wantOrderBits, ptrBits))
+	}
+	// Exactly one set bit means that the size is a power of 2.
+	if setBits := bits.OnesCount(uint(mountSlotBytes)); setBits != 1 {
+		panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes))
+	}
+	if mtInitCap <= 1 {
+		panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap))
+	}
+	if mtMaxLoadNum >= mtMaxLoadDen {
+		panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen))
+	}
+}
+
+// Init prepares mt for use: it allocates the initial slot array of mtInitCap
+// slots, records the matching order with a length of zero, and picks a hash
+// seed. It must be called exactly once on each mountTable before use.
+func (mt *mountTable) Init() {
+	// The length bits of size start out as zero, so the initial value of size
+	// is just the initial order.
+	mt.size = mtInitOrder
+	mt.slots = newMountTableSlots(mtInitCap)
+	mt.seed = rand32()
+}
+
+// newMountTableSlots allocates a zeroed array of cap mountSlots and returns a
+// pointer to its first element.
+func newMountTableSlots(cap uintptr) unsafe.Pointer {
+	slots := make([]mountSlot, cap)
+	// Extract the data pointer from the slice header.
+	header := (*reflect.SliceHeader)(unsafe.Pointer(&slots))
+	return unsafe.Pointer(header.Data)
+}
+
+// Lookup returns the Mount with the given parent, mounted at the given point.
+// If no such Mount exists, Lookup returns nil.
+//
+// Lookup may be called even if there are concurrent mutators of mt.
+//
+// Lookup does not take a reference on the returned Mount.
+func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
+ key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
+ hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
+
+loop:
+ for {
+ // Read the table's size and slot array, retrying if seq reports that
+ // the read was not OK.
+ epoch := mt.seq.BeginRead()
+ size := atomic.LoadUint64(&mt.size)
+ slots := atomic.LoadPointer(&mt.slots)
+ if !mt.seq.ReadOk(epoch) {
+ continue
+ }
+ tcap := uintptr(1) << (size & mtSizeOrderMask)
+ mask := tcap - 1
+ // off is the byte offset of the current slot; offmask wraps it around
+ // at the end of the slot array.
+ off := (hash & mask) * mountSlotBytes
+ offmask := mask * mountSlotBytes
+ for {
+ // This avoids bounds checking.
+ slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+ slotValue := atomic.LoadPointer(&slot.value)
+ slotHash := atomic.LoadUintptr(&slot.hash)
+ if !mt.seq.ReadOk(epoch) {
+ // The element we're looking for might have been moved into a
+ // slot we've previously checked, so restart entirely.
+ continue loop
+ }
+ if slotValue == nil {
+ // An empty slot ends the probe sequence: the key is absent.
+ return nil
+ }
+ if slotHash == hash {
+ // Equal hashes don't prove equal keys, so compare against the
+ // key stored in the Mount itself.
+ mount := (*Mount)(slotValue)
+ var mountKey mountKey
+ mountKey.parent = atomic.LoadPointer(&mount.key.parent)
+ mountKey.point = atomic.LoadPointer(&mount.key.point)
+ if !mt.seq.ReadOk(epoch) {
+ continue loop
+ }
+ if key == mountKey {
+ return mount
+ }
+ }
+ off = (off + mountSlotBytes) & offmask
+ }
+ }
+}
+
+// Insert inserts the given mount into mt, doubling the number of slots first
+// if the insertion would exceed the maximum load factor.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must not already
+// contain a Mount with the same mount point and parent.
+func (mt *mountTable) Insert(mount *Mount) {
+ // Hash the key stored in mount with the same function, seed and length
+ // that Lookup uses for its arguments.
+ hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+
+ // We're under the maximum load factor if:
+ //
+ // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen
+ // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap
+ tlen := mt.size >> mtSizeLenLSB
+ order := mt.size & mtSizeOrderMask
+ tcap := uintptr(1) << order
+ if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
+ // Atomically insert the new element into the table.
+ mt.seq.BeginWrite()
+ atomic.AddUint64(&mt.size, mtSizeLenOne)
+ mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
+ mt.seq.EndWrite()
+ return
+ }
+
+ // Otherwise, we have to expand. Double the number of slots in the new
+ // table.
+ newOrder := order + 1
+ if newOrder > mtSizeOrderMask {
+ panic("mount table size overflow")
+ }
+ newCap := uintptr(1) << newOrder
+ newSlots := newMountTableSlots(newCap)
+ // Copy existing elements to the new table.
+ oldCur := mt.slots
+ // Go does not permit pointers to the end of allocated objects, so we
+ // must use a pointer to the last element of the old table. The
+ // following expression is equivalent to
+ // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2
+ // arithmetic instructions instead of 3.
+ oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes))
+ for {
+ oldSlot := (*mountSlot)(oldCur)
+ if oldSlot.value != nil {
+ // Don't need to lock mt.seq yet since newSlots isn't visible
+ // to readers.
+ mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
+ }
+ if oldCur == oldLast {
+ break
+ }
+ oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes)
+ }
+ // Insert the new element into the new table.
+ mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
+ // Atomically switch to the new table. The single add below increments
+ // both the length and the order stored in size by one.
+ mt.seq.BeginWrite()
+ atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
+ atomic.StorePointer(&mt.slots, newSlots)
+ mt.seq.EndWrite()
+}
+
+// mtInsertLocked stores (value, hash) into the table (slots, cap) using linear
+// probing with Robin Hood displacement. It loops until it finds an empty
+// slot, so the table must contain at least one.
+//
+// Preconditions: There are no concurrent mutators of the table (slots, cap).
+// If the table is visible to readers, then mt.seq must be in a writer critical
+// section. cap must be a power of 2.
+func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) {
+ mask := cap - 1
+ off := (hash & mask) * mountSlotBytes
+ offmask := mask * mountSlotBytes
+ // disp is the number of slots between the first-probed slot of the
+ // element being inserted and the slot currently being examined.
+ disp := uintptr(0)
+ for {
+ slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+ slotValue := slot.value
+ if slotValue == nil {
+ atomic.StorePointer(&slot.value, value)
+ atomic.StoreUintptr(&slot.hash, hash)
+ return
+ }
+ // If we've been displaced farther from our first-probed slot than the
+ // element stored in this one, swap elements and switch to inserting
+ // the replaced one. (This is Robin Hood insertion.)
+ slotHash := slot.hash
+ // slotDisp is this slot's index minus the stored element's hash, modulo
+ // cap.
+ slotDisp := ((off / mountSlotBytes) - slotHash) & mask
+ if disp > slotDisp {
+ atomic.StorePointer(&slot.value, value)
+ atomic.StoreUintptr(&slot.hash, hash)
+ value = slotValue
+ hash = slotHash
+ disp = slotDisp
+ }
+ off = (off + mountSlotBytes) & offmask
+ disp++
+ }
+}
+
+// Remove removes the given mount from mt.
+//
+// Preconditions: There are no concurrent mutators of mt. mt must contain
+// mount.
+func (mt *mountTable) Remove(mount *Mount) {
+ hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
+ tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
+ mask := tcap - 1
+ slots := mt.slots
+ off := (hash & mask) * mountSlotBytes
+ offmask := mask * mountSlotBytes
+ // Probe linearly from mount's first-probed slot; no writer critical
+ // section is entered until mount's slot has been found.
+ for {
+ slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off))
+ slotValue := slot.value
+ if slotValue == unsafe.Pointer(mount) {
+ // Found the element to remove. Move all subsequent elements
+ // backward until we either find an empty slot, or an element that
+ // is already in its first-probed slot. (This is backward shift
+ // deletion.)
+ mt.seq.BeginWrite()
+ for {
+ nextOff := (off + mountSlotBytes) & offmask
+ nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
+ nextSlotValue := nextSlot.value
+ if nextSlotValue == nil {
+ break
+ }
+ nextSlotHash := nextSlot.hash
+ if (nextOff / mountSlotBytes) == (nextSlotHash & mask) {
+ break
+ }
+ atomic.StorePointer(&slot.value, nextSlotValue)
+ atomic.StoreUintptr(&slot.hash, nextSlotHash)
+ off = nextOff
+ slot = nextSlot
+ }
+ // slot is now the last slot vacated by the shift (or mount's own slot
+ // if nothing was shifted); mark it empty and decrement the length.
+ atomic.StorePointer(&slot.value, nil)
+ atomic.AddUint64(&mt.size, mtSizeLenNegOne)
+ mt.seq.EndWrite()
+ return
+ }
+ // Reaching an empty slot means that mount is not in the table, which
+ // violates the precondition; this is only detected when checkInvariants
+ // is set.
+ if checkInvariants && slotValue == nil {
+ panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount))
+ }
+ off = (off + mountSlotBytes) & offmask
+ }
+}
+
+// memhash is bound to runtime.memhash via go:linkname. In this file it is
+// called with a pointer to a mountKey, the table's seed, and mountKeyBytes.
+//
+//go:linkname memhash runtime.memhash
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
+
+// rand32 is bound to runtime.fastrand via go:linkname. mountTable.Init uses
+// it to choose the table's hash seed.
+//
+//go:linkname rand32 runtime.fastrand
+func rand32() uint32
+
+// noescape returns p unchanged, after round-tripping it through a uintptr.
+//
+// This is copy/pasted from runtime.noescape(), and is needed because arguments
+// apparently escape from all functions defined by linkname.
+//
+//go:nosplit
+func noescape(p unsafe.Pointer) unsafe.Pointer {
+ x := uintptr(p)
+ return unsafe.Pointer(x ^ 0)
+}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
new file mode 100644
index 000000000..187e5410c
--- /dev/null
+++ b/pkg/sentry/vfs/options.go
@@ -0,0 +1,123 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and
+// FilesystemImpl.GetDentryAt().
+type GetDentryOptions struct {
+ // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that
+ // the returned Dentry is a directory for which creds has search
+ // permission.
+ //
+ // NOTE(review): "creds" presumably refers to the credentials passed to
+ // GetDentryAt(); confirm against its signature.
+ CheckSearchable bool
+}
+
+// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and
+// FilesystemImpl.MkdirAt().
+type MkdirOptions struct {
+ // Mode is the file mode bits for the created directory.
+ //
+ // NOTE(review): presumably only permission bits are meaningful here, since
+ // the file type is implied; confirm against FilesystemImpl.MkdirAt().
+ Mode uint16
+}
+
+// MknodOptions contains options to VirtualFilesystem.MknodAt() and
+// FilesystemImpl.MknodAt().
+type MknodOptions struct {
+ // Mode is the file type and mode bits for the created file.
+ Mode uint16
+
+ // If Mode specifies a character or block device special file, DevMajor and
+ // DevMinor are the major and minor device numbers for the created device.
+ //
+ // NOTE(review): presumably ignored for every other file type; confirm
+ // against FilesystemImpl.MknodAt().
+ DevMajor uint32
+ DevMinor uint32
+}
+
+// OpenOptions contains options to VirtualFilesystem.OpenAt() and
+// FilesystemImpl.OpenAt().
+type OpenOptions struct {
+ // Flags contains access mode and flags as specified for open(2).
+ //
+ // FilesystemImpls are responsible for implementing the following flags:
+ // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC,
+ // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and
+ // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and
+ // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file
+ // descriptors are mostly outside the scope of VFS.
+ Flags uint32
+
+ // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the
+ // created file.
+ Mode uint16
+}
+
+// ReadOptions contains options to FileDescription.PRead(),
+// FileDescriptionImpl.PRead(), FileDescription.Read(), and
+// FileDescriptionImpl.Read().
+type ReadOptions struct {
+ // Flags contains flags as specified for preadv2(2). The zero value
+ // specifies no flags.
+ Flags uint32
+}
+
+// RenameOptions contains options to VirtualFilesystem.RenameAt() and
+// FilesystemImpl.RenameAt().
+type RenameOptions struct {
+ // Flags contains flags as specified for renameat2(2). The zero value
+ // specifies no flags.
+ Flags uint32
+}
+
+// SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
+// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and
+// FileDescriptionImpl.SetStat().
+type SetStatOptions struct {
+ // Stat is the metadata that should be set. Only fields indicated by
+ // Stat.Mask should be set; all other fields of Stat are to be ignored.
+ //
+ // If Stat specifies that a timestamp should be set,
+ // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must
+ // special-case StatxTimestamp.Nsec == UTIME_NOW as described by
+ // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec
+ // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask
+ // instead).
+ Stat linux.Statx
+}
+
+// StatOptions contains options to VirtualFilesystem.StatAt(),
+// FilesystemImpl.StatAt(), FileDescription.Stat(), and
+// FileDescriptionImpl.Stat().
+type StatOptions struct {
+ // Mask is the set of fields in the returned Statx that the FilesystemImpl
+ // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask.
+ //
+ // The FilesystemImpl or FileDescriptionImpl may return fields not
+ // requested in Mask, and may fail to return fields requested in Mask that
+ // are not supported by the underlying filesystem implementation, without
+ // returning an error. Callers therefore cannot assume that the returned
+ // fields are exactly those requested in Mask.
+ Mask uint32
+
+ // Sync specifies the synchronization required, and is one of
+ // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default),
+ // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC.
+ Sync uint32
+}
+
+// WriteOptions contains options to FileDescription.PWrite(),
+// FileDescriptionImpl.PWrite(), FileDescription.Write(), and
+// FileDescriptionImpl.Write().
+type WriteOptions struct {
+ // Flags contains flags as specified for pwritev2(2). The zero value
+ // specifies no flags.
+ Flags uint32
+}
diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go
new file mode 100644
index 000000000..f8e74355c
--- /dev/null
+++ b/pkg/sentry/vfs/permissions.go
@@ -0,0 +1,121 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// AccessTypes is a bitmask of Unix file permissions.
+//
+// The bit values are those of a single rwx permission class in a Unix file
+// mode, so an AccessTypes value can be compared directly against a mode that
+// has been shifted down to the relevant class.
+type AccessTypes uint16
+
+// Bits in AccessTypes.
+//
+// Every constant is explicitly typed. Leaving MayWrite and MayExec untyped
+// would make expressions such as MayWrite|MayExec plain untyped integers
+// rather than AccessTypes, defeating the type checking that the dedicated
+// type is meant to provide.
+const (
+	MayRead  AccessTypes = 4
+	MayWrite AccessTypes = 2
+	MayExec  AccessTypes = 1
+)
+
+// GenericCheckPermissions checks that creds has the given access rights on a
+// file with the given permissions, UID, and GID, subject to the rules of
+// fs/namei.c:generic_permission(). isDir is true if the file is a directory.
+func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error {
+ // Check permission bits.
+ perms := mode
+ if creds.EffectiveKUID == kuid {
+ perms >>= 6
+ } else if creds.InGroup(kgid) {
+ perms >>= 3
+ }
+ if uint16(ats)&perms == uint16(ats) {
+ return nil
+ }
+
+ // Caller capabilities require that the file's KUID and KGID are mapped in
+ // the caller's user namespace; compare
+ // kernel/capability.c:privileged_wrt_inode_uidgid().
+ if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() {
+ return syserror.EACCES
+ }
+ // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary
+ // directories, and read arbitrary non-directory files.
+ if (isDir && (ats&MayWrite == 0)) || ats == MayRead {
+ if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) {
+ return nil
+ }
+ }
+ // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write
+ // access to non-directory files, and execute access to non-directory files
+ // for which at least one execute bit is set.
+ if isDir || (ats&MayExec == 0) || (mode&0111 != 0) {
+ if creds.HasCapability(linux.CAP_DAC_OVERRIDE) {
+ return nil
+ }
+ }
+ return syserror.EACCES
+}
+
+// AccessTypesForOpenFlags returns the access types that must be permitted in
+// order to open a file with the given OpenOptions.Flags. Note that this is
+// NOT the same thing as the set of accesses permitted on the opened file:
+//
+// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it
+// mutates the file), but does not permit the opener to write to the file
+// thereafter.
+//
+// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in
+// flags to mean: check for read and write permission on the file and return a
+// file descriptor that can't be used for reading or writing." - open(2). Thus
+// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but
+// filesystems are responsible for ensuring that access is denied.
+//
+// To decide what an already-opened file may do, use
+// May{Read,Write}FileWithOpenFlags() instead.
+func AccessTypesForOpenFlags(flags uint32) AccessTypes {
+	accMode := flags & linux.O_ACCMODE
+	if accMode == linux.O_WRONLY {
+		return MayWrite
+	}
+	if accMode == linux.O_RDONLY && flags&linux.O_TRUNC == 0 {
+		return MayRead
+	}
+	// Remaining cases: O_RDWR, the nonstandard access mode 3, and
+	// O_RDONLY combined with O_TRUNC.
+	return MayRead | MayWrite
+}
+
+// MayReadFileWithOpenFlags returns true if a file opened with the given open
+// flags should be readable, i.e. if the access mode in flags is O_RDONLY or
+// O_RDWR.
+func MayReadFileWithOpenFlags(flags uint32) bool {
+	accMode := flags & linux.O_ACCMODE
+	return accMode == linux.O_RDONLY || accMode == linux.O_RDWR
+}
+
+// MayWriteFileWithOpenFlags returns true if a file opened with the given open
+// flags should be writable, i.e. if the access mode in flags is O_WRONLY or
+// O_RDWR.
+func MayWriteFileWithOpenFlags(flags uint32) bool {
+	accMode := flags & linux.O_ACCMODE
+	return accMode == linux.O_WRONLY || accMode == linux.O_RDWR
+}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
new file mode 100644
index 000000000..8d05c8583
--- /dev/null
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -0,0 +1,453 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "fmt"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fspath"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// ResolvingPath represents the state of an in-progress path resolution, shared
+// between VFS and FilesystemImpl methods that take a path.
+//
+// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
+// starting Dentry on the associated Filesystem (on which a reference is
+// already held) and a stream of path components relative to that Dentry.
+//
+// ResolvingPath objects are obtained from and returned to resolvingPathPool
+// by getResolvingPath() and putResolvingPath().
+//
+// ResolvingPath is loosely analogous to Linux's struct nameidata.
+type ResolvingPath struct {
+ // vfs is the VirtualFilesystem on whose behalf resolution is performed.
+ vfs *VirtualFilesystem
+ root VirtualDentry // refs borrowed from PathOperation
+ // mount and start are the Mount and Dentry at which the current
+ // FilesystemImpl method starts. Whether references are held on them is
+ // recorded in flags (rpflagsHaveMountRef and rpflagsHaveStartRef).
+ mount *Mount
+ start *Dentry
+ // pit is the iterator for the current path component.
+ pit fspath.Iterator
+
+ // flags is a bitmask of rpflags* constants.
+ flags uint16
+ mustBeDir bool // final file must be a directory?
+ // mustBeDirOrig is the value of mustBeDir saved by relpathCommit() and
+ // restored by Restart().
+ mustBeDirOrig bool
+ symlinks uint8 // number of symlinks traversed
+ // symlinksOrig is the value of symlinks saved by relpathCommit() and
+ // restored by Restart().
+ symlinksOrig uint8
+ curPart uint8 // index into parts
+ // numOrigParts is the number of valid entries in origParts.
+ numOrigParts uint8
+
+ // creds are the credentials supplied to getResolvingPath().
+ creds *auth.Credentials
+
+ // Data associated with resolve*Errors, stored in ResolvingPath so that
+ // those errors don't need to allocate.
+ nextMount *Mount // ref held if not nil
+ nextStart *Dentry // ref held if not nil
+ absSymlinkTarget fspath.Path
+
+ // ResolvingPath must track up to two relative paths: the "current"
+ // relative path, which is updated whenever a relative symlink is
+ // encountered, and the "original" relative path, which is updated from the
+ // current relative path by handleError() when resolution must change
+ // filesystems (due to reaching a mount boundary or absolute symlink) and
+ // overwrites the current relative path when Restart() is called.
+ parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
+ origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
+}
+
+// Bits in ResolvingPath.flags.
+const (
+ rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount?
+ rpflagsHaveStartRef // do we hold a reference on start?
+ rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink
+)
+
+// init verifies that the uint8 indices used by ResolvingPath (curPart and
+// numOrigParts) can address every element of ResolvingPath.parts.
+func init() {
+	maxParts := len(ResolvingPath{}.parts)
+	if maxParts <= 255 {
+		return
+	}
+	panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts))
+}
+
+// Error types that communicate state from the FilesystemImpl-caller,
+// VFS-callee side of path resolution (i.e. errors returned by
+// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side
+// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs
+// rather than error values because Go doesn't support non-primitive constants,
+// so error "constants" are really mutable vars, necessitating somewhat
+// expensive interface object comparisons.
+
+// resolveMountRootError is returned by ResolvingPath.ResolveParent() when
+// resolution must continue from the mount point of the current Mount.
+type resolveMountRootError struct{}
+
+// Error implements error.Error.
+func (resolveMountRootError) Error() string {
+	const msg = "resolving mount root"
+	return msg
+}
+
+// resolveMountPointError is returned by ResolvingPath.ResolveParent() and
+// ResolvingPath.ResolveChild() when the resolved Dentry is a mount point and
+// resolution must continue in the Mount found there.
+type resolveMountPointError struct{}
+
+// Error implements error.Error.
+func (resolveMountPointError) Error() string {
+	const msg = "resolving mount point"
+	return msg
+}
+
+// resolveAbsSymlinkError is returned by ResolvingPath.HandleSymlink() when
+// the symlink target is absolute and resolution must continue from the root.
+type resolveAbsSymlinkError struct{}
+
+// Error implements error.Error.
+func (resolveAbsSymlinkError) Error() string {
+	const msg = "resolving absolute symlink"
+	return msg
+}
+
+// resolvingPathPool recycles ResolvingPath objects between path operations;
+// see getResolvingPath() and putResolvingPath().
+var resolvingPathPool = sync.Pool{
+	New: func() interface{} { return new(ResolvingPath) },
+}
+
+// getResolvingPath returns a ResolvingPath, taken from resolvingPathPool, that
+// is initialized to resolve pop.Pathname starting at pop.Start with the given
+// credentials. It returns an error if pop.Pathname cannot be parsed.
+//
+// No references are taken: the returned ResolvingPath borrows pop.Root and
+// pop.Start (flags records that no mount or start reference is held). The
+// caller must release the ResolvingPath with putResolvingPath().
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
+	path, err := fspath.Parse(pop.Pathname)
+	if err != nil {
+		return nil, err
+	}
+	rp := resolvingPathPool.Get().(*ResolvingPath)
+	rp.vfs = vfs
+	rp.root = pop.Root
+	rp.mount = pop.Start.mount
+	rp.start = pop.Start.dentry
+	rp.pit = path.Begin
+	rp.flags = 0
+	if pop.FollowFinalSymlink {
+		rp.flags |= rpflagsFollowFinalSymlink
+	}
+	rp.mustBeDir = path.Dir
+	rp.mustBeDirOrig = path.Dir
+	rp.symlinks = 0
+	// rp may be a recycled object. Restart() copies symlinksOrig back into
+	// symlinks, so a value left over from a previous resolution would make
+	// this one start with a non-zero symlink count.
+	rp.symlinksOrig = 0
+	rp.curPart = 0
+	rp.numOrigParts = 1
+	rp.creds = creds
+	rp.parts[0] = path.Begin
+	rp.origParts[0] = path.Begin
+	return rp, nil
+}
+
+// putResolvingPath releases every reference held by rp and returns rp to
+// resolvingPathPool. rp must not be used afterwards.
+func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
+	// The root references were only borrowed, so just forget them.
+	rp.root = VirtualDentry{}
+	// Drop the start and mount references (if held) before clearing the
+	// pointers they are dropped through.
+	rp.decRefStartAndMount()
+	rp.mount, rp.start = nil, nil
+	rp.releaseErrorState()
+	resolvingPathPool.Put(rp)
+}
+
+// decRefStartAndMount drops the references on rp.start and rp.mount that
+// rp.flags records as held. It does not modify rp.flags, rp.start or
+// rp.mount.
+func (rp *ResolvingPath) decRefStartAndMount() {
+	haveStartRef := rp.flags&rpflagsHaveStartRef != 0
+	haveMountRef := rp.flags&rpflagsHaveMountRef != 0
+	// The start Dentry is released first, since doing so needs rp.mount.fs.
+	if haveStartRef {
+		rp.start.decRef(rp.mount.fs)
+	}
+	if haveMountRef {
+		rp.mount.decRef()
+	}
+}
+
+// releaseErrorState drops the references held on rp.nextStart and
+// rp.nextMount, if any, and clears both fields.
+func (rp *ResolvingPath) releaseErrorState() {
+	// nextStart goes first, since releasing it needs nextMount.fs.
+	if start := rp.nextStart; start != nil {
+		start.decRef(rp.nextMount.fs)
+		rp.nextStart = nil
+	}
+	if mnt := rp.nextMount; mnt != nil {
+		mnt.decRef()
+		rp.nextMount = nil
+	}
+}
+
+// VirtualFilesystem returns the containing VirtualFilesystem, i.e. the one
+// that rp was obtained from.
+func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem {
+ return rp.vfs
+}
+
+// Credentials returns the credentials of rp's provider, i.e. those supplied
+// when rp was created for the current path operation.
+func (rp *ResolvingPath) Credentials() *auth.Credentials {
+ return rp.creds
+}
+
+// Mount returns the Mount on which path resolution is currently occurring. It
+// does not take a reference on the returned Mount. The returned Mount can
+// change when resolution crosses a mount boundary or follows an absolute
+// symlink.
+func (rp *ResolvingPath) Mount() *Mount {
+ return rp.mount
+}
+
+// Start returns the starting Dentry represented by rp, i.e. the Dentry that
+// the remaining path components are relative to. It does not take a
+// reference on the returned Dentry.
+func (rp *ResolvingPath) Start() *Dentry {
+ return rp.start
+}
+
+// Done returns true if there are no remaining path components in the stream
+// represented by rp.
+func (rp *ResolvingPath) Done() bool {
+ // We don't need to check for rp.curPart == 0 because rp.Advance() won't
+ // set rp.pit to a terminal iterator otherwise: when a path segment other
+ // than the outermost one is exhausted, Advance() moves on to the enclosing
+ // segment instead.
+ return !rp.pit.Ok()
+}
+
+// Final returns true if the current path component is the last one in the
+// stream represented by rp, i.e. exactly one component remains.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Final() bool {
+	if rp.curPart != 0 {
+		// Components of an enclosing path segment are still pending.
+		return false
+	}
+	return !rp.pit.NextOk()
+}
+
+// Component returns the path component that the stream represented by rp is
+// currently positioned at.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Component() string {
+	if checkInvariants && !rp.pit.Ok() {
+		panic("ResolvingPath.Component() called at end of relative path")
+	}
+	return rp.pit.String()
+}
+
+// Advance advances the stream of path components represented by rp to the
+// next component. If the current path segment (e.g. a symlink target that was
+// prepended by relpathPrepend()) is exhausted, Advance resumes the enclosing
+// segment.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) Advance() {
+	if checkInvariants {
+		if !rp.pit.Ok() {
+			panic("ResolvingPath.Advance() called at end of relative path")
+		}
+	}
+	next := rp.pit.Next()
+	if next.Ok() || rp.curPart == 0 { // have next component, or at end of path
+		rp.pit = next
+		return
+	}
+	// At end of path segment, continue with the enclosing one.
+	// relpathPrepend() saves the enclosing segment's iterator in
+	// rp.parts[rp.curPart] *before* incrementing rp.curPart, so after the
+	// decrement it is found at rp.parts[rp.curPart]. (Indexing with
+	// rp.curPart-1 would read the wrong slot, and would wrap around the uint8
+	// index and panic when rp.curPart reaches 0.)
+	rp.curPart--
+	rp.pit = rp.parts[rp.curPart]
+}
+
+// Restart rewinds the stream of path components represented by rp to the
+// state it had on entry to the current FilesystemImpl method (the state last
+// saved by relpathCommit()), and releases any pending error state.
+func (rp *ResolvingPath) Restart() {
+	n := rp.numOrigParts
+	copy(rp.parts[:], rp.origParts[:n])
+	rp.curPart = n - 1
+	rp.pit = rp.origParts[n-1]
+	rp.mustBeDir = rp.mustBeDirOrig
+	rp.symlinks = rp.symlinksOrig
+	rp.releaseErrorState()
+}
+
+// relpathCommit saves the current relative path (rp.parts[:rp.curPart]
+// followed by rp.pit), together with mustBeDir and symlinks, as the
+// "original" state that Restart() restores.
+func (rp *ResolvingPath) relpathCommit() {
+	cur := rp.curPart
+	copy(rp.origParts[:cur], rp.parts[:])
+	rp.origParts[cur] = rp.pit
+	rp.numOrigParts = cur + 1
+	rp.mustBeDirOrig = rp.mustBeDir
+	rp.symlinksOrig = rp.symlinks
+}
+
+// ResolveParent returns the VFS parent of d. It does not take a reference on
+// the returned Dentry.
+//
+// Preconditions: There are no concurrent mutators of d.
+//
+// Postconditions: If the returned error is nil, then the returned Dentry is
+// not nil.
+func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
+ var parent *Dentry
+ if d == rp.root.dentry && rp.mount == rp.root.mount {
+ // At contextual VFS root.
+ parent = d
+ } else if d == rp.mount.root {
+ // At mount root ...
+ mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root)
+ if mnt != nil {
+ // ... of non-root mount.
+ rp.nextMount = mnt
+ rp.nextStart = mntpt
+ return nil, resolveMountRootError{}
+ }
+ // ... of root mount.
+ parent = d
+ } else if d.parent == nil {
+ // At filesystem root.
+ parent = d
+ } else {
+ parent = d.parent
+ }
+ if parent.isMounted() {
+ if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil {
+ rp.nextMount = mnt
+ return nil, resolveMountPointError{}
+ }
+ }
+ return parent, nil
+}
+
+// ResolveChild returns the VFS child of d with the given name. It does not
+// take a reference on the returned Dentry. If no such child exists,
+// ResolveChild returns (nil, nil).
+//
+// If the child is a mount point, ResolveChild records the Mount found there
+// in rp and returns resolveMountPointError for the VFS to handle.
+//
+// Preconditions: There are no concurrent mutators of d.
+func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) {
+	child := d.children[name]
+	switch {
+	case child == nil:
+		return nil, nil
+	case !child.isMounted():
+		return child, nil
+	}
+	mnt := rp.vfs.getMountAt(rp.mount, child)
+	if mnt == nil {
+		return child, nil
+	}
+	rp.nextMount = mnt
+	return nil, resolveMountPointError{}
+}
+
+// ResolveComponent returns the Dentry reached by starting at d and resolving
+// the current path component in the stream represented by rp: d itself for
+// ".", the VFS parent of d for "..", and the named child of d otherwise. It
+// does not advance the stream. It does not take a reference on the returned
+// Dentry. If no such Dentry exists, ResolveComponent returns (nil, nil).
+//
+// Preconditions: !rp.Done(). There are no concurrent mutators of d.
+func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) {
+	name := rp.Component()
+	if name == "." {
+		return d, nil
+	}
+	if name == ".." {
+		return rp.ResolveParent(d)
+	}
+	return rp.ResolveChild(d, name)
+}
+
+// ShouldFollowSymlink returns true if, supposing that the current path
+// component in rp represents a symbolic link, the symbolic link should be
+// followed.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) ShouldFollowSymlink() bool {
+	if rp.flags&rpflagsFollowFinalSymlink != 0 {
+		// Every symlink is followed, including a final one.
+		return true
+	}
+	// Non-final symlinks are always followed.
+	return !rp.Final()
+}
+
+// HandleSymlink is called when the current path component is a symbolic link
+// to the given target. If the calling Filesystem method should continue path
+// traversal, HandleSymlink updates the path component stream to reflect the
+// symlink target and returns nil. Otherwise it returns a non-nil error:
+// syserror.ELOOP if too many symlinks have been traversed, the parse error if
+// target cannot be parsed, or resolveAbsSymlinkError if target is absolute.
+//
+// Preconditions: !rp.Done().
+func (rp *ResolvingPath) HandleSymlink(target string) error {
+	if rp.symlinks >= linux.MaxSymlinkTraversals {
+		return syserror.ELOOP
+	}
+	targetPath, err := fspath.Parse(target)
+	if err != nil {
+		return err
+	}
+	rp.symlinks++
+	switch {
+	case targetPath.Absolute:
+		// The VFS must restart resolution from the root; remember the
+		// target for handleError().
+		rp.absSymlinkTarget = targetPath
+		return resolveAbsSymlinkError{}
+	case !targetPath.Begin.Ok():
+		panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
+	}
+	// Consume the path component that represented the symlink, then put the
+	// symlink target in front of whatever remains of the relative path.
+	rp.Advance()
+	rp.relpathPrepend(targetPath)
+	return nil
+}
+
+// relpathPrepend inserts path (a symlink target) in front of the remaining
+// components of the relative path represented by rp.
+//
+// Preconditions: The path component that represented the symlink has already
+// been consumed.
+func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
+	if !rp.pit.Ok() {
+		// The symlink was the final path component, so now the symlink target
+		// is the whole path.
+		rp.pit = path.Begin
+		// Symlink targets can set rp.mustBeDir (if they end in a trailing /),
+		// but can't unset it.
+		if path.Dir {
+			rp.mustBeDir = true
+		}
+		return
+	}
+	if !path.Begin.Ok() {
+		// The target has no path components (presumably an absolute target
+		// consisting only of "/"), so there is nothing to prepend: resolution
+		// simply continues with the remaining components. Pushing the
+		// remaining path and installing the terminal iterator would leave
+		// rp.pit terminal while rp.curPart != 0, which Done() and Final()
+		// assume never happens, and would lose the remaining components.
+		return
+	}
+	// Save the remaining path as an enclosing segment and continue with the
+	// symlink target; Advance() resumes the saved segment once the target is
+	// exhausted.
+	rp.parts[rp.curPart] = rp.pit
+	rp.pit = path.Begin
+	rp.curPart++
+}
+
+// handleError is called by VFS methods when a FilesystemImpl method returns a
+// non-nil error. If err is one of the resolve*Error types, handleError
+// switches rp to the Mount and starting Dentry at which resolution must
+// continue, commits the progress made through the relative path, and returns
+// true; the caller should then retry the operation on the new rp.mount.
+// Otherwise handleError leaves rp unchanged and returns false, meaning that
+// err must be returned to the caller's caller.
+func (rp *ResolvingPath) handleError(err error) bool {
+ switch err.(type) {
+ case resolveMountRootError:
+ // Switch to the new Mount. We hold references on the Mount and Dentry
+ // (from VFS.getMountpointAt()).
+ rp.decRefStartAndMount()
+ rp.mount = rp.nextMount
+ rp.start = rp.nextStart
+ rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef
+ rp.nextMount = nil
+ rp.nextStart = nil
+ // Commit the previous FilesystemImpl's progress through the relative
+ // path. (Don't consume the path component that caused us to traverse
+ // through the mount root - i.e. the ".." - because we still need to
+ // resolve the mount point's parent in the new FilesystemImpl.)
+ rp.relpathCommit()
+ // Restart path resolution on the new Mount. Don't bother calling
+ // rp.releaseErrorState() since we already set nextMount and nextStart
+ // to nil above.
+ return true
+
+ case resolveMountPointError:
+ // Switch to the new Mount. We hold a reference on the Mount (from
+ // VFS.getMountAt()), but borrow the reference on the mount root from
+ // the Mount.
+ rp.decRefStartAndMount()
+ rp.mount = rp.nextMount
+ rp.start = rp.nextMount.root
+ // Clear the start-ref bit and set the mount-ref bit (&^ binds tighter
+ // than |).
+ rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef
+ rp.nextMount = nil
+ // Consume the path component that represented the mount point.
+ rp.Advance()
+ // Commit the previous FilesystemImpl's progress through the relative
+ // path.
+ rp.relpathCommit()
+ // Restart path resolution on the new Mount.
+ rp.releaseErrorState()
+ return true
+
+ case resolveAbsSymlinkError:
+ // Switch to the new Mount. References are borrowed from rp.root.
+ rp.decRefStartAndMount()
+ rp.mount = rp.root.mount
+ rp.start = rp.root.dentry
+ rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef
+ // Consume the path component that represented the symlink.
+ rp.Advance()
+ // Prepend the symlink target to the relative path.
+ rp.relpathPrepend(rp.absSymlinkTarget)
+ // Commit the previous FilesystemImpl's progress through the relative
+ // path, including the symlink target we just prepended.
+ rp.relpathCommit()
+ // Restart path resolution on the new Mount.
+ rp.releaseErrorState()
+ return true
+
+ default:
+ // Not an error we can handle.
+ return false
+ }
+}
+
+// MustBeDir returns true if the file traversed by rp must be a directory.
+// This is the case if the path being resolved has a trailing slash, if a
+// final symlink's target does, or if the VFS has set it explicitly (as
+// OpenAt does for O_DIRECTORY).
+func (rp *ResolvingPath) MustBeDir() bool {
+ return rp.mustBeDir
+}
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
new file mode 100644
index 000000000..23f2b9e08
--- /dev/null
+++ b/pkg/sentry/vfs/syscalls.go
@@ -0,0 +1,217 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+ // Root is the VFS root. References on Root are borrowed from the provider
+ // of the PathOperation.
+ //
+ // Invariants: Root.Ok().
+ Root VirtualDentry
+
+ // Start is the starting point for the path traversal. References on Start
+ // are borrowed from the provider of the PathOperation (i.e. the caller of
+ // the VFS method to which the PathOperation was passed).
+ //
+ // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+ Start VirtualDentry
+
+ // Pathname is the pathname traversed by this operation.
+ Pathname string
+
+ // If FollowFinalSymlink is true, and the Dentry traversed by the final
+ // path component represents a symbolic link, the symbolic link should be
+ // followed.
+ //
+ // Note that VirtualFilesystem.OpenAt() sets FollowFinalSymlink to false
+ // in the caller's PathOperation when O_NOFOLLOW is specified.
+ FollowFinalSymlink bool
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return VirtualDentry{}, err
+	}
+	for {
+		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+		if err != nil {
+			if rp.handleError(err) {
+				// Resolution moved to another Mount; retry there.
+				continue
+			}
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, err
+		}
+		vd := VirtualDentry{mount: rp.mount, dentry: d}
+		// Take the Mount reference for vd before rp releases its own.
+		rp.mount.incRef()
+		vfs.putResolvingPath(rp)
+		return vd, nil
+	}
+}
+
+// MkdirAt creates a directory at the given path.
+//
+// MkdirAt masks opts.Mode in place, so the caller's MkdirOptions is modified.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+	// also honored." - mkdir(2)
+	opts.Mode &= 01777
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+		if err != nil && rp.handleError(err) {
+			// Resolution moved to another Mount; retry there.
+			continue
+		}
+		// Either success (err == nil) or an error that cannot be handled.
+		vfs.putResolvingPath(rp)
+		return err
+	}
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+//
+// OpenAt normalizes opts.Flags and opts.Mode in place, and clears
+// pop.FollowFinalSymlink if O_NOFOLLOW is specified, so the caller's
+// OpenOptions and PathOperation may be modified.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	// Flags that are passed through to filesystem implementations. This
+	// removes:
+	//
+	// - O_LARGEFILE, which we always report in FileDescription status flags
+	// since only 64-bit architectures are supported at this time.
+	//
+	// - O_CLOEXEC, which affects file descriptors and therefore must be
+	// handled outside of VFS.
+	//
+	// - Unknown flags.
+	allowedFlags := uint32(linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE)
+	opts.Flags &= allowedFlags
+	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+	if opts.Flags&linux.O_SYNC != 0 {
+		opts.Flags |= linux.O_DSYNC
+	}
+	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
+	// filesystem implementations that do not support it), and must not be
+	// combined with O_CREAT.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		switch {
+		case opts.Flags&linux.O_DIRECTORY == 0,
+			opts.Flags&linux.O_CREAT != 0,
+			opts.Flags&linux.O_ACCMODE == linux.O_RDONLY:
+			return nil, syserror.EINVAL
+		}
+	}
+	// O_PATH causes most other flags to be ignored.
+	if opts.Flags&linux.O_PATH != 0 {
+		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+	}
+	// "On Linux, the following bits are also honored in mode: [S_ISUID,
+	// S_ISGID, S_ISVTX]" - open(2)
+	opts.Mode &= 07777
+
+	// This must happen before getResolvingPath(), which reads
+	// pop.FollowFinalSymlink.
+	if opts.Flags&linux.O_NOFOLLOW != 0 {
+		pop.FollowFinalSymlink = false
+	}
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Flags&linux.O_DIRECTORY != 0 {
+		// Set the saved copy as well so that the requirement survives
+		// rp.Restart().
+		rp.mustBeDir = true
+		rp.mustBeDirOrig = true
+	}
+	for {
+		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+		if err != nil {
+			if rp.handleError(err) {
+				// Resolution moved to another Mount; retry there.
+				continue
+			}
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+		vfs.putResolvingPath(rp)
+		return fd, nil
+	}
+}
+
+// StatAt returns metadata for the file at the given path. If an error is
+// returned, the returned Statx is the zero value.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	for {
+		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+		if err != nil {
+			if rp.handleError(err) {
+				// Resolution moved to another Mount; retry there.
+				continue
+			}
+			vfs.putResolvingPath(rp)
+			return linux.Statx{}, err
+		}
+		vfs.putResolvingPath(rp)
+		return stat, nil
+	}
+}
+
+// StatusFlags returns file description status flags, as reported by the
+// FileDescriptionImpl, with O_LARGEFILE always set.
+func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	implFlags, err := fd.impl.StatusFlags(ctx)
+	return implFlags | linux.O_LARGEFILE, err
+}
+
+// SetStatusFlags sets file description status flags. The flags are passed to
+// the FileDescriptionImpl unmodified.
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ return fd.impl.SetStatusFlags(ctx, flags)
+}
+
+// TODO:
+//
+// - VFS.SyncAllFilesystems() for sync(2)
+//
+// - Something for syncfs(2)
+//
+// - VFS.LinkAt()
+//
+// - VFS.MknodAt()
+//
+// - VFS.ReadlinkAt()
+//
+// - VFS.RenameAt()
+//
+// - VFS.RmdirAt()
+//
+// - VFS.SetStatAt()
+//
+// - VFS.StatFSAt()
+//
+// - VFS.SymlinkAt()
+//
+// - VFS.UnlinkAt()
+//
+// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
new file mode 100644
index 000000000..4a8a69540
--- /dev/null
+++ b/pkg/sentry/vfs/vfs.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package vfs implements a virtual filesystem layer.
+//
+// Lock order:
+//
+// Filesystem implementation locks
+// VirtualFilesystem.mountMu
+// VirtualFilesystem.fsTypesMu
+package vfs
+
+import (
+ "sync"
+)
+
+// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
+//
+// There is no analogue to the VirtualFilesystem type in Linux, as the
+// equivalent state in Linux is global.
+//
+// Use New() to obtain a VirtualFilesystem; it allocates the mountpoints and
+// fsTypes maps and initializes mounts.
+type VirtualFilesystem struct {
+ // mountMu serializes mount mutations.
+ //
+ // mountMu is analogous to Linux's namespace_sem.
+ mountMu sync.RWMutex
+
+ // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
+ // are uniquely namespaced, including mount parent in the key correctly
+ // handles both bind mounts and mount namespaces; Linux does the same.)
+ // Synchronization between mutators and readers is provided by mounts.seq;
+ // synchronization between mutators is provided by mountMu.
+ //
+ // mounts is used to follow mount points during path traversal. We use a
+ // single table rather than per-Dentry tables to reduce size (and therefore
+ // cache footprint) for the vast majority of Dentries that are not mount
+ // points.
+ //
+ // mounts is analogous to Linux's mount_hashtable.
+ mounts mountTable
+
+ // mountpoints maps mount points to mounts at those points in all
+ // namespaces. mountpoints is protected by mountMu.
+ //
+ // mountpoints is used to find mounts that must be unmounted due to
+ // removal of a mount point Dentry from another mount namespace. ("A file
+ // or directory that is a mount point in one namespace that is not a mount
+ // point in another namespace, may be renamed, unlinked, or removed
+ // (rmdir(2)) in the mount namespace in which it is not a mount point
+ // (subject to the usual permission checks)." - mount_namespaces(7))
+ //
+ // mountpoints is analogous to Linux's mountpoint_hashtable.
+ mountpoints map[*Dentry]map[*Mount]struct{}
+
+ // fsTypes contains all FilesystemTypes that are usable in the
+ // VirtualFilesystem, keyed by string. fsTypes is protected by fsTypesMu.
+ fsTypesMu sync.RWMutex
+ fsTypes map[string]FilesystemType
+}
+
+// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. Its
+// maps are allocated and its mount table is initialized, so it is ready for
+// use.
+func New() *VirtualFilesystem {
+	vfs := new(VirtualFilesystem)
+	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
+	vfs.fsTypes = make(map[string]FilesystemType)
+	vfs.mounts.Init()
+	return vfs
+}
+
+// A VirtualDentry represents a node in a VFS tree, by combining a Dentry
+// (which represents a node in a Filesystem's tree) and a Mount (which
+// represents the Filesystem's position in a VFS mount tree).
+//
+// VirtualDentry's semantics are similar to that of a Go interface object
+// representing a pointer: it is a copyable value type that represents
+// references to another entity. The zero value of VirtualDentry is an "empty
+// VirtualDentry", directly analogous to a nil interface object.
+// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless
+// otherwise specified, all other VirtualDentry methods require
+// VirtualDentry.Ok() == true.
+//
+// Mounts and Dentries are reference-counted, requiring that users call
+// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to
+// references on the Mount and Dentry referred to by a VirtualDentry as
+// references on the VirtualDentry itself. Unless otherwise specified, all
+// VirtualDentry methods require that a reference is held on the VirtualDentry.
+//
+// VirtualDentry is analogous to Linux's struct path.
+type VirtualDentry struct {
+ // mount is nil if and only if the VirtualDentry is empty; see Ok().
+ mount *Mount
+ // dentry is a Dentry on mount's Filesystem.
+ dentry *Dentry
+}
+
+// Ok returns true if vd is not empty, i.e. if it refers to a Mount. It does
+// not require that a reference is held.
+func (vd VirtualDentry) Ok() bool {
+ return vd.mount != nil
+}
+
+// IncRef increments the reference counts on the Mount and Dentry represented
+// by vd. The Mount's count is incremented first.
+func (vd VirtualDentry) IncRef() {
+ vd.mount.incRef()
+ vd.dentry.incRef(vd.mount.fs)
+}
+
+// DecRef decrements the reference counts on the Mount and Dentry represented
+// by vd. The Dentry's count is decremented first, i.e. in the reverse of the
+// order used by IncRef.
+func (vd VirtualDentry) DecRef() {
+ vd.dentry.decRef(vd.mount.fs)
+ vd.mount.decRef()
+}
+
+// Mount returns the Mount associated with vd. It does not take a reference on
+// the returned Mount. The result is nil if vd is empty.
+func (vd VirtualDentry) Mount() *Mount {
+ return vd.mount
+}
+
+// Dentry returns the Dentry associated with vd. It does not take a reference
+// on the returned Dentry. The result is nil if vd is the zero value.
+func (vd VirtualDentry) Dentry() *Dentry {
+ return vd.dentry
+}