diff options
32 files changed, 5851 insertions, 40 deletions
diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go index 285338e47..4b0ea33dc 100644 --- a/pkg/abi/linux/file.go +++ b/pkg/abi/linux/file.go @@ -24,25 +24,27 @@ import ( // Constants for open(2). const ( - O_ACCMODE = 00000003 - O_RDONLY = 00000000 - O_WRONLY = 00000001 - O_RDWR = 00000002 - O_CREAT = 00000100 - O_EXCL = 00000200 - O_NOCTTY = 00000400 - O_TRUNC = 00001000 - O_APPEND = 00002000 - O_NONBLOCK = 00004000 - O_DSYNC = 00010000 - O_ASYNC = 00020000 - O_DIRECT = 00040000 - O_LARGEFILE = 00100000 - O_DIRECTORY = 00200000 - O_NOFOLLOW = 00400000 - O_CLOEXEC = 02000000 - O_SYNC = 04000000 + O_ACCMODE = 000000003 + O_RDONLY = 000000000 + O_WRONLY = 000000001 + O_RDWR = 000000002 + O_CREAT = 000000100 + O_EXCL = 000000200 + O_NOCTTY = 000000400 + O_TRUNC = 000001000 + O_APPEND = 000002000 + O_NONBLOCK = 000004000 + O_DSYNC = 000010000 + O_ASYNC = 000020000 + O_DIRECT = 000040000 + O_LARGEFILE = 000100000 + O_DIRECTORY = 000200000 + O_NOFOLLOW = 000400000 + O_NOATIME = 001000000 + O_CLOEXEC = 002000000 + O_SYNC = 004000000 // __O_SYNC in Linux O_PATH = 010000000 + O_TMPFILE = 020000000 // __O_TMPFILE in Linux ) // Constants for fstatat(2). @@ -124,14 +126,23 @@ const ( // Values for mode_t. const ( - FileTypeMask = 0170000 - ModeSocket = 0140000 - ModeSymlink = 0120000 - ModeRegular = 0100000 - ModeBlockDevice = 060000 - ModeDirectory = 040000 - ModeCharacterDevice = 020000 - ModeNamedPipe = 010000 + S_IFMT = 0170000 + S_IFSOCK = 0140000 + S_IFLNK = 0120000 + S_IFREG = 0100000 + S_IFBLK = 060000 + S_IFDIR = 040000 + S_IFCHR = 020000 + S_IFIFO = 010000 + + FileTypeMask = S_IFMT + ModeSocket = S_IFSOCK + ModeSymlink = S_IFLNK + ModeRegular = S_IFREG + ModeBlockDevice = S_IFBLK + ModeDirectory = S_IFDIR + ModeCharacterDevice = S_IFCHR + ModeNamedPipe = S_IFIFO ModeSetUID = 04000 ModeSetGID = 02000 @@ -152,6 +163,19 @@ const ( PermissionsMask = 0777 ) +// Values for linux_dirent64.d_type. +const ( + DT_UNKNOWN = 0 + DT_FIFO = 1 + DT_CHR = 2 + DT_DIR = 4 + DT_BLK = 6 + DT_REG = 8 + DT_LNK = 10 + DT_SOCK = 12 + DT_WHT = 14 +) + // Values for preadv2/pwritev2. const ( RWF_HIPRI = 0x00000001 @@ -179,19 +203,6 @@ type Stat struct { _ [3]int64 } -// File types. -const ( - DT_BLK = 0x6 - DT_CHR = 0x2 - DT_DIR = 0x4 - DT_FIFO = 0x1 - DT_LNK = 0xa - DT_REG = 0x8 - DT_SOCK = 0xc - DT_UNKNOWN = 0x0 - DT_WHT = 0xe -) - // SizeOfStat is the size of a Stat struct. var SizeOfStat = binary.Size(Stat{}) @@ -222,6 +233,17 @@ const ( STATX__RESERVED = 0x80000000 ) +// Bitmasks for Statx.Attributes and Statx.AttributesMask, from +// include/uapi/linux/stat.h. +const ( + STATX_ATTR_COMPRESSED = 0x00000004 + STATX_ATTR_IMMUTABLE = 0x00000010 + STATX_ATTR_APPEND = 0x00000020 + STATX_ATTR_NODUMP = 0x00000040 + STATX_ATTR_ENCRYPTED = 0x00000800 + STATX_ATTR_AUTOMOUNT = 0x00001000 +) + // Statx represents struct statx. type Statx struct { Mask uint32 @@ -231,7 +253,6 @@ type Statx struct { UID uint32 GID uint32 Mode uint16 - _ uint16 Ino uint64 Size uint64 Blocks uint64 diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index 549e0fb93..b416e3472 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -77,6 +77,15 @@ type Statfs struct { Spare [4]uint64 } +// Whence argument to lseek(2), from include/uapi/linux/fs.h. +const ( + SEEK_SET = 0 + SEEK_CUR = 1 + SEEK_END = 2 + SEEK_DATA = 3 + SEEK_HOLE = 4 +) + // Sync_file_range flags, from include/uapi/linux/fs.h const ( SYNC_FILE_RANGE_WAIT_BEFORE = 1 diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD new file mode 100644 index 000000000..11716af81 --- /dev/null +++ b/pkg/fspath/BUILD @@ -0,0 +1,28 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) + +go_library( + name = "fspath", + srcs = [ + "builder.go", + "builder_unsafe.go", + "fspath.go", + ], + importpath = "gvisor.dev/gvisor/pkg/fspath", + deps = ["//pkg/syserror"], +) + +go_test( + name = "fspath_test", + size = "small", + srcs = [ + "builder_test.go", + "fspath_test.go", + ], + embed = [":fspath"], + deps = ["//pkg/syserror"], +) diff --git a/pkg/fspath/builder.go b/pkg/fspath/builder.go new file mode 100644 index 000000000..7ddb36826 --- /dev/null +++ b/pkg/fspath/builder.go @@ -0,0 +1,104 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "fmt" +) + +// Builder is similar to strings.Builder, but is used to produce pathnames +// given path components in reverse order (from leaf to root). This is useful +// in the common case where a filesystem is represented by a tree of named +// nodes, and the path to a given node must be produced by walking upward from +// that node to a given root. +type Builder struct { + buf []byte + start int + needSep bool +} + +// Reset resets the Builder to be empty. +func (b *Builder) Reset() { + b.start = len(b.buf) + b.needSep = false +} + +// Len returns the number of accumulated bytes. +func (b *Builder) Len() int { + return len(b.buf) - b.start +} + +func (b *Builder) needToGrow(n int) bool { + return b.start < n +} + +func (b *Builder) grow(n int) { + newLen := b.Len() + n + var newCap int + if len(b.buf) == 0 { + newCap = 64 // arbitrary + } else { + newCap = 2 * len(b.buf) + } + for newCap < newLen { + newCap *= 2 + if newCap == 0 { + panic(fmt.Sprintf("required length (%d) causes buffer size to overflow", newLen)) + } + } + newBuf := make([]byte, newCap) + copy(newBuf[newCap-b.Len():], b.buf[b.start:]) + b.start += newCap - len(b.buf) + b.buf = newBuf +} + +// PrependComponent prepends the given path component to b's buffer. A path +// separator is automatically inserted if appropriate. +func (b *Builder) PrependComponent(pc string) { + if b.needSep { + b.PrependByte('/') + } + b.PrependString(pc) + b.needSep = true +} + +// PrependString prepends the given string to b's buffer. +func (b *Builder) PrependString(str string) { + if b.needToGrow(len(str)) { + b.grow(len(str)) + } + b.start -= len(str) + copy(b.buf[b.start:], str) +} + +// PrependByte prepends the given byte to b's buffer. +func (b *Builder) PrependByte(c byte) { + if b.needToGrow(1) { + b.grow(1) + } + b.start-- + b.buf[b.start] = c +} + +// AppendString appends the given string to b's buffer. +func (b *Builder) AppendString(str string) { + if b.needToGrow(len(str)) { + b.grow(len(str)) + } + oldStart := b.start + b.start -= len(str) + copy(b.buf[b.start:], b.buf[oldStart:]) + copy(b.buf[len(b.buf)-len(str):], str) +} diff --git a/pkg/fspath/builder_test.go b/pkg/fspath/builder_test.go new file mode 100644 index 000000000..22f890273 --- /dev/null +++ b/pkg/fspath/builder_test.go @@ -0,0 +1,58 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "testing" +) + +func TestBuilder(t *testing.T) { + type testCase struct { + pcs []string // path components in reverse order + after string + want string + } + tests := []testCase{ + { + // Empty case. + }, + { + pcs: []string{"foo"}, + want: "foo", + }, + { + pcs: []string{"foo", "bar", "baz"}, + want: "baz/bar/foo", + }, + { + pcs: []string{"foo", "bar"}, + after: " (deleted)", + want: "bar/foo (deleted)", + }, + } + + for _, test := range tests { + t.Run(test.want, func(t *testing.T) { + var b Builder + for _, pc := range test.pcs { + b.PrependComponent(pc) + } + b.AppendString(test.after) + if got := b.String(); got != test.want { + t.Errorf("got %q, wanted %q", got, test.want) + } + }) + } +} diff --git a/pkg/fspath/builder_unsafe.go b/pkg/fspath/builder_unsafe.go new file mode 100644 index 000000000..75606808d --- /dev/null +++ b/pkg/fspath/builder_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "unsafe" +) + +// String returns the accumulated string. No other methods should be called +// after String. +func (b *Builder) String() string { + bs := b.buf[b.start:] + // Compare strings.Builder.String(). + return *(*string)(unsafe.Pointer(&bs)) +} diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go new file mode 100644 index 000000000..f68752560 --- /dev/null +++ b/pkg/fspath/fspath.go @@ -0,0 +1,182 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fspath provides efficient tools for working with file paths in +// Linux-compatible filesystem implementations. +package fspath + +import ( + "strings" + + "gvisor.dev/gvisor/pkg/syserror" +) + +const pathSep = '/' + +// Parse parses a pathname as described by path_resolution(7). +func Parse(pathname string) (Path, error) { + if len(pathname) == 0 { + // "... POSIX decrees that an empty pathname must not be resolved + // successfully. Linux returns ENOENT in this case." - + // path_resolution(7) + return Path{}, syserror.ENOENT + } + // Skip leading path separators. + i := 0 + for pathname[i] == pathSep { + i++ + if i == len(pathname) { + // pathname consists entirely of path separators. + return Path{ + Absolute: true, + Dir: true, + }, nil + } + } + // Skip trailing path separators. This is required by Iterator.Next. This + // loop is guaranteed to terminate with j >= 0 because otherwise the + // pathname would consist entirely of path separators, so we would have + // returned above. + j := len(pathname) - 1 + for pathname[j] == pathSep { + j-- + } + // Find the end of the first path component. + firstEnd := i + 1 + for firstEnd != len(pathname) && pathname[firstEnd] != pathSep { + firstEnd++ + } + return Path{ + Begin: Iterator{ + partialPathname: pathname[i : j+1], + end: firstEnd - i, + }, + Absolute: i != 0, + Dir: j != len(pathname)-1, + }, nil +} + +// Path contains the information contained in a pathname string. +// +// Path is copyable by value. +type Path struct { + // Begin is an iterator to the first path component in the relative part of + // the path. + // + // Path doesn't store information about path components after the first + // since this would require allocation. + Begin Iterator + + // If true, the path is absolute, such that lookup should begin at the + // filesystem root. If false, the path is relative, such that where lookup + // begins is unspecified. + Absolute bool + + // If true, the pathname contains trailing path separators, so the last + // path component must exist and resolve to a directory. + Dir bool +} + +// String returns a pathname string equivalent to p. Note that the returned +// string is not necessarily equal to the string p was parsed from; in +// particular, redundant path separators will not be present. +func (p Path) String() string { + var b strings.Builder + if p.Absolute { + b.WriteByte(pathSep) + } + sep := false + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + if sep { + b.WriteByte(pathSep) + } + b.WriteString(pit.String()) + sep = true + } + // Don't return "//" for Parse("/"). + if p.Dir && p.Begin.Ok() { + b.WriteByte(pathSep) + } + return b.String() +} + +// An Iterator represents either a path component in a Path or a terminal +// iterator indicating that the end of the path has been reached. +// +// Iterator is immutable and copyable by value. The zero value of Iterator is +// valid, and represents a terminal iterator. +type Iterator struct { + // partialPathname is a substring of the original pathname beginning at the + // start of the represented path component and ending immediately after the + // end of the last path component in the pathname. If partialPathname is + // empty, the PathnameIterator is terminal. + // + // See TestParseIteratorPartialPathnames in fspath_test.go for a worked + // example. + partialPathname string + + // end is the offset into partialPathname of the first byte after the end + // of the represented path component. + end int +} + +// Ok returns true if it is not terminal. +func (it Iterator) Ok() bool { + return len(it.partialPathname) != 0 +} + +// String returns the path component represented by it. +// +// Preconditions: it.Ok(). +func (it Iterator) String() string { + return it.partialPathname[:it.end] +} + +// Next returns an iterator to the path component after it. If it is the last +// component in the path, Next returns a terminal iterator. +// +// Preconditions: it.Ok(). +func (it Iterator) Next() Iterator { + if it.end == len(it.partialPathname) { + // End of the path. + return Iterator{} + } + // Skip path separators. Since Parse trims trailing path separators, if we + // aren't at the end of the path, there is definitely another path + // component. + i := it.end + 1 + for { + if it.partialPathname[i] != pathSep { + break + } + i++ + } + nextPartialPathname := it.partialPathname[i:] + // Find the end of this path component. + nextEnd := 1 + for nextEnd < len(nextPartialPathname) && nextPartialPathname[nextEnd] != pathSep { + nextEnd++ + } + return Iterator{ + partialPathname: nextPartialPathname, + end: nextEnd, + } +} + +// NextOk is equivalent to it.Next().Ok(), but is faster. +// +// Preconditions: it.Ok(). +func (it Iterator) NextOk() bool { + return it.end != len(it.partialPathname) +} diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go new file mode 100644 index 000000000..215b35622 --- /dev/null +++ b/pkg/fspath/fspath_test.go @@ -0,0 +1,143 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fspath + +import ( + "reflect" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/syserror" +) + +func TestParseIteratorPartialPathnames(t *testing.T) { + path, err := Parse("/foo//bar///baz////") + if err != nil { + t.Fatalf("Parse failed: %v", err) + } + // Parse strips leading slashes, and records their presence as + // Path.Absolute. + if !path.Absolute { + t.Errorf("Path.Absolute: got false, wanted true") + } + // Parse strips trailing slashes, and records their presence as Path.Dir. + if !path.Dir { + t.Errorf("Path.Dir: got false, wanted true") + } + // The first Iterator.partialPathname is the input pathname, with leading + // and trailing slashes stripped. + it := path.Begin + if want := "foo//bar///baz"; it.partialPathname != want { + t.Errorf("first Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + // Successive Iterator.partialPathnames remove the leading path component + // and following slashes, until we run out of path components and get a + // terminal Iterator. + it = it.Next() + if want := "bar///baz"; it.partialPathname != want { + t.Errorf("second Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + it = it.Next() + if want := "baz"; it.partialPathname != want { + t.Errorf("third Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + it = it.Next() + if want := ""; it.partialPathname != want { + t.Errorf("fourth Iterator.partialPathname: got %q, wanted %q", it.partialPathname, want) + } + if it.Ok() { + t.Errorf("fourth Iterator.Ok(): got true, wanted false") + } +} + +func TestParse(t *testing.T) { + type testCase struct { + pathname string + relpath []string + abs bool + dir bool + } + tests := []testCase{ + { + pathname: "/", + relpath: []string{}, + abs: true, + dir: true, + }, + { + pathname: "//", + relpath: []string{}, + abs: true, + dir: true, + }, + } + for _, sep := range []string{"/", "//"} { + for _, abs := range []bool{false, true} { + for _, dir := range []bool{false, true} { + for _, pcs := range [][]string{ + // single path component + {"foo"}, + // multiple path components, including non-UTF-8 + {".", "foo", "..", "\xe6", "bar"}, + } { + prefix := "" + if abs { + prefix = sep + } + suffix := "" + if dir { + suffix = sep + } + tests = append(tests, testCase{ + pathname: prefix + strings.Join(pcs, sep) + suffix, + relpath: pcs, + abs: abs, + dir: dir, + }) + } + } + } + } + + for _, test := range tests { + t.Run(test.pathname, func(t *testing.T) { + p, err := Parse(test.pathname) + if err != nil { + t.Fatalf("failed to parse pathname %q: %v", test.pathname, err) + } + t.Logf("pathname %q => path %q", test.pathname, p) + if p.Absolute != test.abs { + t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs) + } + if p.Dir != test.dir { + t.Errorf("path must resolve to a directory: got %v, wanted %v", p.Dir, test.dir) + } + pcs := []string{} + for pit := p.Begin; pit.Ok(); pit = pit.Next() { + pcs = append(pcs, pit.String()) + } + if !reflect.DeepEqual(pcs, test.relpath) { + t.Errorf("relative path: got %v, wanted %v", pcs, test.relpath) + } + }) + } +} + +func TestParseEmptyPathname(t *testing.T) { + p, err := Parse("") + if err != syserror.ENOENT { + t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err) + } +} diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD new file mode 100644 index 000000000..d5d4f68df --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/BUILD @@ -0,0 +1,55 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_template_instance( + name = "dentry_list", + out = "dentry_list.go", + package = "memfs", + prefix = "dentry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*Dentry", + "Linker": "*Dentry", + }, +) + +go_library( + name = "memfs", + srcs = [ + "dentry_list.go", + "directory.go", + "filesystem.go", + "memfs.go", + "regular_file.go", + "symlink.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs", + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/usermem", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) + +go_test( + name = "benchmark_test", + size = "small", + srcs = ["benchmark_test.go"], + deps = [ + ":memfs", + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/vfs", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go new file mode 100644 index 000000000..a94b17db6 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go @@ -0,0 +1,464 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package benchmark_test + +import ( + "fmt" + "runtime" + "strings" + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/context/contexttest" + "gvisor.dev/gvisor/pkg/sentry/fs" + _ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Differences from stat_benchmark: +// +// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are +// not included. +// +// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp. +// Non-MountStat benchmarks use a tmpfs root mount and no submounts. +// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a +// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus +// stat_benchmark at depth 1 does a comparable amount of work to *MountStat +// benchmarks at depth 2, and non-MountStat benchmarks at depth 3. +var depths = []int{1, 2, 3, 8, 64, 100} + +const ( + mountPointName = "tmp" + filename = "gvisor_test_temp_0_1557494568" +) + +// This is copied from syscalls/linux/sys_file.go, with the dependency on +// kernel.Task stripped out. +func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error { + var ( + d *fs.Dirent // The file. + rel *fs.Dirent // The relative directory for search (if required.) + err error + ) + + // Extract the working directory (maybe). + if len(path) > 0 && path[0] == '/' { + // Absolute path; rel can be nil. + } else if dirFD == linux.AT_FDCWD { + // Need to reference the working directory. + rel = wd + } else { + // Need to extract the given FD. + return syserror.EBADF + } + + // Lookup the node. + remainingTraversals := uint(linux.MaxSymlinkTraversals) + if resolve { + d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals) + } else { + d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals) + } + if err != nil { + return err + } + + err = fn(root, d) + d.DecRef() + return err +} + +func BenchmarkVFS1TmpfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + d := root + d.IncRef() + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + }) + } +} + +func BenchmarkVFS2MemfsStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + root := mntns.Root() + defer root.DecRef() + vd := root + vd.IncRef() + defer vd.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: name, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: filename, + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + defer fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Pathname: filePath, + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + }) + } +} + +func BenchmarkVFS1TmpfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + + // Create VFS. + tmpfsFS, ok := fs.FindFilesystem("tmpfs") + if !ok { + b.Fatalf("failed to find tmpfs filesystem type") + } + rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + mntns, err := fs.NewMountNamespace(ctx, rootInode) + if err != nil { + b.Fatalf("failed to create mount namespace: %v", err) + } + defer mntns.DecRef() + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create and mount the submount. + root := mntns.Root() + defer root.DecRef() + if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + mountPoint, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil) + if err != nil { + b.Fatalf("failed to create tmpfs submount: %v", err) + } + if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + d, err := root.Walk(ctx, root, mountPointName) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer d.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + next, err := d.Walk(ctx, root, name) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + d.DecRef() + d = next + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Create the file that will be stat'd. + file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644)) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + file.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + dirPath := false + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error { + if dirPath && !fs.IsDir(d.Inode.StableAttr) { + return syserror.ENOTDIR + } + uattr, err := d.Inode.UnstableAttr(ctx) + if err != nil { + return err + } + // Sanity check. + if uattr.Perms.User.Execute { + b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode()) + } + return nil + }) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + } + }) + } +} + +func BenchmarkVFS2MemfsMountStat(b *testing.B) { + for _, depth := range depths { + b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) { + ctx := contexttest.Context(b) + creds := auth.CredentialsFromContext(ctx) + + // Create VFS. + vfsObj := vfs.New() + vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}) + mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{}) + if err != nil { + b.Fatalf("failed to create tmpfs root mount: %v", err) + } + + var filePathBuilder strings.Builder + filePathBuilder.WriteByte('/') + + // Create the mount point. + root := mntns.Root() + defer root.DecRef() + pop := vfs.PathOperation{ + Root: root, + Start: root, + Pathname: mountPointName, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create mount point: %v", err) + } + // Save the mount point for later use. + mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount point: %v", err) + } + defer mountPoint.DecRef() + // Create and mount the submount. + if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil { + b.Fatalf("failed to mount tmpfs submount: %v", err) + } + filePathBuilder.WriteString(mountPointName) + filePathBuilder.WriteByte('/') + + // Create nested directories with given depth. + vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to mount root: %v", err) + } + defer vd.DecRef() + for i := depth; i > 0; i-- { + name := fmt.Sprintf("%d", i) + pop := vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: name, + } + if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{ + Mode: 0755, + }); err != nil { + b.Fatalf("failed to create directory %q: %v", name, err) + } + nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) + if err != nil { + b.Fatalf("failed to walk to directory %q: %v", name, err) + } + vd.DecRef() + vd = nextVD + filePathBuilder.WriteString(name) + filePathBuilder.WriteByte('/') + } + + // Verify that we didn't create any directories under the mount + // point (i.e. they were all created on the submount). + firstDirName := fmt.Sprintf("%d", depth) + if child := mountPoint.Dentry().Child(firstDirName); child != nil { + b.Fatalf("created directory %q under root mount, not submount", firstDirName) + } + + // Create the file that will be stat'd. + fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: vd, + Pathname: filename, + FollowFinalSymlink: true, + }, &vfs.OpenOptions{ + Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL, + Mode: 0644, + }) + if err != nil { + b.Fatalf("failed to create file %q: %v", filename, err) + } + fd.DecRef() + filePathBuilder.WriteString(filename) + filePath := filePathBuilder.String() + + runtime.GC() + b.ResetTimer() + for i := 0; i < b.N; i++ { + stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ + Root: root, + Start: root, + Pathname: filePath, + FollowFinalSymlink: true, + }, &vfs.StatOptions{}) + if err != nil { + b.Fatalf("stat(%q) failed: %v", filePath, err) + } + // Sanity check. + if stat.Mode&^linux.S_IFMT != 0644 { + b.Fatalf("got wrong permissions (%0o)", stat.Mode) + } + } + }) + } +} diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go new file mode 100644 index 000000000..b0c3ea39a --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/directory.go @@ -0,0 +1,178 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type directory struct { + inode Inode + + // childList is a list containing (1) child Dentries and (2) fake Dentries + // (with inode == nil) that represent the iteration position of + // directoryFDs. childList is used to support directoryFD.IterDirents() + // efficiently. childList is protected by Filesystem.mu. + childList dentryList +} + +func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode { + dir := &directory{} + dir.inode.init(dir, fs, creds, mode) + dir.inode.nlink = 2 // from "." and parent directory or ".." for root + return &dir.inode +} + +func (i *Inode) isDir() bool { + _, ok := i.impl.(*directory) + return ok +} + +type directoryFD struct { + fileDescription + vfs.DirectoryFileDescriptionDefaultImpl + + // Protected by Filesystem.mu. + iter *Dentry + off int64 +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *directoryFD) Release() { + if fd.iter != nil { + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + fs.mu.Lock() + dir.childList.Remove(fd.iter) + fs.mu.Unlock() + fd.iter = nil + } +} + +// IterDirents implements vfs.FileDescriptionImpl.IterDirents. +func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { + fs := fd.filesystem() + d := fd.vfsfd.VirtualDentry().Dentry() + + fs.mu.Lock() + defer fs.mu.Unlock() + + if fd.off == 0 { + if !cb.Handle(vfs.Dirent{ + Name: ".", + Type: linux.DT_DIR, + Ino: d.Impl().(*Dentry).inode.ino, + Off: 0, + }) { + return nil + } + fd.off++ + } + if fd.off == 1 { + parentInode := d.ParentOrSelf().Impl().(*Dentry).inode + if !cb.Handle(vfs.Dirent{ + Name: "..", + Type: parentInode.direntType(), + Ino: parentInode.ino, + Off: 1, + }) { + return nil + } + fd.off++ + } + + dir := d.Impl().(*Dentry).inode.impl.(*directory) + var child *Dentry + if fd.iter == nil { + // Start iteration at the beginning of dir. + child = dir.childList.Front() + fd.iter = &Dentry{} + } else { + // Continue iteration from where we left off. + child = fd.iter.Next() + dir.childList.Remove(fd.iter) + } + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if !cb.Handle(vfs.Dirent{ + Name: child.vfsd.Name(), + Type: child.inode.direntType(), + Ino: child.inode.ino, + Off: fd.off, + }) { + dir.childList.InsertBefore(child, fd.iter) + return nil + } + fd.off++ + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return nil +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + if whence != linux.SEEK_SET { + // TODO: Linux also allows SEEK_CUR. + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + + fd.off = offset + // Compensate for "." and "..". + var remChildren int64 + if offset < 2 { + remChildren = 0 + } else { + remChildren = offset - 2 + } + + fs := fd.filesystem() + dir := fd.inode().impl.(*directory) + + fs.mu.Lock() + defer fs.mu.Unlock() + + // Ensure that fd.iter exists and is not linked into dir.childList. + if fd.iter == nil { + fd.iter = &Dentry{} + } else { + dir.childList.Remove(fd.iter) + } + // Insert fd.iter before the remChildren'th child, or at the end of the + // list if remChildren >= number of children. + child := dir.childList.Front() + for child != nil { + // Skip other directoryFD iterators. + if child.inode != nil { + if remChildren == 0 { + dir.childList.InsertBefore(child, fd.iter) + return offset, nil + } + remChildren-- + } + child = child.Next() + } + dir.childList.PushBack(fd.iter) + return offset, nil +} diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go new file mode 100644 index 000000000..4d989eeaf --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/filesystem.go @@ -0,0 +1,542 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// stepLocked resolves rp.Component() in parent directory vfsd. +// +// stepLocked is loosely analogous to fs/namei.c:walk_component(). +// +// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode == +// vfsd.Impl().(*Dentry).inode. +func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) { + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, nil, err + } +afterSymlink: + nextVFSD, err := rp.ResolveComponent(vfsd) + if err != nil { + return nil, nil, err + } + if nextVFSD == nil { + // Since the Dentry tree is the sole source of truth for memfs, if it's + // not in the Dentry tree, it doesn't exist. + return nil, nil, syserror.ENOENT + } + nextInode := nextVFSD.Impl().(*Dentry).inode + if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, nil, err + } + goto afterSymlink // don't check the current directory again + } + rp.Advance() + return nextVFSD, nextInode, nil +} + +// walkExistingLocked resolves rp to an existing file. +// +// walkExistingLocked is loosely analogous to Linux's +// fs/namei.c:path_lookupat(). +// +// Preconditions: Filesystem.mu must be locked. +func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + for !rp.Done() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, nil, err + } + } + if rp.MustBeDir() && !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// walkParentDirLocked resolves all but the last path component of rp to an +// existing directory. It does not check that the returned directory is +// searchable by the provider of rp. +// +// walkParentDirLocked is loosely analogous to Linux's +// fs/namei.c:path_parentat(). +// +// Preconditions: Filesystem.mu must be locked. !rp.Done(). +func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) { + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, nil, err + } + } + if !inode.isDir() { + return nil, nil, syserror.ENOTDIR + } + return vfsd, inode, nil +} + +// checkCreateLocked checks that a file named rp.Component() may be created in +// directory parentVFSD, then returns rp.Component(). +// +// Preconditions: Filesystem.mu must be locked. parentInode == +// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true. +func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) { + if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil { + return "", err + } + pc := rp.Component() + if pc == "." || pc == ".." { + return "", syserror.EEXIST + } + childVFSD, err := rp.ResolveChild(parentVFSD, pc) + if err != nil { + return "", err + } + if childVFSD != nil { + return "", syserror.EEXIST + } + if parentVFSD.IsDisowned() { + return "", syserror.ENOENT + } + return pc, nil +} + +// checkDeleteLocked checks that the file represented by vfsd may be deleted. +func checkDeleteLocked(vfsd *vfs.Dentry) error { + parentVFSD := vfsd.Parent() + if parentVFSD == nil { + return syserror.EBUSY + } + if parentVFSD.IsDisowned() { + return syserror.ENOENT + } + return nil +} + +// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. +func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { + fs.mu.RLock() + defer fs.mu.RUnlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + if opts.CheckSearchable { + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + } + inode.incRef() // vfsd.IncRef(&fs.vfsfs) + return vfsd, nil +} + +// LinkAt implements vfs.FilesystemImpl.LinkAt. +func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if rp.Mount() != vd.Mount() { + return syserror.EXDEV + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + d := vd.Dentry().Impl().(*Dentry) + if d.inode.isDir() { + return syserror.EPERM + } + d.inode.incLinksLocked() + child := fs.newDentry(d.inode) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + return nil +} + +// MkdirAt implements vfs.FilesystemImpl.MkdirAt. +func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode)) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + parentInode.incLinksLocked() // from child's ".." + return nil +} + +// MknodAt implements vfs.FilesystemImpl.MknodAt. +func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + _, err = checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + // TODO: actually implement mknod + return syserror.EPERM +} + +// OpenAt implements vfs.FilesystemImpl.OpenAt. +func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { + // Filter out flags that are not supported by memfs. O_DIRECTORY and + // O_NOFOLLOW have no effect here (they're handled by VFS by setting + // appropriate bits in rp), but are returned by + // FileDescriptionImpl.StatusFlags(). + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW + + if opts.Flags&linux.O_CREAT == 0 { + fs.mu.RLock() + defer fs.mu.RUnlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return nil, err + } + return inode.open(rp, vfsd, opts.Flags, false) + } + + mustCreate := opts.Flags&linux.O_EXCL != 0 + vfsd := rp.Start() + inode := vfsd.Impl().(*Dentry).inode + fs.mu.Lock() + defer fs.mu.Unlock() + if rp.Done() { + // FIXME: ??? + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + if mustCreate { + return nil, syserror.EEXIST + } + return inode.open(rp, vfsd, opts.Flags, false) + } +afterTrailingSymlink: + // Walk to the parent directory of the last path component. + for !rp.Final() { + var err error + vfsd, inode, err = stepLocked(rp, vfsd, inode) + if err != nil { + return nil, err + } + } + if !inode.isDir() { + return nil, syserror.ENOTDIR + } + // Check for search permission in the parent directory. + if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { + return nil, err + } + // Reject attempts to open directories with O_CREAT. + if rp.MustBeDir() { + return nil, syserror.EISDIR + } + pc := rp.Component() + if pc == "." || pc == ".." { + return nil, syserror.EISDIR + } + // Determine whether or not we need to create a file. + childVFSD, err := rp.ResolveChild(vfsd, pc) + if err != nil { + return nil, err + } + if childVFSD == nil { + // Already checked for searchability above; now check for writability. + if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { + return nil, err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + defer rp.Mount().EndWrite() + // Create and open the child. + childInode := fs.newRegularFile(rp.Credentials(), opts.Mode) + child := fs.newDentry(childInode) + vfsd.InsertChild(&child.vfsd, pc) + inode.impl.(*directory).childList.PushBack(child) + return childInode.open(rp, &child.vfsd, opts.Flags, true) + } + // Open existing file or follow symlink. + if mustCreate { + return nil, syserror.EEXIST + } + childInode := childVFSD.Impl().(*Dentry).inode + if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { + // TODO: symlink traversals update access time + if err := rp.HandleSymlink(symlink.target); err != nil { + return nil, err + } + // rp.Final() may no longer be true since we now need to resolve the + // symlink target. + goto afterTrailingSymlink + } + return childInode.open(rp, childVFSD, opts.Flags, false) +} + +func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { + ats := vfs.AccessTypesForOpenFlags(flags) + if !afterCreate { + if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil { + return nil, err + } + } + switch impl := i.impl.(type) { + case *regularFile: + var fd regularFileFD + fd.flags = flags + fd.readable = vfs.MayReadFileWithOpenFlags(flags) + fd.writable = vfs.MayWriteFileWithOpenFlags(flags) + if fd.writable { + if err := rp.Mount().CheckBeginWrite(); err != nil { + return nil, err + } + // Mount.EndWrite() is called by regularFileFD.Release(). + } + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + if flags&linux.O_TRUNC != 0 { + impl.mu.Lock() + impl.data = impl.data[:0] + atomic.StoreInt64(&impl.dataLen, 0) + impl.mu.Unlock() + } + return &fd.vfsfd, nil + case *directory: + // Can't open directories writably. + if ats&vfs.MayWrite != 0 { + return nil, syserror.EISDIR + } + var fd directoryFD + fd.vfsfd.Init(&fd, rp.Mount(), vfsd) + fd.flags = flags + return &fd.vfsfd, nil + case *symlink: + // Can't open symlinks without O_PATH (which is unimplemented). + return nil, syserror.ELOOP + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. +func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { + fs.mu.RLock() + _, inode, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return "", err + } + symlink, ok := inode.impl.(*symlink) + if !ok { + return "", syserror.EINVAL + } + return symlink.target, nil +} + +// RenameAt implements vfs.FilesystemImpl.RenameAt. +func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error { + if rp.Done() { + // FIXME + return syserror.ENOENT + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + _, err = checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + // TODO: actually implement RenameAt + return syserror.EPERM +} + +// RmdirAt implements vfs.FilesystemImpl.RmdirAt. +func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(vfsd); err != nil { + return err + } + if !inode.isDir() { + return syserror.ENOTDIR + } + if vfsd.HasChildren() { + return syserror.ENOTEMPTY + } + if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + inode.decRef() + return nil +} + +// SetStatAt implements vfs.FilesystemImpl.SetStatAt. +func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { + fs.mu.RLock() + _, _, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return err + } + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement Inode.setStat + return syserror.EPERM +} + +// StatAt implements vfs.FilesystemImpl.StatAt. +func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { + fs.mu.RLock() + _, inode, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return linux.Statx{}, err + } + var stat linux.Statx + inode.statTo(&stat) + return stat, nil +} + +// StatFSAt implements vfs.FilesystemImpl.StatFSAt. +func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { + fs.mu.RLock() + _, _, err := walkExistingLocked(rp) + fs.mu.RUnlock() + if err != nil { + return linux.Statfs{}, err + } + // TODO: actually implement statfs + return linux.Statfs{}, syserror.ENOSYS +} + +// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. +func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { + if rp.Done() { + return syserror.EEXIST + } + fs.mu.Lock() + defer fs.mu.Unlock() + parentVFSD, parentInode, err := walkParentDirLocked(rp) + if err != nil { + return err + } + pc, err := checkCreateLocked(rp, parentVFSD, parentInode) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + child := fs.newDentry(fs.newSymlink(rp.Credentials(), target)) + parentVFSD.InsertChild(&child.vfsd, pc) + parentInode.impl.(*directory).childList.PushBack(child) + return nil +} + +// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. +func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { + fs.mu.Lock() + defer fs.mu.Unlock() + vfsd, inode, err := walkExistingLocked(rp) + if err != nil { + return err + } + if err := rp.Mount().CheckBeginWrite(); err != nil { + return err + } + defer rp.Mount().EndWrite() + if err := checkDeleteLocked(vfsd); err != nil { + return err + } + if inode.isDir() { + return syserror.EISDIR + } + if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil { + return err + } + inode.decLinksLocked() + return nil +} diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go new file mode 100644 index 000000000..f381e1a88 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/memfs.go @@ -0,0 +1,299 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package memfs provides a filesystem implementation that behaves like tmpfs: +// the Dentry tree is the sole source of truth for the state of the filesystem. +// +// memfs is intended primarily to demonstrate filesystem implementation +// patterns. Real uses cases for an in-memory filesystem should use tmpfs +// instead. +// +// Lock order: +// +// Filesystem.mu +// regularFileFD.offMu +// regularFile.mu +// Inode.mu +package memfs + +import ( + "fmt" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// FilesystemType implements vfs.FilesystemType. +type FilesystemType struct{} + +// Filesystem implements vfs.FilesystemImpl. +type Filesystem struct { + vfsfs vfs.Filesystem + + // mu serializes changes to the Dentry tree. + mu sync.RWMutex + + nextInoMinusOne uint64 // accessed using atomic memory operations +} + +// NewFilesystem implements vfs.FilesystemType.NewFilesystem. +func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { + var fs Filesystem + fs.vfsfs.Init(&fs) + root := fs.newDentry(fs.newDirectory(creds, 01777)) + return &fs.vfsfs, &root.vfsd, nil +} + +// Release implements vfs.FilesystemImpl.Release. +func (fs *Filesystem) Release() { +} + +// Sync implements vfs.FilesystemImpl.Sync. +func (fs *Filesystem) Sync(ctx context.Context) error { + // All filesystem state is in-memory. + return nil +} + +// Dentry implements vfs.DentryImpl. +type Dentry struct { + vfsd vfs.Dentry + + // inode is the inode represented by this Dentry. Multiple Dentries may + // share a single non-directory Inode (with hard links). inode is + // immutable. + inode *Inode + + // memfs doesn't count references on Dentries; because the Dentry tree is + // the sole source of truth, it is by definition always consistent with the + // state of the filesystem. However, it does count references on Inodes, + // because Inode resources are released when all references are dropped. + // (memfs doesn't really have resources to release, but we implement + // reference counting because tmpfs regular files will.) + + // dentryEntry (ugh) links Dentries into their parent directory.childList. + dentryEntry +} + +func (fs *Filesystem) newDentry(inode *Inode) *Dentry { + d := &Dentry{ + inode: inode, + } + d.vfsd.Init(d) + return d +} + +// IncRef implements vfs.DentryImpl.IncRef. +func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) { + d.inode.incRef() +} + +// TryIncRef implements vfs.DentryImpl.TryIncRef. +func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool { + return d.inode.tryIncRef() +} + +// DecRef implements vfs.DentryImpl.DecRef. +func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) { + d.inode.decRef() +} + +// Inode represents a filesystem object. +type Inode struct { + // refs is a reference count. refs is accessed using atomic memory + // operations. + // + // A reference is held on all Inodes that are reachable in the filesystem + // tree. For non-directories (which may have multiple hard links), this + // means that a reference is dropped when nlink reaches 0. For directories, + // nlink never reaches 0 due to the "." entry; instead, + // Filesystem.RmdirAt() drops the reference. + refs int64 + + // Inode metadata; protected by mu and accessed using atomic memory + // operations unless otherwise specified. + mu sync.RWMutex + mode uint32 // excluding file type bits, which are based on impl + nlink uint32 // protected by Filesystem.mu instead of Inode.mu + uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic + gid uint32 // auth.KGID, but ... + ino uint64 // immutable + + impl interface{} // immutable +} + +func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) { + i.refs = 1 + i.mode = uint32(mode) + i.uid = uint32(creds.EffectiveKUID) + i.gid = uint32(creds.EffectiveKGID) + i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) + // i.nlink initialized by caller + i.impl = impl +} + +// Preconditions: Filesystem.mu must be locked for writing. +func (i *Inode) incLinksLocked() { + if atomic.AddUint32(&i.nlink, 1) <= 1 { + panic("memfs.Inode.incLinksLocked() called with no existing links") + } +} + +// Preconditions: Filesystem.mu must be locked for writing. +func (i *Inode) decLinksLocked() { + if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 { + i.decRef() + } else if nlink == ^uint32(0) { // negative overflow + panic("memfs.Inode.decLinksLocked() called with no existing links") + } +} + +func (i *Inode) incRef() { + if atomic.AddInt64(&i.refs, 1) <= 1 { + panic("memfs.Inode.incRef() called without holding a reference") + } +} + +func (i *Inode) tryIncRef() bool { + for { + refs := atomic.LoadInt64(&i.refs) + if refs == 0 { + return false + } + if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { + return true + } + } +} + +func (i *Inode) decRef() { + if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { + // This is unnecessary; it's mostly to simulate what tmpfs would do. + if regfile, ok := i.impl.(*regularFile); ok { + regfile.mu.Lock() + regfile.data = nil + atomic.StoreInt64(®file.dataLen, 0) + regfile.mu.Unlock() + } + } else if refs < 0 { + panic("memfs.Inode.decRef() called without holding a reference") + } +} + +func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { + return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) +} + +// Go won't inline this function, and returning linux.Statx (which is quite +// big) means spending a lot of time in runtime.duffcopy(), so instead it's an +// output parameter. +func (i *Inode) statTo(stat *linux.Statx) { + stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO + stat.Blksize = 1 // usermem.PageSize in tmpfs + stat.Nlink = atomic.LoadUint32(&i.nlink) + stat.UID = atomic.LoadUint32(&i.uid) + stat.GID = atomic.LoadUint32(&i.gid) + stat.Mode = uint16(atomic.LoadUint32(&i.mode)) + stat.Ino = i.ino + // TODO: device number + switch impl := i.impl.(type) { + case *regularFile: + stat.Mode |= linux.S_IFREG + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(atomic.LoadInt64(&impl.dataLen)) + // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in + // a uint64 accessed using atomic memory operations to avoid taking + // locks). + stat.Blocks = allocatedBlocksForSize(stat.Size) + case *directory: + stat.Mode |= linux.S_IFDIR + case *symlink: + stat.Mode |= linux.S_IFLNK + stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS + stat.Size = uint64(len(impl.target)) + stat.Blocks = allocatedBlocksForSize(stat.Size) + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// allocatedBlocksForSize returns the number of 512B blocks needed to +// accommodate the given size in bytes, as appropriate for struct +// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block +// size is independent of the "preferred block size for I/O", struct +// stat::st_blksize and struct statx::stx_blksize.) +func allocatedBlocksForSize(size uint64) uint64 { + return (size + 511) / 512 +} + +func (i *Inode) direntType() uint8 { + switch i.impl.(type) { + case *regularFile: + return linux.DT_REG + case *directory: + return linux.DT_DIR + case *symlink: + return linux.DT_LNK + default: + panic(fmt.Sprintf("unknown inode type: %T", i.impl)) + } +} + +// fileDescription is embedded by memfs implementations of +// vfs.FileDescriptionImpl. +type fileDescription struct { + vfsfd vfs.FileDescription + + flags uint32 // status flags; immutable +} + +func (fd *fileDescription) filesystem() *Filesystem { + return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem) +} + +func (fd *fileDescription) inode() *Inode { + return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode +} + +// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags. +func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) { + return fd.flags, nil +} + +// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags. +func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + // None of the flags settable by fcntl(F_SETFL) are supported, so this is a + // no-op. + return nil +} + +// Stat implements vfs.FileDescriptionImpl.Stat. +func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { + var stat linux.Statx + fd.inode().statTo(&stat) + return stat, nil +} + +// SetStat implements vfs.FileDescriptionImpl.SetStat. +func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { + if opts.Stat.Mask == 0 { + return nil + } + // TODO: implement Inode.setStat + return syserror.EPERM +} diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go new file mode 100644 index 000000000..4a3603cc8 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/regular_file.go @@ -0,0 +1,155 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "io" + "sync" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +type regularFile struct { + inode Inode + + mu sync.RWMutex + data []byte + // dataLen is len(data), but accessed using atomic memory operations to + // avoid locking in Inode.stat(). + dataLen int64 +} + +func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode { + file := ®ularFile{} + file.inode.init(file, fs, creds, mode) + file.inode.nlink = 1 // from parent directory + return &file.inode +} + +type regularFileFD struct { + fileDescription + vfs.FileDescriptionDefaultImpl + + // These are immutable. + readable bool + writable bool + + // off is the file offset. off is accessed using atomic memory operations. + // offMu serializes operations that may mutate off. + off int64 + offMu sync.Mutex +} + +// Release implements vfs.FileDescriptionImpl.Release. +func (fd *regularFileFD) Release() { + if fd.writable { + fd.vfsfd.VirtualDentry().Mount().EndWrite() + } +} + +// PRead implements vfs.FileDescriptionImpl.PRead. +func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { + if !fd.readable { + return 0, syserror.EINVAL + } + f := fd.inode().impl.(*regularFile) + f.mu.RLock() + if offset >= int64(len(f.data)) { + f.mu.RUnlock() + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, f.data[offset:]) + f.mu.RUnlock() + return int64(n), err +} + +// Read implements vfs.FileDescriptionImpl.Read. +func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PRead(ctx, dst, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// PWrite implements vfs.FileDescriptionImpl.PWrite. +func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { + if !fd.writable { + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + srclen := src.NumBytes() + if srclen == 0 { + return 0, nil + } + f := fd.inode().impl.(*regularFile) + f.mu.Lock() + end := offset + srclen + if end < offset { + // Overflow. + f.mu.Unlock() + return 0, syserror.EFBIG + } + if end > f.dataLen { + f.data = append(f.data, make([]byte, end-f.dataLen)...) + atomic.StoreInt64(&f.dataLen, end) + } + n, err := src.CopyIn(ctx, f.data[offset:end]) + f.mu.Unlock() + return int64(n), err +} + +// Write implements vfs.FileDescriptionImpl.Write. +func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { + fd.offMu.Lock() + n, err := fd.PWrite(ctx, src, fd.off, opts) + fd.off += n + fd.offMu.Unlock() + return n, err +} + +// Seek implements vfs.FileDescriptionImpl.Seek. +func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + fd.offMu.Lock() + defer fd.offMu.Unlock() + switch whence { + case linux.SEEK_SET: + // use offset as specified + case linux.SEEK_CUR: + offset += fd.off + case linux.SEEK_END: + offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen) + default: + return 0, syserror.EINVAL + } + if offset < 0 { + return 0, syserror.EINVAL + } + fd.off = offset + return offset, nil +} + +// Sync implements vfs.FileDescriptionImpl.Sync. +func (fd *regularFileFD) Sync(ctx context.Context) error { + return nil +} diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go new file mode 100644 index 000000000..e002d1727 --- /dev/null +++ b/pkg/sentry/fsimpl/memfs/symlink.go @@ -0,0 +1,36 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package memfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +type symlink struct { + inode Inode + target string // immutable +} + +func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode { + link := &symlink{ + target: target, + } + link.inode.init(link, fs, creds, 0777) + link.inode.nlink = 1 // from parent directory + return &link.inode +} + +// O_PATH is unimplemented, so there's no way to get a FileDescription +// representing a symlink yet. diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD new file mode 100644 index 000000000..4de6c41cf --- /dev/null +++ b/pkg/sentry/vfs/BUILD @@ -0,0 +1,46 @@ +load("//tools/go_stateify:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "vfs", + srcs = [ + "context.go", + "debug.go", + "dentry.go", + "file_description.go", + "file_description_impl_util.go", + "filesystem.go", + "filesystem_type.go", + "mount.go", + "mount_unsafe.go", + "options.go", + "permissions.go", + "resolving_path.go", + "syscalls.go", + "vfs.go", + ], + importpath = "gvisor.dev/gvisor/pkg/sentry/vfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/fspath", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/memmap", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter", + "//third_party/gvsync", + ], +) + +go_test( + name = "vfs_test", + size = "small", + srcs = [ + "mount_test.go", + ], + embed = [":vfs"], +) diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md new file mode 100644 index 000000000..7847854bc --- /dev/null +++ b/pkg/sentry/vfs/README.md @@ -0,0 +1,197 @@ +# The gVisor Virtual Filesystem + +THIS PACKAGE IS CURRENTLY EXPERIMENTAL AND NOT READY OR ENABLED FOR PRODUCTION +USE. For the filesystem implementation currently used by gVisor, see the `fs` +package. + +## Implementation Notes + +### Reference Counting + +Filesystem, Dentry, Mount, MountNamespace, and FileDescription are all +reference-counted. Mount and MountNamespace are exclusively VFS-managed; when +their reference count reaches zero, VFS releases their resources. Filesystem and +FileDescription management is shared between VFS and filesystem implementations; +when their reference count reaches zero, VFS notifies the implementation by +calling `FilesystemImpl.Release()` or `FileDescriptionImpl.Release()` +respectively and then releases VFS-owned resources. Dentries are exclusively +managed by filesystem implementations; reference count changes are abstracted +through DentryImpl, which should release resources when reference count reaches +zero. + +Filesystem references are held by: + +- Mount: Each referenced Mount holds a reference on the mounted Filesystem. + +Dentry references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Dentry through which it was opened, via `FileDescription.vd.dentry`. + +- Mount: Each referenced Mount holds a reference on its mount point and on the + mounted filesystem root. The mount point is mutable (`mount(MS_MOVE)`). + +Mount references are held by: + +- FileDescription: Each referenced FileDescription holds a reference on the + Mount on which it was opened, via `FileDescription.vd.mount`. + +- Mount: Each referenced Mount holds a reference on its parent, which is the + mount containing its mount point. + +- VirtualFilesystem: A reference is held on all Mounts that are attached + (reachable by Mount traversal). + +MountNamespace and FileDescription references are held by users of VFS. The +expectation is that each `kernel.Task` holds a reference on its corresponding +MountNamespace, and each file descriptor holds a reference on its represented +FileDescription. + +Notes: + +- Dentries do not hold a reference on their owning Filesystem. Instead, all + uses of a Dentry occur in the context of a Mount, which holds a reference on + the relevant Filesystem (see e.g. the VirtualDentry type). As a corollary, + when releasing references on both a Dentry and its corresponding Mount, the + Dentry's reference must be released first (because releasing the Mount's + reference may release the last reference on the Filesystem, whose state may + be required to release the Dentry reference). + +### The Inheritance Pattern + +Filesystem, Dentry, and FileDescription are all concepts featuring both state +that must be shared between VFS and filesystem implementations, and operations +that are implementation-defined. To facilitate this, each of these three +concepts follows the same pattern, shown below for Dentry: + +```go +// Dentry represents a node in a filesystem tree. +type Dentry struct { + // VFS-required dentry state. + parent *Dentry + // ... + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation-specific details of a Dentry. +// Implementations of DentryImpl should contain their associated Dentry by +// value as their first field. +type DentryImpl interface { + // VFS-required implementation-defined dentry operations. + IncRef() + // ... +} +``` + +This construction, which is essentially a type-safe analogue to Linux's +`container_of` pattern, has the following properties: + +- VFS works almost exclusively with pointers to Dentry rather than DentryImpl + interface objects, such as in the type of `Dentry.parent`. This avoids + interface method calls (which are somewhat expensive to perform, and defeat + inlining and escape analysis), reduces the size of VFS types (since an + interface object is two pointers in size), and allows pointers to be loaded + and stored atomically using `sync/atomic`. Implementation-defined behavior + is accessed via `Dentry.impl` when required. + +- Filesystem implementations can access the implementation-defined state + associated with objects of VFS types by type-asserting or type-switching + (e.g. `Dentry.Impl().(*myDentry)`). Type assertions to a concrete type + require only an equality comparison of the interface object's type pointer + to a static constant, and are consequently very fast. + +- Filesystem implementations can access the VFS state associated with objects + of implementation-defined types directly. + +- VFS and implementation-defined state for a given type occupy the same + object, minimizing memory allocations and maximizing memory locality. `impl` + is the last field in `Dentry`, and `Dentry` is the first field in + `DentryImpl` implementations, for similar reasons: this tends to cause + fetching of the `Dentry.impl` interface object to also fetch `DentryImpl` + fields, either because they are in the same cache line or via next-line + prefetching. + +## Future Work + +- Most `mount(2)` features, and unmounting, are incomplete. + +- VFS1 filesystems are not directly compatible with VFS2. It may be possible + to implement shims that implement `vfs.FilesystemImpl` for + `fs.MountNamespace`, `vfs.DentryImpl` for `fs.Dirent`, and + `vfs.FileDescriptionImpl` for `fs.File`, which may be adequate for + filesystems that are not performance-critical (e.g. sysfs); however, it is + not clear that this will be less effort than simply porting the filesystems + in question. Practically speaking, the following filesystems will probably + need to be ported or made compatible through a shim to evaluate filesystem + performance on realistic workloads: + + - devfs/procfs/sysfs, which will realistically be necessary to execute + most applications. (Note that procfs and sysfs do not support hard + links, so they do not require the complexity of separate inode objects. + Also note that Linux's /dev is actually a variant of tmpfs called + devtmpfs.) + + - tmpfs. This should be relatively straightforward: copy/paste memfs, + store regular file contents in pgalloc-allocated memory instead of + `[]byte`, and add support for file timestamps. (In fact, it probably + makes more sense to convert memfs to tmpfs and not keep the former.) + + - A remote filesystem, either lisafs (if it is ready by the time that + other benchmarking prerequisites are) or v9fs (aka 9P, aka gofers). + + - epoll files. + + Filesystems that will need to be ported before switching to VFS2, but can + probably be skipped for early testing: + + - overlayfs, which is needed for (at least) synthetic mount points. + + - Support for host ttys. + + - timerfd files. + + Filesystems that can be probably dropped: + + - ashmem, which is far too incomplete to use. + + - binder, which is similarly far too incomplete to use. + + - whitelistfs, which we are already actively attempting to remove. + +- Save/restore. For instance, it is unclear if the current implementation of + the `state` package supports the inheritance pattern described above. + +- Many features that were previously implemented by VFS must now be + implemented by individual filesystems (though, in most cases, this should + consist of calls to hooks or libraries provided by `vfs` or other packages). + This includes, but is not necessarily limited to: + + - Block and character device special files + + - Inotify + + - File locking + + - `O_ASYNC` + +- Reference counts in the `vfs` package do not use the `refs` package since + `refs.AtomicRefCount` adds 64 bytes of overhead to each 8-byte reference + count, resulting in considerable cache bloat. 24 bytes of this overhead is + for weak reference support, which have poor performance and will not be used + by VFS2. The remaining 40 bytes is to store a descriptive string and stack + trace for reference leak checking; we can support reference leak checking + without incurring this space overhead by including the applicable + information directly in finalizers for applicable types. diff --git a/pkg/sentry/vfs/context.go b/pkg/sentry/vfs/context.go new file mode 100644 index 000000000..32cf9151b --- /dev/null +++ b/pkg/sentry/vfs/context.go @@ -0,0 +1,37 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// contextID is this package's type for context.Context.Value keys. +type contextID int + +const ( + // CtxMountNamespace is a Context.Value key for a MountNamespace. + CtxMountNamespace contextID = iota +) + +// MountNamespaceFromContext returns the MountNamespace used by ctx. It does +// not take a reference on the returned MountNamespace. If ctx is not +// associated with a MountNamespace, MountNamespaceFromContext returns nil. +func MountNamespaceFromContext(ctx context.Context) *MountNamespace { + if v := ctx.Value(CtxMountNamespace); v != nil { + return v.(*MountNamespace) + } + return nil +} diff --git a/pkg/sentry/vfs/debug.go b/pkg/sentry/vfs/debug.go new file mode 100644 index 000000000..0ed20f249 --- /dev/null +++ b/pkg/sentry/vfs/debug.go @@ -0,0 +1,22 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +const ( + // If checkInvariants is true, perform runtime checks for invariants + // expected by the vfs package. This is normally disabled since VFS is + // often a hot path. + checkInvariants = false +) diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go new file mode 100644 index 000000000..45912fc58 --- /dev/null +++ b/pkg/sentry/vfs/dentry.go @@ -0,0 +1,347 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/syserror" +) + +// Dentry represents a node in a Filesystem tree which may represent a file. +// +// Dentries are reference-counted. Unless otherwise specified, all Dentry +// methods require that a reference is held. +// +// A Dentry transitions through up to 3 different states through its lifetime: +// +// - Dentries are initially "independent". Independent Dentries have no parent, +// and consequently no name. +// +// - Dentry.InsertChild() causes an independent Dentry to become a "child" of +// another Dentry. A child node has a parent node, and a name in that parent, +// both of which are mutable by DentryMoveChild(). Each child Dentry's name is +// unique within its parent. +// +// - Dentry.RemoveChild() causes a child Dentry to become "disowned". A +// disowned Dentry can still refer to its former parent and its former name in +// said parent, but the disowned Dentry is no longer reachable from its parent, +// and a new Dentry with the same name may become a child of the parent. (This +// is analogous to a struct dentry being "unhashed" in Linux.) +// +// Dentry is loosely analogous to Linux's struct dentry, but: +// +// - VFS does not associate Dentries with inodes. gVisor interacts primarily +// with filesystems that are accessed through filesystem APIs (as opposed to +// raw block devices); many such APIs support only paths and file descriptors, +// and not inodes. Furthermore, when parties outside the scope of VFS can +// rename inodes on such filesystems, VFS generally cannot "follow" the rename, +// both due to synchronization issues and because it may not even be able to +// name the destination path; this implies that it would in fact be *incorrect* +// for Dentries to be associated with inodes on such filesystems. Consequently, +// operations that are inode operations in Linux are FilesystemImpl methods +// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do +// support inodes may store appropriate state in implementations of DentryImpl. +// +// - VFS does not provide synchronization for mutable Dentry fields, other than +// mount-related ones. +// +// - VFS does not require that Dentries are instantiated for all paths accessed +// through VFS, only those that are tracked beyond the scope of a single +// Filesystem operation. This includes file descriptions, mount points, mount +// roots, process working directories, and chroots. This avoids instantiation +// of Dentries for operations on mutable remote filesystems that can't actually +// cache any state in the Dentry. +// +// - For the reasons above, VFS is not directly responsible for managing Dentry +// lifetime. Dentry reference counts only indicate the extent to which VFS +// requires Dentries to exist; Filesystems may elect to cache or discard +// Dentries with zero references. +type Dentry struct { + // parent is this Dentry's parent in this Filesystem. If this Dentry is + // independent, parent is nil. + parent *Dentry + + // name is this Dentry's name in parent. + name string + + flags uint32 + + // mounts is the number of Mounts for which this Dentry is Mount.point. + // mounts is accessed using atomic memory operations. + mounts uint32 + + // children are child Dentries. + children map[string]*Dentry + + // impl is the DentryImpl associated with this Dentry. impl is immutable. + // This should be the last field in Dentry. + impl DentryImpl +} + +const ( + // dflagsDisownedMask is set in Dentry.flags if the Dentry has been + // disowned. + dflagsDisownedMask = 1 << iota +) + +// Init must be called before first use of d. +func (d *Dentry) Init(impl DentryImpl) { + d.impl = impl +} + +// Impl returns the DentryImpl associated with d. +func (d *Dentry) Impl() DentryImpl { + return d.impl +} + +// DentryImpl contains implementation details for a Dentry. Implementations of +// DentryImpl should contain their associated Dentry by value as their first +// field. +type DentryImpl interface { + // IncRef increments the Dentry's reference count. A Dentry with a non-zero + // reference count must remain coherent with the state of the filesystem. + IncRef(fs *Filesystem) + + // TryIncRef increments the Dentry's reference count and returns true. If + // the Dentry's reference count is zero, TryIncRef may do nothing and + // return false. (It is also permitted to succeed if it can restore the + // guarantee that the Dentry is coherent with the state of the filesystem.) + // + // TryIncRef does not require that a reference is held on the Dentry. + TryIncRef(fs *Filesystem) bool + + // DecRef decrements the Dentry's reference count. + DecRef(fs *Filesystem) +} + +// IsDisowned returns true if d is disowned. +func (d *Dentry) IsDisowned() bool { + return atomic.LoadUint32(&d.flags)&dflagsDisownedMask != 0 +} + +// Preconditions: !d.IsDisowned(). +func (d *Dentry) setDisowned() { + atomic.AddUint32(&d.flags, dflagsDisownedMask) +} + +func (d *Dentry) isMounted() bool { + return atomic.LoadUint32(&d.mounts) != 0 +} + +func (d *Dentry) incRef(fs *Filesystem) { + d.impl.IncRef(fs) +} + +func (d *Dentry) tryIncRef(fs *Filesystem) bool { + return d.impl.TryIncRef(fs) +} + +func (d *Dentry) decRef(fs *Filesystem) { + d.impl.DecRef(fs) +} + +// These functions are exported so that filesystem implementations can use +// them. The vfs package, and users of VFS, should not call these functions. +// Unless otherwise specified, these methods require that there are no +// concurrent mutators of d. + +// Name returns d's name in its parent in its owning Filesystem. If d is +// independent, Name returns an empty string. +func (d *Dentry) Name() string { + return d.name +} + +// Parent returns d's parent in its owning Filesystem. It does not take a +// reference on the returned Dentry. If d is independent, Parent returns nil. +func (d *Dentry) Parent() *Dentry { + return d.parent +} + +// ParentOrSelf is equivalent to Parent, but returns d if d is independent. +func (d *Dentry) ParentOrSelf() *Dentry { + if d.parent == nil { + return d + } + return d.parent +} + +// Child returns d's child with the given name in its owning Filesystem. It +// does not take a reference on the returned Dentry. If no such child exists, +// Child returns nil. +func (d *Dentry) Child(name string) *Dentry { + return d.children[name] +} + +// HasChildren returns true if d has any children. +func (d *Dentry) HasChildren() bool { + return len(d.children) != 0 +} + +// InsertChild makes child a child of d with the given name. +// +// InsertChild is a mutator of d and child. +// +// Preconditions: child must be an independent Dentry. d and child must be from +// the same Filesystem. d must not already have a child with the given name. +func (d *Dentry) InsertChild(child *Dentry, name string) { + if checkInvariants { + if _, ok := d.children[name]; ok { + panic(fmt.Sprintf("parent already contains a child named %q", name)) + } + if child.parent != nil || child.name != "" { + panic(fmt.Sprintf("child is not independent: parent = %v, name = %q", child.parent, child.name)) + } + } + if d.children == nil { + d.children = make(map[string]*Dentry) + } + d.children[name] = child + child.parent = d + child.name = name +} + +// PrepareDeleteDentry must be called before attempting to delete the file +// represented by d. If PrepareDeleteDentry succeeds, the caller must call +// AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. +// +// Preconditions: d is a child Dentry. +func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { + if checkInvariants { + if d.parent == nil { + panic("d is independent") + } + if d.IsDisowned() { + panic("d is already disowned") + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[d]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortDeleteDentry or CommitDeleteDentry. + return nil +} + +// AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion +// fails. +func (vfs *VirtualFilesystem) AbortDeleteDentry() { + vfs.mountMu.RUnlock() +} + +// CommitDeleteDentry must be called after the file represented by d is +// deleted, and causes d to become disowned. +// +// Preconditions: PrepareDeleteDentry was previously called on d. +func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) { + delete(d.parent.children, d.name) + d.setDisowned() + // TODO: lazily unmount mounts at d + vfs.mountMu.RUnlock() +} + +// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as +// appropriate for in-memory filesystems that don't need to ensure that some +// external state change succeeds before committing the deletion. +func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error { + if err := vfs.PrepareDeleteDentry(mntns, d); err != nil { + return err + } + vfs.CommitDeleteDentry(d) + return nil +} + +// PrepareRenameDentry must be called before attempting to rename the file +// represented by from. If to is not nil, it represents the file that will be +// replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the +// caller must call AbortRenameDentry, CommitRenameReplaceDentry, or +// CommitRenameExchangeDentry depending on the rename's outcome. +// +// Preconditions: from is a child Dentry. If to is not nil, it must be a child +// Dentry from the same Filesystem. +func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { + if checkInvariants { + if from.parent == nil { + panic("from is independent") + } + if from.IsDisowned() { + panic("from is already disowned") + } + if to != nil { + if to.parent == nil { + panic("to is independent") + } + if to.IsDisowned() { + panic("to is already disowned") + } + } + } + vfs.mountMu.RLock() + if _, ok := mntns.mountpoints[from]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + if to != nil { + if _, ok := mntns.mountpoints[to]; ok { + vfs.mountMu.RUnlock() + return syserror.EBUSY + } + } + // Return with vfs.mountMu locked, which will be unlocked by + // AbortRenameDentry, CommitRenameReplaceDentry, or + // CommitRenameExchangeDentry. + return nil +} + +// AbortRenameDentry must be called after PrepareRenameDentry if the rename +// fails. +func (vfs *VirtualFilesystem) AbortRenameDentry() { + vfs.mountMu.RUnlock() +} + +// CommitRenameReplaceDentry must be called after the file represented by from +// is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file +// that was replaced by from. +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +// newParent.Child(newName) == to. +func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) { + if to != nil { + to.setDisowned() + // TODO: lazily unmount mounts at d + } + if newParent.children == nil { + newParent.children = make(map[string]*Dentry) + } + newParent.children[newName] = from + from.parent = newParent + from.name = newName + vfs.mountMu.RUnlock() +} + +// CommitRenameExchangeDentry must be called after the files represented by +// from and to are exchanged by rename(RENAME_EXCHANGE). +// +// Preconditions: PrepareRenameDentry was previously called on from and to. +func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { + from.parent, to.parent = to.parent, from.parent + from.name, to.name = to.name, from.name + from.parent.children[from.name] = from + to.parent.children[to.name] = to + vfs.mountMu.RUnlock() +} diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go new file mode 100644 index 000000000..86bde7fb3 --- /dev/null +++ b/pkg/sentry/vfs/file_description.go @@ -0,0 +1,213 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// A FileDescription represents an open file description, which is the entity +// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File +// Description"). +// +// FileDescriptions are reference-counted. Unless otherwise specified, all +// FileDescription methods require that a reference is held. +// +// FileDescription is analogous to Linux's struct file. +type FileDescription struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // vd is the filesystem location at which this FileDescription was opened. + // A reference is held on vd. vd is immutable. + vd VirtualDentry + + // impl is the FileDescriptionImpl associated with this Filesystem. impl is + // immutable. This should be the last field in FileDescription. + impl FileDescriptionImpl +} + +// Init must be called before first use of fd. It takes references on mnt and +// d. +func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) { + fd.refs = 1 + fd.vd = VirtualDentry{ + mount: mnt, + dentry: d, + } + fd.vd.IncRef() + fd.impl = impl +} + +// Impl returns the FileDescriptionImpl associated with fd. +func (fd *FileDescription) Impl() FileDescriptionImpl { + return fd.impl +} + +// VirtualDentry returns the location at which fd was opened. It does not take +// a reference on the returned VirtualDentry. +func (fd *FileDescription) VirtualDentry() VirtualDentry { + return fd.vd +} + +// IncRef increments fd's reference count. +func (fd *FileDescription) IncRef() { + atomic.AddInt64(&fd.refs, 1) +} + +// DecRef decrements fd's reference count. +func (fd *FileDescription) DecRef() { + if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 { + fd.impl.Release() + fd.vd.DecRef() + } else if refs < 0 { + panic("FileDescription.DecRef() called without holding a reference") + } +} + +// FileDescriptionImpl contains implementation details for an FileDescription. +// Implementations of FileDescriptionImpl should contain their associated +// FileDescription by value as their first field. +// +// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will +// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and +// auth.KGID respectively). +// +// FileDescriptionImpl is analogous to Linux's struct file_operations. +type FileDescriptionImpl interface { + // Release is called when the associated FileDescription reaches zero + // references. + Release() + + // OnClose is called when a file descriptor representing the + // FileDescription is closed. Note that returning a non-nil error does not + // prevent the file descriptor from being closed. + OnClose() error + + // StatusFlags returns file description status flags, as for + // fcntl(F_GETFL). + StatusFlags(ctx context.Context) (uint32, error) + + // SetStatusFlags sets file description status flags, as for + // fcntl(F_SETFL). + SetStatusFlags(ctx context.Context, flags uint32) error + + // Stat returns metadata for the file represented by the FileDescription. + Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) + + // SetStat updates metadata for the file represented by the + // FileDescription. + SetStat(ctx context.Context, opts SetStatOptions) error + + // StatFS returns metadata for the filesystem containing the file + // represented by the FileDescription. + StatFS(ctx context.Context) (linux.Statfs, error) + + // waiter.Waitable methods may be used to poll for I/O events. + waiter.Waitable + + // PRead reads from the file into dst, starting at the given offset, and + // returns the number of bytes read. PRead is permitted to return partial + // reads with a nil error. + PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) + + // Read is similar to PRead, but does not specify an offset. + // + // For files with an implicit FileDescription offset (e.g. regular files), + // Read begins at the FileDescription offset, and advances the offset by + // the number of bytes read; note that POSIX 2.9.7 "Thread Interactions + // with Regular File Operations" requires that all operations that may + // mutate the FileDescription offset are serialized. + Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) + + // PWrite writes src to the file, starting at the given offset, and returns + // the number of bytes written. PWrite is permitted to return partial + // writes with a nil error. + // + // As in Linux (but not POSIX), if O_APPEND is in effect for the + // FileDescription, PWrite should ignore the offset and append data to the + // end of the file. + PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) + + // Write is similar to PWrite, but does not specify an offset, which is + // implied as for Read. + // + // Write is a FileDescriptionImpl method, instead of a wrapper around + // PWrite that uses a FileDescription offset, to make it possible for + // remote filesystems to implement O_APPEND correctly (i.e. atomically with + // respect to writers outside the scope of VFS). + Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) + + // IterDirents invokes cb on each entry in the directory represented by the + // FileDescription. If IterDirents has been called since the last call to + // Seek, it continues iteration from the end of the last call. + IterDirents(ctx context.Context, cb IterDirentsCallback) error + + // Seek changes the FileDescription offset (assuming one exists) and + // returns its new value. + // + // For directories, if whence == SEEK_SET and offset == 0, the caller is + // rewinddir(), such that Seek "shall also cause the directory stream to + // refer to the current state of the corresponding directory" - + // POSIX.1-2017. + Seek(ctx context.Context, offset int64, whence int32) (int64, error) + + // Sync requests that cached state associated with the file represented by + // the FileDescription is synchronized with persistent storage, and blocks + // until this is complete. + Sync(ctx context.Context) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. Most + // implementations that support memory mapping can call + // GenericConfigureMMap with the appropriate memmap.Mappable. + ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) syscall. + Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) + + // TODO: extended attributes; file locking +} + +// Dirent holds the information contained in struct linux_dirent64. +type Dirent struct { + // Name is the filename. + Name string + + // Type is the file type, a linux.DT_* constant. + Type uint8 + + // Ino is the inode number. + Ino uint64 + + // Off is this Dirent's offset. + Off int64 +} + +// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents. +type IterDirentsCallback interface { + // Handle handles the given iterated Dirent. It returns true if iteration + // should continue, and false if FileDescriptionImpl.IterDirents should + // terminate now and restart with the same Dirent the next time it is + // called. + Handle(dirent Dirent) bool +} diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go new file mode 100644 index 000000000..486893e70 --- /dev/null +++ b/pkg/sentry/vfs/file_description_impl_util.go @@ -0,0 +1,142 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/usermem" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/waiter" +) + +// FileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl +// methods with default behavior analogous to Linux's. +type FileDescriptionDefaultImpl struct{} + +// OnClose implements FileDescriptionImpl.OnClose analogously to +// file_operations::flush == NULL in Linux. +func (FileDescriptionDefaultImpl) OnClose() error { + return nil +} + +// StatFS implements FileDescriptionImpl.StatFS analogously to +// super_operations::statfs == NULL in Linux. +func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { + return linux.Statfs{}, syserror.ENOSYS +} + +// Readiness implements waiter.Waitable.Readiness analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { + // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK + return waiter.EventIn | waiter.EventOut +} + +// EventRegister implements waiter.Waitable.EventRegister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { +} + +// EventUnregister implements waiter.Waitable.EventUnregister analogously to +// file_operations::poll == NULL in Linux. +func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { +} + +// PRead implements FileDescriptionImpl.PRead analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Read implements FileDescriptionImpl.Read analogously to +// file_operations::read == file_operations::read_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// PWrite implements FileDescriptionImpl.PWrite analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// Write implements FileDescriptionImpl.Write analogously to +// file_operations::write == file_operations::write_iter == NULL in Linux. +func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EINVAL +} + +// IterDirents implements FileDescriptionImpl.IterDirents analogously to +// file_operations::iterate == file_operations::iterate_shared == NULL in +// Linux. +func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { + return syserror.ENOTDIR +} + +// Seek implements FileDescriptionImpl.Seek analogously to +// file_operations::llseek == NULL in Linux. +func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { + return 0, syserror.ESPIPE +} + +// Sync implements FileDescriptionImpl.Sync analogously to +// file_operations::fsync == NULL in Linux. +func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { + return syserror.EINVAL +} + +// ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to +// file_operations::mmap == NULL in Linux. +func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements FileDescriptionImpl.Ioctl analogously to +// file_operations::unlocked_ioctl == NULL in Linux. +func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of +// FileDescriptionImpl that always represent directories to obtain +// implementations of non-directory I/O methods that return EISDIR, and +// implementations of other methods consistent with FileDescriptionDefaultImpl. +type DirectoryFileDescriptionDefaultImpl struct { + FileDescriptionDefaultImpl +} + +// PRead implements FileDescriptionImpl.PRead. +func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Read implements FileDescriptionImpl.Read. +func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// PWrite implements FileDescriptionImpl.PWrite. +func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileDescriptionImpl.Write. +func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + return 0, syserror.EISDIR +} diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go new file mode 100644 index 000000000..7a074b718 --- /dev/null +++ b/pkg/sentry/vfs/filesystem.go @@ -0,0 +1,155 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" +) + +// A Filesystem is a tree of nodes represented by Dentries, which forms part of +// a VirtualFilesystem. +// +// Filesystems are reference-counted. Unless otherwise specified, all +// Filesystem methods require that a reference is held. +// +// Filesystem is analogous to Linux's struct super_block. +type Filesystem struct { + // refs is the reference count. refs is accessed using atomic memory + // operations. + refs int64 + + // impl is the FilesystemImpl associated with this Filesystem. impl is + // immutable. This should be the last field in Dentry. + impl FilesystemImpl +} + +// Init must be called before first use of fs. +func (fs *Filesystem) Init(impl FilesystemImpl) { + fs.refs = 1 + fs.impl = impl +} + +// Impl returns the FilesystemImpl associated with fs. +func (fs *Filesystem) Impl() FilesystemImpl { + return fs.impl +} + +func (fs *Filesystem) incRef() { + if atomic.AddInt64(&fs.refs, 1) <= 1 { + panic("Filesystem.incRef() called without holding a reference") + } +} + +func (fs *Filesystem) decRef() { + if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 { + fs.impl.Release() + } else if refs < 0 { + panic("Filesystem.decRef() called without holding a reference") + } +} + +// FilesystemImpl contains implementation details for a Filesystem. +// Implementations of FilesystemImpl should contain their associated Filesystem +// by value as their first field. +// +// All methods that take a ResolvingPath must resolve the path before +// performing any other checks, including rejection of the operation if not +// supported by the FilesystemImpl. This is because the final FilesystemImpl +// (responsible for actually implementing the operation) isn't known until path +// resolution is complete. +// +// For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid +// should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID +// and auth.KGID respectively). +// +// FilesystemImpl combines elements of Linux's struct super_operations and +// struct inode_operations, for reasons described in the documentation for +// Dentry. +type FilesystemImpl interface { + // Release is called when the associated Filesystem reaches zero + // references. + Release() + + // Sync "causes all pending modifications to filesystem metadata and cached + // file data to be written to the underlying [filesystem]", as by syncfs(2). + Sync(ctx context.Context) error + + // GetDentryAt returns a Dentry representing the file at rp. A reference is + // taken on the returned Dentry. + // + // GetDentryAt does not correspond directly to a Linux syscall; it is used + // in the implementation of: + // + // - Syscalls that need to resolve two paths: rename(), renameat(), + // renameat2(), link(), linkat(). + // + // - Syscalls that need to refer to a filesystem position outside the + // context of a file description: chdir(), fchdir(), chroot(), mount(), + // umount(). + GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) + + // LinkAt creates a hard link at rp representing the same file as vd. It + // does not take ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(), and that vd does not represent a directory. + LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error + + // MkdirAt creates a directory at rp. + MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error + + // MknodAt creates a regular file, device special file, or named pipe at + // rp. + MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error + + // OpenAt returns an FileDescription providing access to the file at rp. A + // reference is taken on the returned FileDescription. + OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) + + // ReadlinkAt returns the target of the symbolic link at rp. + ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) + + // RenameAt renames the Dentry represented by vd to rp. It does not take + // ownership of references on vd. + // + // The implementation is responsible for checking that vd.Mount() == + // rp.Mount(). + RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error + + // RmdirAt removes the directory at rp. + RmdirAt(ctx context.Context, rp *ResolvingPath) error + + // SetStatAt updates metadata for the file at the given path. + SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error + + // StatAt returns metadata for the file at rp. + StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) + + // StatFSAt returns metadata for the filesystem containing the file at rp. + // (This method takes a path because a FilesystemImpl may consist of any + // number of constituent filesystems.) + StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) + + // SymlinkAt creates a symbolic link at rp referring to the given target. + SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error + + // UnlinkAt removes the non-directory file at rp. + UnlinkAt(ctx context.Context, rp *ResolvingPath) error + + // TODO: d_path(); extended attributes; inotify_add_watch(); bind() +} diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go new file mode 100644 index 000000000..f401ad7f3 --- /dev/null +++ b/pkg/sentry/vfs/filesystem_type.go @@ -0,0 +1,70 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" +) + +// A FilesystemType constructs filesystems. +// +// FilesystemType is analogous to Linux's struct file_system_type. +type FilesystemType interface { + // NewFilesystem returns a Filesystem configured by the given options, + // along with its mount root. A reference is taken on the returned + // Filesystem and Dentry. + NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) +} + +// NewFilesystemOptions contains options to FilesystemType.NewFilesystem. +type NewFilesystemOptions struct { + // Data is the string passed as the 5th argument to mount(2), which is + // usually a comma-separated list of filesystem-specific mount options. + Data string + + // InternalData holds opaque FilesystemType-specific data. There is + // intentionally no way for applications to specify InternalData; if it is + // not nil, the call to NewFilesystem originates from within the sentry. + InternalData interface{} +} + +// RegisterFilesystemType registers the given FilesystemType in vfs with the +// given name. +func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error { + vfs.fsTypesMu.Lock() + defer vfs.fsTypesMu.Unlock() + if existing, ok := vfs.fsTypes[name]; ok { + return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing) + } + vfs.fsTypes[name] = fsType + return nil +} + +// MustRegisterFilesystemType is equivalent to RegisterFilesystemType but +// panics on failure. +func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) { + if err := vfs.RegisterFilesystemType(name, fsType); err != nil { + panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) + } +} + +func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType { + vfs.fsTypesMu.RLock() + defer vfs.fsTypesMu.RUnlock() + return vfs.fsTypes[name] +} diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go new file mode 100644 index 000000000..11702f720 --- /dev/null +++ b/pkg/sentry/vfs/mount.go @@ -0,0 +1,411 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync/atomic" + + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem +// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem +// (Mount.fs), which applies to path resolution in the context of a particular +// Mount (Mount.key.parent). +// +// Mounts are reference-counted. Unless otherwise specified, all Mount methods +// require that a reference is held. +// +// Mount and Filesystem are distinct types because it's possible for a single +// Filesystem to be mounted at multiple locations and/or in multiple mount +// namespaces. +// +// Mount is analogous to Linux's struct mount. (gVisor does not distinguish +// between struct mount and struct vfsmount.) +type Mount struct { + // The lower 63 bits of refs are a reference count. The MSB of refs is set + // if the Mount has been eagerly unmounted, as by umount(2) without the + // MNT_DETACH flag. refs is accessed using atomic memory operations. + refs int64 + + // The lower 63 bits of writers is the number of calls to + // Mount.CheckBeginWrite() that have not yet been paired with a call to + // Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect. + // writers is accessed using atomic memory operations. + writers int64 + + // key is protected by VirtualFilesystem.mountMu and + // VirtualFilesystem.mounts.seq, and may be nil. References are held on + // key.parent and key.point if they are not nil. + // + // Invariant: key.parent != nil iff key.point != nil. key.point belongs to + // key.parent.fs. + key mountKey + + // fs, root, and ns are immutable. References are held on fs and root (but + // not ns). + // + // Invariant: root belongs to fs. + fs *Filesystem + root *Dentry + ns *MountNamespace +} + +// A MountNamespace is a collection of Mounts. +// +// MountNamespaces are reference-counted. Unless otherwise specified, all +// MountNamespace methods require that a reference is held. +// +// MountNamespace is analogous to Linux's struct mnt_namespace. +type MountNamespace struct { + refs int64 // accessed using atomic memory operations + + // root is the MountNamespace's root mount. root is immutable. + root *Mount + + // mountpoints contains all Dentries which are mount points in this + // namespace. mountpoints is protected by VirtualFilesystem.mountMu. + // + // mountpoints is used to determine if a Dentry can be moved or removed + // (which requires that the Dentry is not a mount point in the calling + // namespace). + // + // mountpoints is maintained even if there are no references held on the + // MountNamespace; this is required to ensure that + // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate + // correctly on unreferenced MountNamespaces. + mountpoints map[*Dentry]struct{} +} + +// NewMountNamespace returns a new mount namespace with a root filesystem +// configured by the given arguments. A reference is taken on the returned +// MountNamespace. +func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return nil, syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return nil, err + } + mntns := &MountNamespace{ + refs: 1, + mountpoints: make(map[*Dentry]struct{}), + } + mntns.root = &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + return mntns, nil +} + +// NewMount creates and mounts a new Filesystem. +func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error { + fsType := vfs.getFilesystemType(fsTypeName) + if fsType == nil { + return syserror.ENODEV + } + fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts) + if err != nil { + return err + } + // We can't hold vfs.mountMu while calling FilesystemImpl methods due to + // lock ordering. + vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) + if err != nil { + root.decRef(fs) + fs.decRef() + return err + } + vfs.mountMu.Lock() + for { + if vd.dentry.IsDisowned() { + vfs.mountMu.Unlock() + vd.DecRef() + root.decRef(fs) + fs.decRef() + return syserror.ENOENT + } + // vd might have been mounted over between vfs.GetDentryAt() and + // vfs.mountMu.Lock(). + if !vd.dentry.isMounted() { + break + } + nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry) + if nextmnt == nil { + break + } + nextmnt.incRef() + nextmnt.root.incRef(nextmnt.fs) + vd.DecRef() + vd = VirtualDentry{ + mount: nextmnt, + dentry: nextmnt.root, + } + } + // TODO: Linux requires that either both the mount point and the mount root + // are directories, or neither are, and returns ENOTDIR if this is not the + // case. + mntns := vd.mount.ns + mnt := &Mount{ + fs: fs, + root: root, + ns: mntns, + refs: 1, + } + mnt.storeKey(vd.mount, vd.dentry) + atomic.AddUint32(&vd.dentry.mounts, 1) + mntns.mountpoints[vd.dentry] = struct{}{} + vfsmpmounts, ok := vfs.mountpoints[vd.dentry] + if !ok { + vfsmpmounts = make(map[*Mount]struct{}) + vfs.mountpoints[vd.dentry] = vfsmpmounts + } + vfsmpmounts[mnt] = struct{}{} + vfs.mounts.Insert(mnt) + vfs.mountMu.Unlock() + return nil +} + +// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes +// a reference on the returned Mount. If (mnt, d) is not a mount point, +// getMountAt returns nil. +// +// getMountAt is analogous to Linux's fs/namei.c:follow_mount(). +// +// Preconditions: References are held on mnt and d. +func (vfs *VirtualFilesystem) getMountAt(mnt *Mount, d *Dentry) *Mount { + // The first mount is special-cased: + // + // - The caller is assumed to have checked d.isMounted() already. (This + // isn't a precondition because it doesn't matter for correctness.) + // + // - We return nil, instead of mnt, if there is no mount at (mnt, d). + // + // - We don't drop the caller's references on mnt and d. +retryFirst: + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + return nil + } + if !next.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + mnt = next + d = next.root + // We don't need to take Dentry refs anywhere in this function because + // Mounts hold references on Mount.root, which is immutable. + for d.isMounted() { + next := vfs.mounts.Lookup(mnt, d) + if next == nil { + break + } + if !next.tryIncMountedRef() { + // Raced with umount. + continue + } + mnt.decRef() + mnt = next + d = next.root + } + return mnt +} + +// getMountpointAt returns the mount point for the stack of Mounts including +// mnt. It takes a reference on the returned Mount and Dentry. If no such mount +// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil). +// +// Preconditions: References are held on mnt and root. vfsroot is not (mnt, +// mnt.root). +func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) { + // The first mount is special-cased: + // + // - The caller must have already checked mnt against vfsroot. + // + // - We return nil, instead of mnt, if there is no mount point for mnt. + // + // - We don't drop the caller's reference on mnt. +retryFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryFirst + } + if parent == nil { + return nil, nil + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can only + // happen due to a racing change to Mount.key. + parent.decRef() + goto retryFirst + } + mnt = parent + d := point + for { + if mnt == vfsroot.mount && d == vfsroot.dentry { + break + } + if d != mnt.root { + break + } + retryNotFirst: + epoch := vfs.mounts.seq.BeginRead() + parent, point := mnt.loadKey() + if !vfs.mounts.seq.ReadOk(epoch) { + goto retryNotFirst + } + if parent == nil { + break + } + if !parent.tryIncMountedRef() { + // Raced with umount. + goto retryNotFirst + } + if !point.tryIncRef(parent.fs) { + // Since Mount holds a reference on Mount.key.point, this can + // only happen due to a racing change to Mount.key. + parent.decRef() + goto retryNotFirst + } + if !vfs.mounts.seq.ReadOk(epoch) { + point.decRef(parent.fs) + parent.decRef() + goto retryNotFirst + } + d.decRef(mnt.fs) + mnt.decRef() + mnt = parent + d = point + } + return mnt, d +} + +// tryIncMountedRef increments mnt's reference count and returns true. If mnt's +// reference count is already zero, or has been eagerly unmounted, +// tryIncMountedRef does nothing and returns false. +// +// tryIncMountedRef does not require that a reference is held on mnt. +func (mnt *Mount) tryIncMountedRef() bool { + for { + refs := atomic.LoadInt64(&mnt.refs) + if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted + return false + } + if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) { + return true + } + } +} + +func (mnt *Mount) incRef() { + // In general, negative values for mnt.refs are valid because the MSB is + // the eager-unmount bit. + atomic.AddInt64(&mnt.refs, 1) +} + +func (mnt *Mount) decRef() { + refs := atomic.AddInt64(&mnt.refs, -1) + if refs&^math.MinInt64 == 0 { // mask out MSB + parent, point := mnt.loadKey() + if point != nil { + point.decRef(parent.fs) + parent.decRef() + } + mnt.root.decRef(mnt.fs) + mnt.fs.decRef() + } +} + +// CheckBeginWrite increments the counter of in-progress write operations on +// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns +// EROFS. +// +// If CheckBeginWrite succeeds, EndWrite must be called when the write +// operation is finished. +func (mnt *Mount) CheckBeginWrite() error { + if atomic.AddInt64(&mnt.writers, 1) < 0 { + atomic.AddInt64(&mnt.writers, -1) + return syserror.EROFS + } + return nil +} + +// EndWrite indicates that a write operation signaled by a previous successful +// call to CheckBeginWrite has finished. +func (mnt *Mount) EndWrite() { + atomic.AddInt64(&mnt.writers, -1) +} + +// Preconditions: VirtualFilesystem.mountMu must be locked for writing. +func (mnt *Mount) setReadOnlyLocked(ro bool) error { + if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { + return nil + } + if ro { + if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { + return syserror.EBUSY + } + return nil + } + // Unset MSB without dropping any temporary increments from failed calls to + // mnt.CheckBeginWrite(). + atomic.AddInt64(&mnt.writers, math.MinInt64) + return nil +} + +// Filesystem returns the mounted Filesystem. It does not take a reference on +// the returned Filesystem. +func (mnt *Mount) Filesystem() *Filesystem { + return mnt.fs +} + +// IncRef increments mntns' reference count. +func (mntns *MountNamespace) IncRef() { + if atomic.AddInt64(&mntns.refs, 1) <= 1 { + panic("MountNamespace.IncRef() called without holding a reference") + } +} + +// DecRef decrements mntns' reference count. +func (mntns *MountNamespace) DecRef() { + if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 { + // TODO: unmount mntns.root + } else if refs < 0 { + panic("MountNamespace.DecRef() called without holding a reference") + } +} + +// Root returns mntns' root. A reference is taken on the returned +// VirtualDentry. +func (mntns *MountNamespace) Root() VirtualDentry { + vd := VirtualDentry{ + mount: mntns.root, + dentry: mntns.root.root, + } + vd.IncRef() + return vd +} diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go new file mode 100644 index 000000000..f394d7483 --- /dev/null +++ b/pkg/sentry/vfs/mount_test.go @@ -0,0 +1,465 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "runtime" + "sync" + "testing" +) + +func TestMountTableLookupEmpty(t *testing.T) { + var mt mountTable + mt.Init() + + parent := &Mount{} + point := &Dentry{} + if m := mt.Lookup(parent, point); m != nil { + t.Errorf("empty mountTable lookup: got %p, wanted nil", m) + } +} + +func TestMountTableInsertLookup(t *testing.T) { + var mt mountTable + mt.Init() + + mount := &Mount{} + mount.storeKey(&Mount{}, &Dentry{}) + mt.Insert(mount) + + if m := mt.Lookup(mount.parent(), mount.point()); m != mount { + t.Errorf("mountTable positive lookup: got %p, wanted %p", m, mount) + } + + otherParent := &Mount{} + if m := mt.Lookup(otherParent, mount.point()); m != nil { + t.Errorf("mountTable lookup with wrong mount parent: got %p, wanted nil", m) + } + otherPoint := &Dentry{} + if m := mt.Lookup(mount.parent(), otherPoint); m != nil { + t.Errorf("mountTable lookup with wrong mount point: got %p, wanted nil", m) + } +} + +// TODO: concurrent lookup/insertion/removal + +// must be powers of 2 +var benchNumMounts = []int{1 << 2, 1 << 5, 1 << 8} + +// For all of the following: +// +// - BenchmarkMountTableFoo tests usage pattern "Foo" for mountTable. +// +// - BenchmarkMountMapFoo tests usage pattern "Foo" for a +// sync.RWMutex-protected map. (Mutator benchmarks do not use a RWMutex, since +// mountTable also requires external synchronization between mutators.) +// +// - BenchmarkMountSyncMapFoo tests usage pattern "Foo" for a sync.Map. +// +// ParallelLookup is by far the most common and performance-sensitive operation +// for this application. NegativeLookup is also important, but less so (only +// relevant with multiple mount namespaces and significant differences in +// mounts between them). Insertion and removal are benchmarked for +// completeness. +const enableComparativeBenchmarks = false + +func newBenchMount() *Mount { + mount := &Mount{} + mount.storeKey(&Mount{}, &Dentry{}) + return mount +} + +func vdkey(mnt *Mount) VirtualDentry { + parent, point := mnt.loadKey() + return VirtualDentry{ + mount: parent, + dentry: point, + } +} + +func BenchmarkMountTableParallelLookup(b *testing.B) { + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + mt.Insert(mount) + keys = append(keys, vdkey(mount)) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := vdkey(mount) + ms[key] = mount + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m == nil { + b.Fatalf("lookup failed") + } + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountSyncMapParallelLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%dx%d", numG, numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + keys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + key := vdkey(mount) + ms.Store(key, mount) + keys = append(keys, key) + } + + var ready sync.WaitGroup + begin := make(chan struct{}) + var end sync.WaitGroup + for g := 0; g < numG; g++ { + ready.Add(1) + end.Add(1) + go func() { + defer end.Done() + ready.Done() + <-begin + for i := 0; i < b.N; i++ { + k := keys[i&(numMounts-1)] + mi, ok := ms.Load(k) + if !ok { + b.Fatalf("lookup failed") + } + m := mi.(*Mount) + if parent := m.parent(); parent != k.mount { + b.Fatalf("lookup returned mount with parent %p, wanted %p", parent, k.mount) + } + if point := m.point(); point != k.dentry { + b.Fatalf("lookup returned mount with point %p, wanted %p", point, k.dentry) + } + } + }() + } + + ready.Wait() + b.ResetTimer() + close(begin) + end.Wait() + }) + } + } +} + +func BenchmarkMountTableNegativeLookup(b *testing.B) { + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mt mountTable + mt.Init() + for i := 0; i < numMounts; i++ { + mt.Insert(newBenchMount()) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m := mt.Lookup(k.mount, k.dentry) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var mu sync.RWMutex + ms := make(map[VirtualDentry]*Mount) + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + ms[vdkey(mount)] = mount + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + mu.RLock() + m := ms[k] + mu.RUnlock() + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountSyncMapNegativeLookup(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + for _, numMounts := range benchNumMounts { + desc := fmt.Sprintf("%d", numMounts) + b.Run(desc, func(b *testing.B) { + var ms sync.Map + for i := 0; i < numMounts; i++ { + mount := newBenchMount() + ms.Store(vdkey(mount), mount) + } + negkeys := make([]VirtualDentry, 0, numMounts) + for i := 0; i < numMounts; i++ { + negkeys = append(negkeys, VirtualDentry{ + mount: &Mount{}, + dentry: &Dentry{}, + }) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + k := negkeys[i&(numMounts-1)] + m, _ := ms.Load(k) + if m != nil { + b.Fatalf("lookup got %p, wanted nil", m) + } + } + }) + } +} + +func BenchmarkMountTableInsert(b *testing.B) { + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var mt mountTable + mt.Init() + b.ResetTimer() + for i := range mounts { + mt.Insert(mounts[i]) + } +} + +func BenchmarkMountMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + ms := make(map[VirtualDentry]*Mount) + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms[vdkey(mount)] = mount + } +} + +func BenchmarkMountSyncMapInsert(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + // Preallocate Mounts so that allocation time isn't included in the + // benchmark. + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + + var ms sync.Map + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Store(vdkey(mount), mount) + } +} + +func BenchmarkMountTableRemove(b *testing.B) { + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var mt mountTable + mt.Init() + for i := range mounts { + mt.Insert(mounts[i]) + } + + b.ResetTimer() + for i := range mounts { + mt.Remove(mounts[i]) + } +} + +func BenchmarkMountMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + ms := make(map[VirtualDentry]*Mount) + for i := range mounts { + mount := mounts[i] + ms[vdkey(mount)] = mount + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + delete(ms, vdkey(mount)) + } +} + +func BenchmarkMountSyncMapRemove(b *testing.B) { + if !enableComparativeBenchmarks { + b.Skipf("comparative benchmarks are disabled") + } + + mounts := make([]*Mount, 0, b.N) + for i := 0; i < b.N; i++ { + mounts = append(mounts, newBenchMount()) + } + var ms sync.Map + for i := range mounts { + mount := mounts[i] + ms.Store(vdkey(mount), mount) + } + + b.ResetTimer() + for i := range mounts { + mount := mounts[i] + ms.Delete(vdkey(mount)) + } +} diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go new file mode 100644 index 000000000..b0511aa40 --- /dev/null +++ b/pkg/sentry/vfs/mount_unsafe.go @@ -0,0 +1,356 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build go1.12 +// +build !go1.14 + +// Check go:linkname function signatures when updating Go version. + +package vfs + +import ( + "fmt" + "math/bits" + "reflect" + "sync/atomic" + "unsafe" + + "gvisor.dev/gvisor/third_party/gvsync" +) + +// mountKey represents the location at which a Mount is mounted. It is +// structurally identical to VirtualDentry, but stores its fields as +// unsafe.Pointer since mutators synchronize with VFS path traversal using +// seqcounts. +type mountKey struct { + parent unsafe.Pointer // *Mount + point unsafe.Pointer // *Dentry +} + +// Invariant: mnt.key's fields are nil. parent and point are non-nil. +func (mnt *Mount) storeKey(parent *Mount, point *Dentry) { + atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent)) + atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point)) +} + +func (mnt *Mount) loadKey() (*Mount, *Dentry) { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +func (mnt *Mount) parent() *Mount { + return (*Mount)(atomic.LoadPointer(&mnt.key.parent)) +} + +func (mnt *Mount) point() *Dentry { + return (*Dentry)(atomic.LoadPointer(&mnt.key.point)) +} + +// mountTable maps (mount parent, mount point) pairs to mounts. It supports +// efficient concurrent lookup, even in the presence of concurrent mutators +// (provided mutation is sufficiently uncommon). +// +// mountTable.Init() must be called on new mountTables before use. +type mountTable struct { + // mountTable is implemented as a seqcount-protected hash table that + // resolves collisions with linear probing, featuring Robin Hood insertion + // and backward shift deletion. These minimize probe length variance, + // significantly improving the performance of linear probing at high load + // factors. (mountTable doesn't use bucketing, which is the other major + // technique commonly used in high-performance hash tables; the efficiency + // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD + // intrinsics and inline assembly, limiting the performance of this + // approach.) + + seq gvsync.SeqCount + seed uint32 // for hashing keys + + // size holds both length (number of elements) and capacity (number of + // slots): capacity is stored as its base-2 log (referred to as order) in + // the least significant bits of size, and length is stored in the + // remaining bits. Go defines bit shifts >= width of shifted unsigned + // operand as shifting to 0, which differs from x86's SHL, so the Go + // compiler inserts a bounds check for each bit shift unless we mask order + // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; + // thus this bit packing gets us more bits for the length (vs. storing + // length and cap in separate uint32s) for ~free. + size uint64 + + slots unsafe.Pointer // []mountSlot; never nil after Init +} + +type mountSlot struct { + // We don't store keys in slots; instead, we just check Mount.parent and + // Mount.point directly. Any practical use of lookup will need to touch + // Mounts anyway, and comparing hashes means that false positives are + // extremely rare, so this isn't an extra cache line touch overall. + value unsafe.Pointer // *Mount + hash uintptr +} + +const ( + mtSizeOrderBits = 6 // log2 of pointer size in bits + mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 + mtSizeOrderOne = 1 + mtSizeLenLSB = mtSizeOrderBits + mtSizeLenOne = 1 << mtSizeLenLSB + mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB + + mountSlotBytes = unsafe.Sizeof(mountSlot{}) + mountKeyBytes = unsafe.Sizeof(mountKey{}) + + // Tuning parameters. + // + // Essentially every mountTable will contain at least /proc, /sys, and + // /dev/shm, so there is ~no reason for mtInitCap to be < 4. + mtInitOrder = 2 + mtInitCap = 1 << mtInitOrder + mtMaxLoadNum = 13 + mtMaxLoadDen = 16 +) + +func init() { + // We can't just define mtSizeOrderBits as follows because Go doesn't have + // constexpr. + if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { + panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) + } + if bits.OnesCount(uint(mountSlotBytes)) != 1 { + panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) + } + if mtInitCap <= 1 { + panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) + } + if mtMaxLoadNum >= mtMaxLoadDen { + panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) + } +} + +// Init must be called exactly once on each mountTable before use. +func (mt *mountTable) Init() { + mt.seed = rand32() + mt.size = mtInitOrder + mt.slots = newMountTableSlots(mtInitCap) +} + +func newMountTableSlots(cap uintptr) unsafe.Pointer { + slice := make([]mountSlot, cap, cap) + hdr := (*reflect.SliceHeader)(unsafe.Pointer(&slice)) + return unsafe.Pointer(hdr.Data) +} + +// Lookup returns the Mount with the given parent, mounted at the given point. +// If no such Mount exists, Lookup returns nil. +// +// Lookup may be called even if there are concurrent mutators of mt. +func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { + key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} + hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes) + +loop: + for { + epoch := mt.seq.BeginRead() + size := atomic.LoadUint64(&mt.size) + slots := atomic.LoadPointer(&mt.slots) + if !mt.seq.ReadOk(epoch) { + continue + } + tcap := uintptr(1) << (size & mtSizeOrderMask) + mask := tcap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + // This avoids bounds checking. + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := atomic.LoadPointer(&slot.value) + slotHash := atomic.LoadUintptr(&slot.hash) + if !mt.seq.ReadOk(epoch) { + // The element we're looking for might have been moved into a + // slot we've previously checked, so restart entirely. + continue loop + } + if slotValue == nil { + return nil + } + if slotHash == hash { + mount := (*Mount)(slotValue) + var mountKey mountKey + mountKey.parent = atomic.LoadPointer(&mount.key.parent) + mountKey.point = atomic.LoadPointer(&mount.key.point) + if !mt.seq.ReadOk(epoch) { + continue loop + } + if key == mountKey { + return mount + } + } + off = (off + mountSlotBytes) & offmask + } + } +} + +// Insert inserts the given mount into mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must not already +// contain a Mount with the same mount point and parent. +func (mt *mountTable) Insert(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + + // We're under the maximum load factor if: + // + // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen + // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap + tlen := mt.size >> mtSizeLenLSB + order := mt.size & mtSizeOrderMask + tcap := uintptr(1) << order + if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { + // Atomically insert the new element into the table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne) + mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) + mt.seq.EndWrite() + return + } + + // Otherwise, we have to expand. Double the number of slots in the new + // table. + newOrder := order + 1 + if newOrder > mtSizeOrderMask { + panic("mount table size overflow") + } + newCap := uintptr(1) << newOrder + newSlots := newMountTableSlots(newCap) + // Copy existing elements to the new table. + oldCur := mt.slots + // Go does not permit pointers to the end of allocated objects, so we + // must use a pointer to the last element of the old table. The + // following expression is equivalent to + // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 + // arithmetic instructions instead of 3. + oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) + for { + oldSlot := (*mountSlot)(oldCur) + if oldSlot.value != nil { + // Don't need to lock mt.seq yet since newSlots isn't visible + // to readers. + mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) + } + if oldCur == oldLast { + break + } + oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) + } + // Insert the new element into the new table. + mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) + // Atomically switch to the new table. + mt.seq.BeginWrite() + atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) + atomic.StorePointer(&mt.slots, newSlots) + mt.seq.EndWrite() +} + +// Preconditions: There are no concurrent mutators of the table (slots, cap). +// If the table is visible to readers, then mt.seq must be in a writer critical +// section. cap must be a power of 2. +func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { + mask := cap - 1 + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + disp := uintptr(0) + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == nil { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + return + } + // If we've been displaced farther from our first-probed slot than the + // element stored in this one, swap elements and switch to inserting + // the replaced one. (This is Robin Hood insertion.) + slotHash := slot.hash + slotDisp := ((off / mountSlotBytes) - slotHash) & mask + if disp > slotDisp { + atomic.StorePointer(&slot.value, value) + atomic.StoreUintptr(&slot.hash, hash) + value = slotValue + hash = slotHash + disp = slotDisp + } + off = (off + mountSlotBytes) & offmask + disp++ + } +} + +// Remove removes the given mount from mt. +// +// Preconditions: There are no concurrent mutators of mt. mt must contain +// mount. +func (mt *mountTable) Remove(mount *Mount) { + hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes) + tcap := uintptr(1) << (mt.size & mtSizeOrderMask) + mask := tcap - 1 + slots := mt.slots + off := (hash & mask) * mountSlotBytes + offmask := mask * mountSlotBytes + for { + slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) + slotValue := slot.value + if slotValue == unsafe.Pointer(mount) { + // Found the element to remove. Move all subsequent elements + // backward until we either find an empty slot, or an element that + // is already in its first-probed slot. (This is backward shift + // deletion.) + mt.seq.BeginWrite() + for { + nextOff := (off + mountSlotBytes) & offmask + nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) + nextSlotValue := nextSlot.value + if nextSlotValue == nil { + break + } + nextSlotHash := nextSlot.hash + if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { + break + } + atomic.StorePointer(&slot.value, nextSlotValue) + atomic.StoreUintptr(&slot.hash, nextSlotHash) + off = nextOff + slot = nextSlot + } + atomic.StorePointer(&slot.value, nil) + atomic.AddUint64(&mt.size, mtSizeLenNegOne) + mt.seq.EndWrite() + return + } + if checkInvariants && slotValue == nil { + panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) + } + off = (off + mountSlotBytes) & offmask + } +} + +//go:linkname memhash runtime.memhash +func memhash(p unsafe.Pointer, seed, s uintptr) uintptr + +//go:linkname rand32 runtime.fastrand +func rand32() uint32 + +// This is copy/pasted from runtime.noescape(), and is needed because arguments +// apparently escape from all functions defined by linkname. +// +//go:nosplit +func noescape(p unsafe.Pointer) unsafe.Pointer { + x := uintptr(p) + return unsafe.Pointer(x ^ 0) +} diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go new file mode 100644 index 000000000..187e5410c --- /dev/null +++ b/pkg/sentry/vfs/options.go @@ -0,0 +1,123 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" +) + +// GetDentryOptions contains options to VirtualFilesystem.GetDentryAt() and +// FilesystemImpl.GetDentryAt(). +type GetDentryOptions struct { + // If CheckSearchable is true, FilesystemImpl.GetDentryAt() must check that + // the returned Dentry is a directory for which creds has search + // permission. + CheckSearchable bool +} + +// MkdirOptions contains options to VirtualFilesystem.MkdirAt() and +// FilesystemImpl.MkdirAt(). +type MkdirOptions struct { + // Mode is the file mode bits for the created directory. + Mode uint16 +} + +// MknodOptions contains options to VirtualFilesystem.MknodAt() and +// FilesystemImpl.MknodAt(). +type MknodOptions struct { + // Mode is the file type and mode bits for the created file. + Mode uint16 + + // If Mode specifies a character or block device special file, DevMajor and + // DevMinor are the major and minor device numbers for the created device. + DevMajor uint32 + DevMinor uint32 +} + +// OpenOptions contains options to VirtualFilesystem.OpenAt() and +// FilesystemImpl.OpenAt(). +type OpenOptions struct { + // Flags contains access mode and flags as specified for open(2). + // + // FilesystemImpls is reponsible for implementing the following flags: + // O_RDONLY, O_WRONLY, O_RDWR, O_APPEND, O_CREAT, O_DIRECT, O_DSYNC, + // O_EXCL, O_NOATIME, O_NOCTTY, O_NONBLOCK, O_PATH, O_SYNC, O_TMPFILE, and + // O_TRUNC. VFS is responsible for handling O_DIRECTORY, O_LARGEFILE, and + // O_NOFOLLOW. VFS users are responsible for handling O_CLOEXEC, since file + // descriptors are mostly outside the scope of VFS. + Flags uint32 + + // If FilesystemImpl.OpenAt() creates a file, Mode is the file mode for the + // created file. + Mode uint16 +} + +// ReadOptions contains options to FileDescription.PRead(), +// FileDescriptionImpl.PRead(), FileDescription.Read(), and +// FileDescriptionImpl.Read(). +type ReadOptions struct { + // Flags contains flags as specified for preadv2(2). + Flags uint32 +} + +// RenameOptions contains options to VirtualFilesystem.RenameAt() and +// FilesystemImpl.RenameAt(). +type RenameOptions struct { + // Flags contains flags as specified for renameat2(2). + Flags uint32 +} + +// SetStatOptions contains options to VirtualFilesystem.SetStatAt(), +// FilesystemImpl.SetStatAt(), FileDescription.SetStat(), and +// FileDescriptionImpl.SetStat(). +type SetStatOptions struct { + // Stat is the metadata that should be set. Only fields indicated by + // Stat.Mask should be set. + // + // If Stat specifies that a timestamp should be set, + // FilesystemImpl.SetStatAt() and FileDescriptionImpl.SetStat() must + // special-case StatxTimestamp.Nsec == UTIME_NOW as described by + // utimensat(2); however, they do not need to check for StatxTimestamp.Nsec + // == UTIME_OMIT (VFS users must unset the corresponding bit in Stat.Mask + // instead). + Stat linux.Statx +} + +// StatOptions contains options to VirtualFilesystem.StatAt(), +// FilesystemImpl.StatAt(), FileDescription.Stat(), and +// FileDescriptionImpl.Stat(). +type StatOptions struct { + // Mask is the set of fields in the returned Statx that the FilesystemImpl + // or FileDescriptionImpl should provide. Bits are as in linux.Statx.Mask. + // + // The FilesystemImpl or FileDescriptionImpl may return fields not + // requested in Mask, and may fail to return fields requested in Mask that + // are not supported by the underlying filesystem implementation, without + // returning an error. + Mask uint32 + + // Sync specifies the synchronization required, and is one of + // linux.AT_STATX_SYNC_AS_STAT (which is 0, and therefore the default), + // linux.AT_STATX_SYNC_FORCE_SYNC, or linux.AT_STATX_SYNC_DONT_SYNC. + Sync uint32 +} + +// WriteOptions contains options to FileDescription.PWrite(), +// FileDescriptionImpl.PWrite(), FileDescription.Write(), and +// FileDescriptionImpl.Write(). +type WriteOptions struct { + // Flags contains flags as specified for pwritev2(2). + Flags uint32 +} diff --git a/pkg/sentry/vfs/permissions.go b/pkg/sentry/vfs/permissions.go new file mode 100644 index 000000000..f8e74355c --- /dev/null +++ b/pkg/sentry/vfs/permissions.go @@ -0,0 +1,121 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// AccessTypes is a bitmask of Unix file permissions. +type AccessTypes uint16 + +// Bits in AccessTypes. +const ( + MayRead AccessTypes = 4 + MayWrite = 2 + MayExec = 1 +) + +// GenericCheckPermissions checks that creds has the given access rights on a +// file with the given permissions, UID, and GID, subject to the rules of +// fs/namei.c:generic_permission(). isDir is true if the file is a directory. +func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, isDir bool, mode uint16, kuid auth.KUID, kgid auth.KGID) error { + // Check permission bits. + perms := mode + if creds.EffectiveKUID == kuid { + perms >>= 6 + } else if creds.InGroup(kgid) { + perms >>= 3 + } + if uint16(ats)&perms == uint16(ats) { + return nil + } + + // Caller capabilities require that the file's KUID and KGID are mapped in + // the caller's user namespace; compare + // kernel/capability.c:privileged_wrt_inode_uidgid(). + if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { + return syserror.EACCES + } + // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary + // directories, and read arbitrary non-directory files. + if (isDir && (ats&MayWrite == 0)) || ats == MayRead { + if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { + return nil + } + } + // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write + // access to non-directory files, and execute access to non-directory files + // for which at least one execute bit is set. + if isDir || (ats&MayExec == 0) || (mode&0111 != 0) { + if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { + return nil + } + } + return syserror.EACCES +} + +// AccessTypesForOpenFlags returns the access types required to open a file +// with the given OpenOptions.Flags. Note that this is NOT the same thing as +// the set of accesses permitted for the opened file: +// +// - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it +// mutates the file), but does not permit the opened to write to the file +// thereafter. +// +// - "Linux reserves the special, nonstandard access mode 3 (binary 11) in +// flags to mean: check for read and write permission on the file and return a +// file descriptor that can't be used for reading or writing." - open(2). Thus +// AccessTypesForOpenFlags returns MayRead|MayWrite in this case, but +// filesystems are responsible for ensuring that access is denied. +// +// Use May{Read,Write}FileWithOpenFlags() for these checks instead. +func AccessTypesForOpenFlags(flags uint32) AccessTypes { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY: + if flags&linux.O_TRUNC != 0 { + return MayRead | MayWrite + } + return MayRead + case linux.O_WRONLY: + return MayWrite + default: + return MayRead | MayWrite + } +} + +// MayReadFileWithOpenFlags returns true if a file with the given open flags +// should be readable. +func MayReadFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_RDONLY, linux.O_RDWR: + return true + default: + return false + } +} + +// MayWriteFileWithOpenFlags returns true if a file with the given open flags +// should be writable. +func MayWriteFileWithOpenFlags(flags uint32) bool { + switch flags & linux.O_ACCMODE { + case linux.O_WRONLY, linux.O_RDWR: + return true + default: + return false + } +} diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go new file mode 100644 index 000000000..8d05c8583 --- /dev/null +++ b/pkg/sentry/vfs/resolving_path.go @@ -0,0 +1,453 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "fmt" + "sync" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// ResolvingPath represents the state of an in-progress path resolution, shared +// between VFS and FilesystemImpl methods that take a path. +// +// From the perspective of FilesystemImpl methods, a ResolvingPath represents a +// starting Dentry on the associated Filesystem (on which a reference is +// already held) and a stream of path components relative to that Dentry. +// +// ResolvingPath is loosely analogous to Linux's struct nameidata. +type ResolvingPath struct { + vfs *VirtualFilesystem + root VirtualDentry // refs borrowed from PathOperation + mount *Mount + start *Dentry + pit fspath.Iterator + + flags uint16 + mustBeDir bool // final file must be a directory? + mustBeDirOrig bool + symlinks uint8 // number of symlinks traversed + symlinksOrig uint8 + curPart uint8 // index into parts + numOrigParts uint8 + + creds *auth.Credentials + + // Data associated with resolve*Errors, stored in ResolvingPath so that + // those errors don't need to allocate. + nextMount *Mount // ref held if not nil + nextStart *Dentry // ref held if not nil + absSymlinkTarget fspath.Path + + // ResolvingPath must track up to two relative paths: the "current" + // relative path, which is updated whenever a relative symlink is + // encountered, and the "original" relative path, which is updated from the + // current relative path by handleError() when resolution must change + // filesystems (due to reaching a mount boundary or absolute symlink) and + // overwrites the current relative path when Restart() is called. + parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator + origParts [1 + linux.MaxSymlinkTraversals]fspath.Iterator +} + +const ( + rpflagsHaveMountRef = 1 << iota // do we hold a reference on mount? + rpflagsHaveStartRef // do we hold a reference on start? + rpflagsFollowFinalSymlink // same as PathOperation.FollowFinalSymlink +) + +func init() { + if maxParts := len(ResolvingPath{}.parts); maxParts > 255 { + panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts)) + } +} + +// Error types that communicate state from the FilesystemImpl-caller, +// VFS-callee side of path resolution (i.e. errors returned by +// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side +// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs +// rather than error values because Go doesn't support non-primitive constants, +// so error "constants" are really mutable vars, necessitating somewhat +// expensive interface object comparisons. + +type resolveMountRootError struct{} + +// Error implements error.Error. +func (resolveMountRootError) Error() string { + return "resolving mount root" +} + +type resolveMountPointError struct{} + +// Error implements error.Error. +func (resolveMountPointError) Error() string { + return "resolving mount point" +} + +type resolveAbsSymlinkError struct{} + +// Error implements error.Error. +func (resolveAbsSymlinkError) Error() string { + return "resolving absolute symlink" +} + +var resolvingPathPool = sync.Pool{ + New: func() interface{} { + return &ResolvingPath{} + }, +} + +func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) { + path, err := fspath.Parse(pop.Pathname) + if err != nil { + return nil, err + } + rp := resolvingPathPool.Get().(*ResolvingPath) + rp.vfs = vfs + rp.root = pop.Root + rp.mount = pop.Start.mount + rp.start = pop.Start.dentry + rp.pit = path.Begin + rp.flags = 0 + if pop.FollowFinalSymlink { + rp.flags |= rpflagsFollowFinalSymlink + } + rp.mustBeDir = path.Dir + rp.mustBeDirOrig = path.Dir + rp.symlinks = 0 + rp.curPart = 0 + rp.numOrigParts = 1 + rp.creds = creds + rp.parts[0] = path.Begin + rp.origParts[0] = path.Begin + return rp, nil +} + +func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) { + rp.root = VirtualDentry{} + rp.decRefStartAndMount() + rp.mount = nil + rp.start = nil + rp.releaseErrorState() + resolvingPathPool.Put(rp) +} + +func (rp *ResolvingPath) decRefStartAndMount() { + if rp.flags&rpflagsHaveStartRef != 0 { + rp.start.decRef(rp.mount.fs) + } + if rp.flags&rpflagsHaveMountRef != 0 { + rp.mount.decRef() + } +} + +func (rp *ResolvingPath) releaseErrorState() { + if rp.nextStart != nil { + rp.nextStart.decRef(rp.nextMount.fs) + rp.nextStart = nil + } + if rp.nextMount != nil { + rp.nextMount.decRef() + rp.nextMount = nil + } +} + +// VirtualFilesystem returns the containing VirtualFilesystem. +func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { + return rp.vfs +} + +// Credentials returns the credentials of rp's provider. +func (rp *ResolvingPath) Credentials() *auth.Credentials { + return rp.creds +} + +// Mount returns the Mount on which path resolution is currently occurring. It +// does not take a reference on the returned Mount. +func (rp *ResolvingPath) Mount() *Mount { + return rp.mount +} + +// Start returns the starting Dentry represented by rp. It does not take a +// reference on the returned Dentry. +func (rp *ResolvingPath) Start() *Dentry { + return rp.start +} + +// Done returns true if there are no remaining path components in the stream +// represented by rp. +func (rp *ResolvingPath) Done() bool { + // We don't need to check for rp.curPart == 0 because rp.Advance() won't + // set rp.pit to a terminal iterator otherwise. + return !rp.pit.Ok() +} + +// Final returns true if there is exactly one remaining path component in the +// stream represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Final() bool { + return rp.curPart == 0 && !rp.pit.NextOk() +} + +// Component returns the current path component in the stream represented by +// rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Component() string { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Component() called at end of relative path") + } + } + return rp.pit.String() +} + +// Advance advances the stream of path components represented by rp. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) Advance() { + if checkInvariants { + if !rp.pit.Ok() { + panic("ResolvingPath.Advance() called at end of relative path") + } + } + next := rp.pit.Next() + if next.Ok() || rp.curPart == 0 { // have next component, or at end of path + rp.pit = next + } else { // at end of path segment, continue with next one + rp.curPart-- + rp.pit = rp.parts[rp.curPart-1] + } +} + +// Restart resets the stream of path components represented by rp to its state +// on entry to the current FilesystemImpl method. +func (rp *ResolvingPath) Restart() { + rp.pit = rp.origParts[rp.numOrigParts-1] + rp.mustBeDir = rp.mustBeDirOrig + rp.symlinks = rp.symlinksOrig + rp.curPart = rp.numOrigParts - 1 + copy(rp.parts[:], rp.origParts[:rp.numOrigParts]) + rp.releaseErrorState() +} + +func (rp *ResolvingPath) relpathCommit() { + rp.mustBeDirOrig = rp.mustBeDir + rp.symlinksOrig = rp.symlinks + rp.numOrigParts = rp.curPart + 1 + copy(rp.origParts[:rp.curPart], rp.parts[:]) + rp.origParts[rp.curPart] = rp.pit +} + +// ResolveParent returns the VFS parent of d. It does not take a reference on +// the returned Dentry. +// +// Preconditions: There are no concurrent mutators of d. +// +// Postconditions: If the returned error is nil, then the returned Dentry is +// not nil. +func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) { + var parent *Dentry + if d == rp.root.dentry && rp.mount == rp.root.mount { + // At contextual VFS root. + parent = d + } else if d == rp.mount.root { + // At mount root ... + mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root) + if mnt != nil { + // ... of non-root mount. + rp.nextMount = mnt + rp.nextStart = mntpt + return nil, resolveMountRootError{} + } + // ... of root mount. + parent = d + } else if d.parent == nil { + // At filesystem root. + parent = d + } else { + parent = d.parent + } + if parent.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, parent); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return parent, nil +} + +// ResolveChild returns the VFS child of d with the given name. It does not +// take a reference on the returned Dentry. If no such child exists, +// ResolveChild returns (nil, nil). +// +// Preconditions: There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveChild(d *Dentry, name string) (*Dentry, error) { + child := d.children[name] + if child == nil { + return nil, nil + } + if child.isMounted() { + if mnt := rp.vfs.getMountAt(rp.mount, child); mnt != nil { + rp.nextMount = mnt + return nil, resolveMountPointError{} + } + } + return child, nil +} + +// ResolveComponent returns the Dentry reached by starting at d and resolving +// the current path component in the stream represented by rp. It does not +// advance the stream. It does not take a reference on the returned Dentry. If +// no such Dentry exists, ResolveComponent returns (nil, nil). +// +// Preconditions: !rp.Done(). There are no concurrent mutators of d. +func (rp *ResolvingPath) ResolveComponent(d *Dentry) (*Dentry, error) { + switch pc := rp.Component(); pc { + case ".": + return d, nil + case "..": + return rp.ResolveParent(d) + default: + return rp.ResolveChild(d, pc) + } +} + +// ShouldFollowSymlink returns true if, supposing that the current path +// component in pcs represents a symbolic link, the symbolic link should be +// followed. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) ShouldFollowSymlink() bool { + // Non-final symlinks are always followed. + return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() +} + +// HandleSymlink is called when the current path component is a symbolic link +// to the given target. If the calling Filesystem method should continue path +// traversal, HandleSymlink updates the path component stream to reflect the +// symlink target and returns nil. Otherwise it returns a non-nil error. +// +// Preconditions: !rp.Done(). +func (rp *ResolvingPath) HandleSymlink(target string) error { + if rp.symlinks >= linux.MaxSymlinkTraversals { + return syserror.ELOOP + } + targetPath, err := fspath.Parse(target) + if err != nil { + return err + } + rp.symlinks++ + if targetPath.Absolute { + rp.absSymlinkTarget = targetPath + return resolveAbsSymlinkError{} + } + if !targetPath.Begin.Ok() { + panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target)) + } + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(targetPath) + return nil +} + +func (rp *ResolvingPath) relpathPrepend(path fspath.Path) { + if rp.pit.Ok() { + rp.parts[rp.curPart] = rp.pit + rp.pit = path.Begin + rp.curPart++ + } else { + // The symlink was the final path component, so now the symlink target + // is the whole path. + rp.pit = path.Begin + // Symlink targets can set rp.mustBeDir (if they end in a trailing /), + // but can't unset it. + if path.Dir { + rp.mustBeDir = true + } + } +} + +func (rp *ResolvingPath) handleError(err error) bool { + switch err.(type) { + case resolveMountRootError: + // Switch to the new Mount. We hold references on the Mount and Dentry + // (from VFS.getMountpointAt()). + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextStart + rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef + rp.nextMount = nil + rp.nextStart = nil + // Commit the previous FileystemImpl's progress through the relative + // path. (Don't consume the path component that caused us to traverse + // through the mount root - i.e. the ".." - because we still need to + // resolve the mount point's parent in the new FilesystemImpl.) + rp.relpathCommit() + // Restart path resolution on the new Mount. Don't bother calling + // rp.releaseErrorState() since we already set nextMount and nextStart + // to nil above. + return true + + case resolveMountPointError: + // Switch to the new Mount. We hold a reference on the Mount (from + // VFS.getMountAt()), but borrow the reference on the mount root from + // the Mount. + rp.decRefStartAndMount() + rp.mount = rp.nextMount + rp.start = rp.nextMount.root + rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef + rp.nextMount = nil + // Consume the path component that represented the mount point. + rp.Advance() + // Commit the previous FilesystemImpl's progress through the relative + // path. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + case resolveAbsSymlinkError: + // Switch to the new Mount. References are borrowed from rp.root. + rp.decRefStartAndMount() + rp.mount = rp.root.mount + rp.start = rp.root.dentry + rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef + // Consume the path component that represented the symlink. + rp.Advance() + // Prepend the symlink target to the relative path. + rp.relpathPrepend(rp.absSymlinkTarget) + // Commit the previous FilesystemImpl's progress through the relative + // path, including the symlink target we just prepended. + rp.relpathCommit() + // Restart path resolution on the new Mount. + rp.releaseErrorState() + return true + + default: + // Not an error we can handle. + return false + } +} + +// MustBeDir returns true if the file traversed by rp must be a directory. +func (rp *ResolvingPath) MustBeDir() bool { + return rp.mustBeDir +} diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go new file mode 100644 index 000000000..23f2b9e08 --- /dev/null +++ b/pkg/sentry/vfs/syscalls.go @@ -0,0 +1,217 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/context" + "gvisor.dev/gvisor/pkg/sentry/kernel/auth" + "gvisor.dev/gvisor/pkg/syserror" +) + +// PathOperation specifies the path operated on by a VFS method. +// +// PathOperation is passed to VFS methods by pointer to reduce memory copying: +// it's somewhat large and should never escape. (Options structs are passed by +// pointer to VFS and FileDescription methods for the same reason.) +type PathOperation struct { + // Root is the VFS root. References on Root are borrowed from the provider + // of the PathOperation. + // + // Invariants: Root.Ok(). + Root VirtualDentry + + // Start is the starting point for the path traversal. References on Start + // are borrowed from the provider of the PathOperation (i.e. the caller of + // the VFS method to which the PathOperation was passed). + // + // Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root. + Start VirtualDentry + + // Path is the pathname traversed by this operation. + Pathname string + + // If FollowFinalSymlink is true, and the Dentry traversed by the final + // path component represents a symbolic link, the symbolic link should be + // followed. + FollowFinalSymlink bool +} + +// GetDentryAt returns a VirtualDentry representing the given path, at which a +// file must exist. A reference is taken on the returned VirtualDentry. +func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return VirtualDentry{}, err + } + for { + d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) + if err == nil { + vd := VirtualDentry{ + mount: rp.mount, + dentry: d, + } + rp.mount.incRef() + vfs.putResolvingPath(rp) + return vd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return VirtualDentry{}, err + } + } +} + +// MkdirAt creates a directory at the given path. +func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { + // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is + // also honored." - mkdir(2) + opts.Mode &= 01777 + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return err + } + for { + err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return err + } + } +} + +// OpenAt returns a FileDescription providing access to the file at the given +// path. A reference is taken on the returned FileDescription. +func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { + // Remove: + // + // - O_LARGEFILE, which we always report in FileDescription status flags + // since only 64-bit architectures are supported at this time. + // + // - O_CLOEXEC, which affects file descriptors and therefore must be + // handled outside of VFS. + // + // - Unknown flags. + opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE + // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. + if opts.Flags&linux.O_SYNC != 0 { + opts.Flags |= linux.O_DSYNC + } + // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified + // with O_DIRECTORY and a writable access mode (to ensure that it fails on + // filesystem implementations that do not support it). + if opts.Flags&linux.O_TMPFILE != 0 { + if opts.Flags&linux.O_DIRECTORY == 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_CREAT != 0 { + return nil, syserror.EINVAL + } + if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { + return nil, syserror.EINVAL + } + } + // O_PATH causes most other flags to be ignored. + if opts.Flags&linux.O_PATH != 0 { + opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH + } + // "On Linux, the following bits are also honored in mode: [S_ISUID, + // S_ISGID, S_ISVTX]" - open(2) + opts.Mode &= 07777 + + if opts.Flags&linux.O_NOFOLLOW != 0 { + pop.FollowFinalSymlink = false + } + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return nil, err + } + if opts.Flags&linux.O_DIRECTORY != 0 { + rp.mustBeDir = true + rp.mustBeDirOrig = true + } + for { + fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return fd, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return nil, err + } + } +} + +// StatAt returns metadata for the file at the given path. +func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { + rp, err := vfs.getResolvingPath(creds, pop) + if err != nil { + return linux.Statx{}, err + } + for { + stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) + if err == nil { + vfs.putResolvingPath(rp) + return stat, nil + } + if !rp.handleError(err) { + vfs.putResolvingPath(rp) + return linux.Statx{}, err + } + } +} + +// StatusFlags returns file description status flags. +func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) { + flags, err := fd.impl.StatusFlags(ctx) + flags |= linux.O_LARGEFILE + return flags, err +} + +// SetStatusFlags sets file description status flags. +func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error { + return fd.impl.SetStatusFlags(ctx, flags) +} + +// TODO: +// +// - VFS.SyncAllFilesystems() for sync(2) +// +// - Something for syncfs(2) +// +// - VFS.LinkAt() +// +// - VFS.MknodAt() +// +// - VFS.ReadlinkAt() +// +// - VFS.RenameAt() +// +// - VFS.RmdirAt() +// +// - VFS.SetStatAt() +// +// - VFS.StatFSAt() +// +// - VFS.SymlinkAt() +// +// - VFS.UnlinkAt() +// +// - FileDescription.(almost everything) diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go new file mode 100644 index 000000000..4a8a69540 --- /dev/null +++ b/pkg/sentry/vfs/vfs.go @@ -0,0 +1,135 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package vfs implements a virtual filesystem layer. +// +// Lock order: +// +// Filesystem implementation locks +// VirtualFilesystem.mountMu +// VirtualFilesystem.fsTypesMu +package vfs + +import ( + "sync" +) + +// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. +// +// There is no analogue to the VirtualFilesystem type in Linux, as the +// equivalent state in Linux is global. +type VirtualFilesystem struct { + // mountMu serializes mount mutations. + // + // mountMu is analogous to Linux's namespace_sem. + mountMu sync.RWMutex + + // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts + // are uniquely namespaced, including mount parent in the key correctly + // handles both bind mounts and mount namespaces; Linux does the same.) + // Synchronization between mutators and readers is provided by mounts.seq; + // synchronization between mutators is provided by mountMu. + // + // mounts is used to follow mount points during path traversal. We use a + // single table rather than per-Dentry tables to reduce size (and therefore + // cache footprint) for the vast majority of Dentries that are not mount + // points. + // + // mounts is analogous to Linux's mount_hashtable. + mounts mountTable + + // mountpoints maps mount points to mounts at those points in all + // namespaces. mountpoints is protected by mountMu. + // + // mountpoints is used to find mounts that must be unmounted due to + // removal of a mount point Dentry from another mount namespace. ("A file + // or directory that is a mount point in one namespace that is not a mount + // point in another namespace, may be renamed, unlinked, or removed + // (rmdir(2)) in the mount namespace in which it is not a mount point + // (subject to the usual permission checks)." - mount_namespaces(7)) + // + // mountpoints is analogous to Linux's mountpoint_hashtable. + mountpoints map[*Dentry]map[*Mount]struct{} + + // fsTypes contains all FilesystemTypes that are usable in the + // VirtualFilesystem. fsTypes is protected by fsTypesMu. + fsTypesMu sync.RWMutex + fsTypes map[string]FilesystemType +} + +// New returns a new VirtualFilesystem with no mounts or FilesystemTypes. +func New() *VirtualFilesystem { + vfs := &VirtualFilesystem{ + mountpoints: make(map[*Dentry]map[*Mount]struct{}), + fsTypes: make(map[string]FilesystemType), + } + vfs.mounts.Init() + return vfs +} + +// A VirtualDentry represents a node in a VFS tree, by combining a Dentry +// (which represents a node in a Filesystem's tree) and a Mount (which +// represents the Filesystem's position in a VFS mount tree). +// +// VirtualDentry's semantics are similar to that of a Go interface object +// representing a pointer: it is a copyable value type that represents +// references to another entity. The zero value of VirtualDentry is an "empty +// VirtualDentry", directly analogous to a nil interface object. +// VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless +// otherwise specified, all other VirtualDentry methods require +// VirtualDentry.Ok() == true. +// +// Mounts and Dentries are reference-counted, requiring that users call +// VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to +// references on the Mount and Dentry referred to by a VirtualDentry as +// references on the VirtualDentry itself. Unless otherwise specified, all +// VirtualDentry methods require that a reference is held on the VirtualDentry. +// +// VirtualDentry is analogous to Linux's struct path. +type VirtualDentry struct { + mount *Mount + dentry *Dentry +} + +// Ok returns true if vd is not empty. It does not require that a reference is +// held. +func (vd VirtualDentry) Ok() bool { + return vd.mount != nil +} + +// IncRef increments the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) IncRef() { + vd.mount.incRef() + vd.dentry.incRef(vd.mount.fs) +} + +// DecRef decrements the reference counts on the Mount and Dentry represented +// by vd. +func (vd VirtualDentry) DecRef() { + vd.dentry.decRef(vd.mount.fs) + vd.mount.decRef() +} + +// Mount returns the Mount associated with vd. It does not take a reference on +// the returned Mount. +func (vd VirtualDentry) Mount() *Mount { + return vd.mount +} + +// Dentry returns the Dentry associated with vd. It does not take a reference +// on the returned Dentry. +func (vd VirtualDentry) Dentry() *Dentry { + return vd.dentry +} |