Diffstat (limited to 'pkg/sentry/fs/fsutil')
-rw-r--r--  pkg/sentry/fs/fsutil/BUILD                         149
-rw-r--r--  pkg/sentry/fs/fsutil/README.md                     207
-rw-r--r--  pkg/sentry/fs/fsutil/dirty_set.go                  213
-rw-r--r--  pkg/sentry/fs/fsutil/dirty_set_test.go              38
-rw-r--r--  pkg/sentry/fs/fsutil/file.go                       267
-rw-r--r--  pkg/sentry/fs/fsutil/file_range_set.go             208
-rw-r--r--  pkg/sentry/fs/fsutil/frame_ref_set.go               50
-rw-r--r--  pkg/sentry/fs/fsutil/fsutil.go                      26
-rw-r--r--  pkg/sentry/fs/fsutil/handle.go                     126
-rw-r--r--  pkg/sentry/fs/fsutil/handle_test.go                227
-rw-r--r--  pkg/sentry/fs/fsutil/host_file_mapper.go           209
-rw-r--r--  pkg/sentry/fs/fsutil/host_file_mapper_state.go      20
-rw-r--r--  pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go     27
-rw-r--r--  pkg/sentry/fs/fsutil/inode.go                      380
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached.go               845
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached_test.go          403
16 files changed, 3395 insertions, 0 deletions
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
new file mode 100644
index 000000000..4fa6395f7
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -0,0 +1,149 @@
+package(licenses = ["notice"]) # Apache 2.0
+
+load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+load("//tools/go_stateify:defs.bzl", "go_stateify")
+
+go_stateify(
+ name = "fsutil_state",
+ srcs = [
+ "dirty_set_impl.go",
+ "file.go",
+ "file_range_set_impl.go",
+ "frame_ref_set_impl.go",
+ "handle.go",
+ "host_file_mapper.go",
+ "host_file_mapper_state.go",
+ "inode.go",
+ "inode_cached.go",
+ ],
+ out = "fsutil_state.go",
+ package = "fsutil",
+)
+
+go_template_instance(
+ name = "dirty_set_impl",
+ out = "dirty_set_impl.go",
+ imports = {
+ "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap",
+ "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform",
+ },
+ package = "fsutil",
+ prefix = "Dirty",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "memmap.MappableRange",
+ "Value": "DirtyInfo",
+ "Functions": "dirtySetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "frame_ref_set_impl",
+ out = "frame_ref_set_impl.go",
+ imports = {
+ "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform",
+ },
+ package = "fsutil",
+ prefix = "frameRef",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "platform.FileRange",
+ "Value": "uint64",
+ "Functions": "frameRefSetFunctions",
+ },
+)
+
+go_template_instance(
+ name = "file_range_set_impl",
+ out = "file_range_set_impl.go",
+ imports = {
+ "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap",
+ "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform",
+ },
+ package = "fsutil",
+ prefix = "FileRange",
+ template = "//pkg/segment:generic_set",
+ types = {
+ "Key": "uint64",
+ "Range": "memmap.MappableRange",
+ "Value": "uint64",
+ "Functions": "fileRangeSetFunctions",
+ },
+)
+
+go_library(
+ name = "fsutil",
+ srcs = [
+ "dirty_set.go",
+ "dirty_set_impl.go",
+ "file.go",
+ "file_range_set.go",
+ "file_range_set_impl.go",
+ "frame_ref_set.go",
+ "frame_ref_set_impl.go",
+ "fsutil.go",
+ "fsutil_state.go",
+ "handle.go",
+ "host_file_mapper.go",
+ "host_file_mapper_state.go",
+ "host_file_mapper_unsafe.go",
+ "inode.go",
+ "inode_cached.go",
+ ],
+ importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/log",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/device",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/platform",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/state",
+ "//pkg/syserror",
+ "//pkg/tcpip/transport/unix",
+ "//pkg/waiter",
+ ],
+)
+
+go_test(
+ name = "fsutil_x_test",
+ size = "small",
+ srcs = ["handle_test.go"],
+ deps = [
+ ":fsutil",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fs/ramfs/test",
+ "//pkg/sentry/usermem",
+ ],
+)
+
+go_test(
+ name = "fsutil_test",
+ size = "small",
+ srcs = [
+ "dirty_set_test.go",
+ "inode_cached_test.go",
+ ],
+ embed = [":fsutil"],
+ deps = [
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/kernel/time",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/usermem",
+ ],
+)
diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md
new file mode 100644
index 000000000..d3780e9fa
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/README.md
@@ -0,0 +1,207 @@
+This package provides utilities for implementing virtual filesystem objects.
+
+[TOC]
+
+## Page cache
+
+`CachingInodeOperations` implements a page cache for files that cannot use the
+host page cache. Normally these are files that store their data in a remote
+filesystem. This also applies to files that are accessed on a platform that does
+not support directly memory mapping host file descriptors (e.g. the ptrace
+platform).
+
+A `CachingInodeOperations` buffers regions of a single file into memory. It is
+owned by an `fs.Inode`, the in-memory representation of a file (all open file
+descriptors are backed by an `fs.Inode`). The `fs.Inode` provides operations for
+reading memory into a `CachingInodeOperations`, to represent the contents of
+the file in-memory, and for writing memory out, to relieve memory pressure on
+the kernel and to synchronize in-memory changes to filesystems.
+
+A `CachingInodeOperations` enables readable and/or writable memory access to
+file content. Files can be mapped shared or private, see mmap(2). When a file is
+mapped shared, changes to the file via write(2) and truncate(2) are reflected in
+the shared memory region. Conversely, when the shared memory region is modified,
+changes to the file are visible via read(2). Multiple shared mappings of the
+same file are coherent with each other. This is consistent with Linux.
+
+When a file is mapped private, updates to the mapped memory are not visible to
+other memory mappings. Updates to the mapped memory are also not reflected in
+the file content as seen by read(2). If the file is changed after a private
+mapping is created, for instance by write(2), the change to the file may or may
+not be reflected in the private mapping. This is consistent with Linux.
+
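+A minimal Go sketch of this divergence, as an ordinary application would
+observe it (not code from this package; error handling elided):
+
+```
+f, _ := os.Open("data")
+m, _ := syscall.Mmap(int(f.Fd()), 0, 4096,
+	syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_PRIVATE)
+m[0] = 'x' // copy-on-write: visible only through this mapping
+b := make([]byte, 1)
+f.ReadAt(b, 0) // reads the original byte; the private write is invisible
+```
+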
+A `CachingInodeOperations` keeps track of ranges of memory that were modified
+(or "dirtied"). When the file is explicitly synced via fsync(2), only the dirty
+ranges are written out to the filesystem. Any error returned indicates a failure
+to write all dirty memory of a `CachingInodeOperations` to the filesystem. In
+this case the filesystem may be in an inconsistent state. The same operation can
+be performed on the shared memory itself using msync(2). If neither fsync(2) nor
+msync(2) is performed, then the dirty memory is written out in accordance with
+the `CachingInodeOperations` eviction strategy (see below) and there is no
+guarantee that memory will be written out successfully in full.
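+
+A rough sketch of this flow using this package's own API (assuming a populated
+`cache *fsutil.FileRangeSet`, a `platform.File` `mem`, and a hypothetical
+`writeToBackingFile` standing in for the real filesystem write):
+
+```
+var dirty fsutil.DirtySet
+// A write fault dirties the first page.
+dirty.MarkDirty(memmap.MappableRange{Start: 0, End: usermem.PageSize})
+// fsync(2) then writes back only the dirty ranges.
+err := fsutil.SyncDirty(ctx, memmap.MappableRange{Start: 0, End: usermem.PageSize},
+	cache, &dirty, fileSize, mem,
+	func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+		return writeToBackingFile(ctx, srcs, offset)
+	})
+```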
+
+### Memory allocation and eviction
+
+A `CachingInodeOperations` implements the following allocation and eviction
+strategy:
+
+- Memory is allocated and brought up to date with the contents of a file when
+ a region of mapped memory is accessed (or "faulted on").
+
+- Dirty memory is written out to filesystems when an fsync(2) or msync(2)
+ operation is performed on a memory mapped file, for all memory mapped files
+ when saved, and/or when there are no longer any memory mappings of a range
+ of a file, see munmap(2). As the latter implies, in the absence of a panic
+ or SIGKILL, dirty memory is written out for all memory mapped files when an
+ application exits.
+
+- Memory is freed when there are no longer any memory mappings of a range of a
+ file (e.g. when an application exits). This behavior is consistent with
+ Linux for shared memory that has been locked via mlock(2).
+
+Notably, memory is not allocated for read(2) or write(2) operations. This means
+that reads and writes to the file are only accelerated by a
+`CachingInodeOperations` if the file being read or written has been memory
+mapped *and* if the shared memory has been accessed at the region being read or
+written. This diverges from Linux which buffers memory into a page cache on
+read(2) proactively (i.e. readahead) and delays writing it out to filesystems on
+write(2) (i.e. writeback). The absence of these optimizations is not visible to
+applications beyond suboptimal performance when repeatedly reading and/or
+writing to the same region of a file. See [Future Work](#future-work) for plans to
+implement these optimizations.
+
+Additionally, memory held by `CachingInodeOperations` instances is currently
+unbounded in size. A `CachingInodeOperations` does not write out dirty memory
+and free it
+under system memory pressure. This can cause pathological memory usage.
+
+When memory is written back, a `CachingInodeOperations` may write regions of
+shared memory that were never modified. This is due to the strategy of
+minimizing page faults (see below) and handling only a subset of memory write
+faults. In the absence of an application or sentry crash, it is guaranteed that
+if a region of shared memory was written to, it is written back to a filesystem.
+
+### Life of a shared memory mapping
+
+A file is memory mapped via mmap(2). For example, if `A` is an address, an
+application may execute:
+
+```
+mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+```
+
+This creates a shared mapping of fd that reflects 4k of the contents of fd
+starting at offset 0, accessible at address `A`. This in turn creates a virtual
+memory area ("vma") which indicates that [`A`, `A`+0x1000) is now a valid
+address range for this application to access.
+
+At this point, memory has not been allocated in the file's
+`CachingInodeOperations`. It is also the case that the address range [`A`,
+`A`+0x1000) has not been mapped on the host on behalf of the application. If the
+application then tries to modify 8 bytes of the shared memory:
+
+```
+char buffer[] = "aaaaaaaa";
+memcpy(A, buffer, 8);
+```
+
+The host then sends a `SIGSEGV` to the sentry because the address range [`A`,
+`A`+8) is not mapped on the host. The `SIGSEGV` indicates that the memory was
+accessed writable. The sentry looks up the vma associated with [`A`, `A`+8),
+finds the file that was mapped and its `CachingInodeOperations`. It then calls
+`CachingInodeOperations.MapInto`, which allocates memory to back [`A`, `A`+8). It
+may choose to allocate more memory (i.e. do "readahead") to minimize subsequent
+faults.
+
+Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`).
+The host tmpfs file memory is brought up to date with the contents of the mapped
+file on its filesystem. The region of the host tmpfs file that reflects the
+mapped file is then mapped into the host address space of the application so
+that subsequent memory accesses do not repeatedly generate a `SIGSEGV`.
+
+The range that was allocated, including any extra memory allocation to minimize
+faults, is marked dirty due to the write fault. This overcounts dirty memory if
+the extra memory allocated is never modified.
+
+To make the scenario more interesting, imagine that this application spawns
+another process and maps the same file in the exact same way:
+
+```
+mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+```
+
+Imagine that this process then tries to modify the file again but with only 4
+bytes:
+
+```
+char buffer[] = "bbbb";
+memcpy(A, buffer, 4);
+```
+
+Since the first process has already mapped and accessed the same region of the
+file writable, `CachingInodeOperations.MapInto` is called but re-maps the memory
+that has already been allocated (because the host mapping can be invalidated at
+any time) rather than allocating new memory. The address range [`A`, `A`+0x1000)
+reflects the same cached view of the file as the first process sees. For
+example, reading 8 bytes from the file from either process via read(2) starting
+at offset 0 returns a consistent "bbbbaaaa".
+
+When this process no longer needs the shared memory, it may do:
+
+```
+munmap(A, 0x1000);
+```
+
+At this point, the modified memory cached by the `CachingInodeOperations` is not
+written back to the file because it is still in use by the first process that
+mapped it. When the first process also does:
+
+```
+munmap(A, 0x1000);
+```
+
+Then the last memory mapping of the file at the range [0, 0x1000) is gone. The
+file's `CachingInodeOperations` then starts writing back memory marked dirty to
+the file on its filesystem. Once writing completes, regardless of whether it was
+successful, the `CachingInodeOperations` frees the memory cached at the range
+[0, 0x1000).
+
+Subsequent read(2) or write(2) operations on the file go directly to the
+filesystem, since the file's `CachingInodeOperations` no longer holds any
+memory for that range.
+
+## Future Work
+
+### Page cache
+
+The sentry does not yet implement the readahead and writeback optimizations for
+read(2) and write(2) respectively. To do so, on read(2) and/or write(2) the
+sentry must ensure that memory is allocated in a page cache to read or write
+into. However, the sentry cannot boundlessly allocate memory. If it did, the
+host would eventually OOM-kill the sentry+application process. This means that
+the sentry must implement a page cache memory allocation strategy that is
+bounded by a global user- or container-imposed limit. When this limit is
+approached, the sentry must decide which page cache memory to free so that it
+can allocate more memory. If it makes a poor decision, the sentry may
+end up freeing and re-allocating memory to back regions of files that are
+frequently used, nullifying the optimization (and in some cases causing worse
+performance due to the overhead of memory allocation and general management).
+This is a form of "cache thrashing".
+
+In Linux, much research has been done to select and implement a lightweight but
+optimal page cache eviction algorithm. Linux makes use of hardware page bits to
+keep track of whether memory has been accessed. The sentry does not have direct
+access to hardware. Implementing a similarly lightweight and optimal page cache
+eviction algorithm will need to either introduce a kernel interface to obtain
+these page bits or find a suitable alternative proxy for access events.
+
+In Linux, readahead happens by default but is not always ideal. For instance,
+for files that are not read sequentially, it would be better to read only the
+requested regions of the file rather than to optimistically cache some number
+of bytes ahead of the read (up to 2MB in Linux) when the cached bytes won't be
+accessed. Linux implements the fadvise64(2) system call for applications to
+specify that a range of a file will not be accessed sequentially. The advice bit
+FADV_RANDOM turns off the readahead optimization for the given range in the
+given file. However, fadvise64 is rarely used by applications, so Linux
+implements a readahead backoff strategy if reads are not sequential. To ensure
+that application performance is not degraded, the sentry must implement a
+similar backoff strategy.
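+
+As a reference for the application-side knob, a hedged Go sketch of disabling
+readahead with FADV_RANDOM (using golang.org/x/sys/unix; error handling
+elided):
+
+```
+f, _ := os.Open("data")
+// Length 0 applies the advice from the offset to the end of the file.
+unix.Fadvise(int(f.Fd()), 0, 0, unix.FADV_RANDOM)
+```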
diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go
new file mode 100644
index 000000000..9c6c98542
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/dirty_set.go
@@ -0,0 +1,213 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to
+// implement Mappables that cache data from another source.
+//
+// type DirtySet <generated by go_generics>
+
+// DirtyInfo is the value type of DirtySet, and represents information about a
+// Mappable offset that is dirty (the cached data for that offset is newer than
+// its source).
+type DirtyInfo struct {
+ // Keep is true if the represented offset is concurrently writable, such
+ // that writing the data for that offset back to the source does not
+ // guarantee that the offset is clean (since it may be concurrently
+ // rewritten after the writeback).
+ Keep bool
+}
+
+// dirtySetFunctions implements segment.Functions for DirtySet.
+type dirtySetFunctions struct{}
+
+// MinKey implements segment.Functions.MinKey.
+func (dirtySetFunctions) MinKey() uint64 {
+ return 0
+}
+
+// MaxKey implements segment.Functions.MaxKey.
+func (dirtySetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+// ClearValue implements segment.Functions.ClearValue.
+func (dirtySetFunctions) ClearValue(val *DirtyInfo) {
+}
+
+// Merge implements segment.Functions.Merge.
+func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) {
+ if val1 != val2 {
+ return DirtyInfo{}, false
+ }
+ return val1, true
+}
+
+// Split implements segment.Functions.Split.
+func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) {
+ return val, val
+}
+
+// MarkClean marks all offsets in mr as not dirty, except for those to which
+// KeepDirty has been applied.
+func (ds *DirtySet) MarkClean(mr memmap.MappableRange) {
+ seg := ds.LowerBoundSegment(mr.Start)
+ for seg.Ok() && seg.Start() < mr.End {
+ if seg.Value().Keep {
+ seg = seg.NextSegment()
+ continue
+ }
+ seg = ds.Isolate(seg, mr)
+ seg = ds.Remove(seg).NextSegment()
+ }
+}
+
+// KeepClean marks all offsets in mr as not dirty, even those that were
+// previously kept dirty by KeepDirty.
+func (ds *DirtySet) KeepClean(mr memmap.MappableRange) {
+ ds.RemoveRange(mr)
+}
+
+// MarkDirty marks all offsets in mr as dirty.
+func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) {
+ ds.setDirty(mr, false)
+}
+
+// KeepDirty marks all offsets in mr as dirty and prevents them from being
+// marked as clean by MarkClean.
+func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) {
+ ds.setDirty(mr, true)
+}
+
+func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) {
+ var changedAny bool
+ defer func() {
+ if changedAny {
+ ds.MergeRange(mr)
+ }
+ }()
+ seg, gap := ds.Find(mr.Start)
+ for {
+ switch {
+ case seg.Ok() && seg.Start() < mr.End:
+ if keep && !seg.Value().Keep {
+ changedAny = true
+ seg = ds.Isolate(seg, mr)
+ seg.ValuePtr().Keep = true
+ }
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok() && gap.Start() < mr.End:
+ changedAny = true
+ seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep})
+ seg, gap = seg.NextNonEmpty()
+
+ default:
+ return
+ }
+ }
+}
+
+// SyncDirty passes pages in the range mr that are stored in cache and
+// identified as dirty to writeAt, updating dirty to reflect successful writes.
+// If writeAt returns a successful partial write, SyncDirty will call it
+// repeatedly until all bytes have been written. max is the true size of the
+// cached object; offsets beyond max will not be passed to writeAt, even if
+// they are marked dirty.
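+//
+// A sketch of a writeAt callback; backingFile is a hypothetical handle to the
+// file on its backing filesystem:
+//
+//	writeAt := func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+//		return backingFile.WriteFromBlocksAt(ctx, srcs, offset)
+//	}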
+func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+ var changedDirty bool
+ defer func() {
+ if changedDirty {
+ dirty.MergeRange(mr)
+ }
+ }()
+ dseg := dirty.LowerBoundSegment(mr.Start)
+ for dseg.Ok() && dseg.Start() < mr.End {
+ var dr memmap.MappableRange
+ if dseg.Value().Keep {
+ dr = dseg.Range().Intersect(mr)
+ } else {
+ changedDirty = true
+ dseg = dirty.Isolate(dseg, mr)
+ dr = dseg.Range()
+ }
+ if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil {
+ return err
+ }
+ if dseg.Value().Keep {
+ dseg = dseg.NextSegment()
+ } else {
+ dseg = dirty.Remove(dseg).NextSegment()
+ }
+ }
+ return nil
+}
+
+// SyncDirtyAll passes all pages stored in cache identified as dirty to
+// writeAt, updating dirty to reflect successful writes. If writeAt returns a
+// successful partial write, SyncDirtyAll will call it repeatedly until all
+// bytes have been written. max is the true size of the cached object; offsets
+// beyond max will not be passed to writeAt, even if they are marked dirty.
+func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+ dseg := dirty.FirstSegment()
+ for dseg.Ok() {
+ if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
+ return err
+ }
+ if dseg.Value().Keep {
+ dseg = dseg.NextSegment()
+ } else {
+ dseg = dirty.Remove(dseg).NextSegment()
+ }
+ }
+ return nil
+}
+
+// Preconditions: mr must be page-aligned.
+func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
+ for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
+ wbr := cseg.Range().Intersect(mr)
+ if max < wbr.Start {
+ break
+ }
+ ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), usermem.Read)
+ if err != nil {
+ return err
+ }
+ if max < wbr.End {
+ ims = ims.TakeFirst64(max - wbr.Start)
+ }
+ offset := wbr.Start
+ for !ims.IsEmpty() {
+ n, err := writeAt(ctx, ims, offset)
+ if err != nil {
+ return err
+ }
+ offset += n
+ ims = ims.DropFirst64(n)
+ }
+ }
+ return nil
+}
diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go
new file mode 100644
index 000000000..f7693cb19
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/dirty_set_test.go
@@ -0,0 +1,38 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "reflect"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+func TestDirtySet(t *testing.T) {
+ var set DirtySet
+ set.MarkDirty(memmap.MappableRange{0, 2 * usermem.PageSize})
+ set.KeepDirty(memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize})
+ set.MarkClean(memmap.MappableRange{0, 2 * usermem.PageSize})
+ want := &DirtySegmentDataSlices{
+ Start: []uint64{usermem.PageSize},
+ End: []uint64{2 * usermem.PageSize},
+ Values: []DirtyInfo{{Keep: true}},
+ }
+ if got := set.ExportSortedSlices(); !reflect.DeepEqual(got, want) {
+ t.Errorf("set:\n\tgot %v,\n\twant %v", got, want)
+ }
+}
diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go
new file mode 100644
index 000000000..a7329f1c9
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/file.go
@@ -0,0 +1,267 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/arch"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// NoopRelease implements FileOperations.Release for files that have no
+// resources to release.
+type NoopRelease struct{}
+
+// Release is a no-op.
+func (NoopRelease) Release() {}
+
+// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor
+// is not nil and the seek was on a directory, the cursor will be updated.
+//
+// Currently only seeking to 0 on a directory is supported.
+//
+// FIXME: Lift directory seeking limitations.
+func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) {
+ inode := file.Dirent.Inode
+ current := file.Offset()
+
+ // Does the Inode represent a non-seekable type?
+ if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) {
+ return current, syserror.ESPIPE
+ }
+
+ // Does the Inode represent a character device?
+ if fs.IsCharDevice(inode.StableAttr) {
+ // Ignore seek requests.
+ //
+ // FIXME: This preserves existing
+ // behavior but is not universally correct.
+ return 0, nil
+ }
+
+ // Otherwise compute the new offset.
+ switch whence {
+ case fs.SeekSet:
+ switch inode.StableAttr.Type {
+ case fs.RegularFile, fs.SpecialFile, fs.BlockDevice:
+ if offset < 0 {
+ return current, syserror.EINVAL
+ }
+ return offset, nil
+ case fs.Directory, fs.SpecialDirectory:
+ if offset != 0 {
+ return current, syserror.EINVAL
+ }
+ // SEEK_SET to 0 moves the directory "cursor" to the beginning.
+ if dirCursor != nil {
+ *dirCursor = ""
+ }
+ return 0, nil
+ default:
+ return current, syserror.EINVAL
+ }
+ case fs.SeekCurrent:
+ switch inode.StableAttr.Type {
+ case fs.RegularFile, fs.SpecialFile, fs.BlockDevice:
+ if current+offset < 0 {
+ return current, syserror.EINVAL
+ }
+ return current + offset, nil
+ case fs.Directory, fs.SpecialDirectory:
+ if offset != 0 {
+ return current, syserror.EINVAL
+ }
+ return current, nil
+ default:
+ return current, syserror.EINVAL
+ }
+ case fs.SeekEnd:
+ switch inode.StableAttr.Type {
+ case fs.RegularFile, fs.BlockDevice:
+ // Allow the file to determine the end.
+ uattr, err := inode.UnstableAttr(ctx)
+ if err != nil {
+ return current, err
+ }
+ sz := uattr.Size
+ if sz+offset < 0 {
+ return current, syserror.EINVAL
+ }
+ return sz + offset, nil
+ // FIXME: This is not universally correct.
+ // Remove SpecialDirectory.
+ case fs.SpecialDirectory:
+ if offset != 0 {
+ return current, syserror.EINVAL
+ }
+ // SEEK_END to 0 moves the directory "cursor" to the end.
+ //
+ // FIXME: This ensures that after the seek,
+ // reading on the directory will get EOF. But it is not
+ // correct in general because the directory can grow in
+ // size; attempting to read those new entries will be
+ // futile (EOF will always be the result).
+ return fs.FileMaxOffset, nil
+ default:
+ return current, syserror.EINVAL
+ }
+ }
+
+ // Not a valid seek request.
+ return current, syserror.EINVAL
+}
+
+// GenericSeek implements FileOperations.Seek for files that use a generic
+// seek implementation.
+type GenericSeek struct{}
+
+// Seek implements fs.FileOperations.Seek.
+func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+ return SeekWithDirCursor(ctx, file, whence, offset, nil)
+}
+
+// ZeroSeek implements FileOperations.Seek for files that maintain a constant
+// zero-value offset and require a no-op Seek.
+type ZeroSeek struct{}
+
+// Seek implements FileOperations.Seek.
+func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
+ return 0, nil
+}
+
+// PipeSeek implements FileOperations.Seek and can be used for files that behave
+// like pipes (seeking is not supported).
+type PipeSeek struct{}
+
+// Seek implements FileOperations.Seek.
+func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) {
+ return 0, syserror.ESPIPE
+}
+
+// NotDirReaddir implements FileOperations.Readdir for non-directories.
+type NotDirReaddir struct{}
+
+// Readdir implements FileOperations.NotDirReaddir.
+func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) {
+ return 0, syserror.ENOTDIR
+}
+
+// NoFsync implements FileOperations.Fsync for files that don't support syncing.
+type NoFsync struct{}
+
+// Fsync implements FileOperations.Fsync.
+func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
+ return syserror.EINVAL
+}
+
+// NoopFsync implements FileOperations.Fsync for files that don't need to be synced.
+type NoopFsync struct{}
+
+// Fsync implements FileOperations.Fsync.
+func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error {
+ return nil
+}
+
+// NoopFlush implements FileOperations.Flush as a no-op.
+type NoopFlush struct{}
+
+// Flush implements FileOperations.Flush.
+func (NoopFlush) Flush(context.Context, *fs.File) error {
+ return nil
+}
+
+// NoMMap implements fs.FileOperations.Mappable for files that cannot
+// be memory mapped.
+type NoMMap struct{}
+
+// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
+func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error {
+ return syserror.ENODEV
+}
+
+// GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most
+// filesystems that support memory mapping.
+func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error {
+ opts.Mappable = m
+ opts.MappingIdentity = file
+ file.IncRef()
+ return nil
+}
+
+// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement
+// the ioctl syscall.
+type NoIoctl struct{}
+
+// Ioctl implements fs.FileOperations.Ioctl.
+func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+ return 0, syserror.ENOTTY
+}
+
+// DirFileOperations implements FileOperations for directories.
+type DirFileOperations struct {
+ waiter.AlwaysReady `state:"nosave"`
+ NoopRelease `state:"nosave"`
+ GenericSeek `state:"nosave"`
+ NoFsync `state:"nosave"`
+ NoopFlush `state:"nosave"`
+ NoMMap `state:"nosave"`
+ NoIoctl `state:"nosave"`
+
+ // dentryMap is a SortedDentryMap used to implement Readdir.
+ dentryMap *fs.SortedDentryMap
+
+ // dirCursor contains the name of the last directory entry that was
+ // serialized.
+ dirCursor string
+}
+
+// NewDirFileOperations returns a new DirFileOperations that will iterate the
+// given dentry map.
+func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations {
+ return &DirFileOperations{
+ dentryMap: dentries,
+ }
+}
+
+// IterateDir implements DirIterator.IterateDir.
+func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+ n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap)
+ return offset + n, err
+}
+
+// Readdir implements FileOperations.Readdir.
+func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+ root := fs.RootFromContext(ctx)
+ defer root.DecRef()
+ dirCtx := &fs.DirCtx{
+ Serializer: serializer,
+ DirCursor: &dfo.dirCursor,
+ }
+ return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset())
+}
+
+// Read implements FileOperations.Read.
+func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syserror.EISDIR
+}
+
+// Write implements FileOperations.Write.
+func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) {
+ return 0, syserror.EISDIR
+}
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
new file mode 100644
index 000000000..da6949ccb
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -0,0 +1,208 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "fmt"
+ "io"
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// FileRangeSet maps offsets into a memmap.Mappable to offsets into a
+// platform.File. It is used to implement Mappables that store data in
+// sparsely-allocated memory.
+//
+// type FileRangeSet <generated by go_generics>
+
+// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
+type fileRangeSetFunctions struct{}
+
+// MinKey implements segment.Functions.MinKey.
+func (fileRangeSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+// MaxKey implements segment.Functions.MaxKey.
+func (fileRangeSetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+// ClearValue implements segment.Functions.ClearValue.
+func (fileRangeSetFunctions) ClearValue(_ *uint64) {
+}
+
+// Merge implements segment.Functions.Merge.
+func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
+ if frstart1+mr1.Length() != frstart2 {
+ return 0, false
+ }
+ return frstart1, true
+}
+
+// Split implements segment.Functions.Split.
+func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
+ return frstart, frstart + (split - mr.Start)
+}
+
+// FileRange returns the FileRange mapped by seg.
+func (seg FileRangeIterator) FileRange() platform.FileRange {
+ return seg.FileRangeOf(seg.Range())
+}
+
+// FileRangeOf returns the FileRange mapped by mr.
+//
+// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0.
+func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange {
+ frstart := seg.Value() + (mr.Start - seg.Start())
+ return platform.FileRange{frstart, frstart + mr.Length()}
+}
+
+// Fill attempts to ensure that all memmap.Mappable offsets in required are
+// mapped to a platform.File offset, by allocating from mem with the given
+// memory usage kind and invoking readAt to store data into memory. (If readAt
+// returns a successful partial read, Fill will call it repeatedly until all
+// bytes have been read.) EOF is handled consistently with the requirements of
+// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are
+// invalid.
+//
+// Fill may read offsets outside of required, but will never read offsets
+// outside of optional. It returns a non-nil error if any error occurs, even
+// if the error only affects offsets in optional, but not in required.
+//
+// Preconditions: required.Length() > 0. optional.IsSupersetOf(required).
+// required and optional must be page-aligned.
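+//
+// A sketch of a readAt callback, assuming a hypothetical io.ReaderAt backing:
+//
+//	readAt := func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+//		r := io.NewSectionReader(backing, int64(offset), int64(dsts.NumBytes()))
+//		return safemem.FromIOReader{Reader: r}.ReadToBlocks(dsts)
+//	}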
+func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
+ gap := frs.LowerBoundGap(required.Start)
+ for gap.Ok() && gap.Start() < required.End {
+ if gap.Range().Length() == 0 {
+ gap = gap.NextGap()
+ continue
+ }
+ gr := gap.Range().Intersect(optional)
+
+ // Read data into the gap.
+ fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
+ var done uint64
+ for !dsts.IsEmpty() {
+ n, err := readAt(ctx, dsts, gr.Start+done)
+ done += n
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ if err == io.EOF {
+ // platform.AllocateAndFill truncates down to a page
+ // boundary, but FileRangeSet.Fill is supposed to
+ // zero-fill to the end of the page in this case.
+ donepgaddr, ok := usermem.Addr(done).RoundUp()
+ if donepg := uint64(donepgaddr); ok && donepg != done {
+ dsts = dsts.DropFirst64(donepg - done)
+ done = donepg
+ if dsts.IsEmpty() {
+ return done, nil
+ }
+ }
+ }
+ return done, err
+ }
+ }
+ return done, nil
+ }))
+
+ // Store anything we managed to read into the cache.
+ if done := fr.Length(); done != 0 {
+ gr.End = gr.Start + done
+ gap = frs.Insert(gap, gr, fr.Start).NextGap()
+ }
+
+ if err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+// Drop removes segments for memmap.Mappable offsets in mr, freeing the
+// corresponding platform.FileRanges.
+//
+// Preconditions: mr must be page-aligned.
+func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) {
+ seg := frs.LowerBoundSegment(mr.Start)
+ for seg.Ok() && seg.Start() < mr.End {
+ seg = frs.Isolate(seg, mr)
+ mem.DecRef(seg.FileRange())
+ seg = frs.Remove(seg).NextSegment()
+ }
+}
+
+// DropAll removes all segments in mr, freeing the corresponding
+// platform.FileRanges.
+func (frs *FileRangeSet) DropAll(mem platform.Memory) {
+ for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+ mem.DecRef(seg.FileRange())
+ }
+ frs.RemoveAll()
+}
+
+// Truncate updates frs to reflect Mappable truncation to the given length:
+// bytes after the new EOF on the same page are zeroed, and pages after the new
+// EOF are freed.
+func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) {
+ pgendaddr, ok := usermem.Addr(end).RoundUp()
+ if ok {
+ pgend := uint64(pgendaddr)
+
+ // Free truncated pages.
+ frs.SplitAt(pgend)
+ seg := frs.LowerBoundSegment(pgend)
+ for seg.Ok() {
+ mem.DecRef(seg.FileRange())
+ seg = frs.Remove(seg).NextSegment()
+ }
+
+ if end == pgend {
+ return
+ }
+ }
+
+ // Here we know end < end.RoundUp(). If the new EOF lands in the
+ // middle of a page that we have, zero out its contents beyond the new
+ // length.
+ seg := frs.FindSegment(end)
+ if seg.Ok() {
+ fr := seg.FileRange()
+ fr.Start += end - seg.Start()
+ ims, err := mem.MapInternal(fr, usermem.Write)
+ if err != nil {
+ // There's no good recourse from here. This means
+ // that we can't keep cached memory consistent with
+ // the new end of file. The caller may have already
+ // updated the file size on their backing file system.
+ //
+ // We don't want to risk blindly continuing onward,
+ // so in the extremely rare cases this does happen,
+ // we abandon ship.
+ panic(fmt.Sprintf("Failed to map %v: %v", fr, err))
+ }
+ if _, err := safemem.ZeroSeq(ims); err != nil {
+ panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err))
+ }
+ }
+}
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
new file mode 100644
index 000000000..14dece315
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -0,0 +1,50 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "math"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+)
+
+type frameRefSetFunctions struct{}
+
+// MinKey implements segment.Functions.MinKey.
+func (frameRefSetFunctions) MinKey() uint64 {
+ return 0
+}
+
+// MaxKey implements segment.Functions.MaxKey.
+func (frameRefSetFunctions) MaxKey() uint64 {
+ return math.MaxUint64
+}
+
+// ClearValue implements segment.Functions.ClearValue.
+func (frameRefSetFunctions) ClearValue(val *uint64) {
+}
+
+// Merge implements segment.Functions.Merge.
+func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) {
+ if val1 != val2 {
+ return 0, false
+ }
+ return val1, true
+}
+
+// Split implements segment.Functions.Split.
+func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
+ return val, val
+}
diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go
new file mode 100644
index 000000000..6fe4ef13d
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/fsutil.go
@@ -0,0 +1,26 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fsutil provides utilities for implementing fs.InodeOperations
+// and fs.FileOperations:
+//
+// - For embeddable utilities, see inode.go and file.go.
+//
+// - For fs.Inodes that require a page cache to be memory mapped, see
+// inode_cached.go.
+//
+// - For fs.Files that implement fs.HandleOps, see handle.go.
+//
+// - For anon fs.Inodes, see anon.go.
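+//
+// Implementations typically embed these utilities to satisfy the interface
+// methods they do not care about, e.g. (a sketch; the struct is illustrative):
+//
+//	type fileOperations struct {
+//		fsutil.NoopRelease `state:"nosave"`
+//		fsutil.GenericSeek `state:"nosave"`
+//		fsutil.NoopFsync   `state:"nosave"`
+//		fsutil.NoopFlush   `state:"nosave"`
+//		fsutil.NoMMap      `state:"nosave"`
+//		fsutil.NoIoctl     `state:"nosave"`
+//	}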
+package fsutil
diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go
new file mode 100644
index 000000000..149c0f84a
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/handle.go
@@ -0,0 +1,126 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// Handle implements FileOperations.
+//
+// FIXME: Remove Handle entirely in favor of individual fs.File
+// implementations using simple generic utilities.
+type Handle struct {
+ NoopRelease `state:"nosave"`
+ NoIoctl `state:"nosave"`
+ HandleOperations fs.HandleOperations
+
+ // dirCursor is the directory cursor.
+ dirCursor string
+}
+
+// NewHandle returns a File backed by the Dirent and FileFlags.
+func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File {
+ if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) {
+ // Allow reading/writing at an arbitrary offset for non-pipes
+ // and non-sockets.
+ flags.Pread = true
+ flags.Pwrite = true
+ }
+
+ return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops})
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask {
+ return h.HandleOperations.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+ h.HandleOperations.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (h *Handle) EventUnregister(e *waiter.Entry) {
+ h.HandleOperations.EventUnregister(e)
+}
+
+// Readdir implements FileOperations.Readdir.
+func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) {
+ root := fs.RootFromContext(ctx)
+ defer root.DecRef()
+ dirCtx := &fs.DirCtx{
+ Serializer: serializer,
+ DirCursor: &h.dirCursor,
+ }
+ n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset())
+ return n, err
+}
+
+// Seek implements FileOperations.Seek.
+func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) {
+ return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor)
+}
+
+// IterateDir implements DirIterator.IterateDir.
+func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) {
+ return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset)
+}
+
+// Read implements FileOperations.Read.
+func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset)
+}
+
+// Write implements FileOperations.Write.
+func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
+ return h.HandleOperations.DeprecatedPwritev(ctx, src, offset)
+}
+
+// Fsync implements FileOperations.Fsync.
+func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error {
+ switch syncType {
+ case fs.SyncAll, fs.SyncData:
+ // Write out metadata.
+ if err := file.Dirent.Inode.WriteOut(ctx); err != nil {
+ return err
+ }
+ fallthrough
+ case fs.SyncBackingStorage:
+ // Use DeprecatedFsync to sync disks.
+ return h.HandleOperations.DeprecatedFsync()
+ }
+ panic("invalid sync type")
+}
+
+// Flush implements FileOperations.Flush.
+func (h *Handle) Flush(context.Context, *fs.File) error {
+ return h.HandleOperations.DeprecatedFlush()
+}
+
+// ConfigureMMap implements FileOperations.ConfigureMMap.
+func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error {
+ mappable := file.Dirent.Inode.Mappable()
+ if mappable == nil {
+ return syserror.ENODEV
+ }
+ return GenericConfigureMMap(file, mappable, opts)
+}
diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go
new file mode 100644
index 000000000..d94c3eb0d
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/handle_test.go
@@ -0,0 +1,227 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil_test
+
+import (
+ "io"
+ "syscall"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
+ ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type testInodeOperations struct {
+ fs.InodeOperations
+ fs.InodeType
+ FileSize int64
+ writes uint
+ reads uint
+}
+
+func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+ return fs.UnstableAttr{Size: t.FileSize}, nil
+}
+
+// Check implements InodeOperations.Check.
+func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
+ t.reads++
+ return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset)
+}
+
+func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ t.writes++
+ return t.InodeOperations.DeprecatedPwritev(ctx, src, offset)
+}
+
+// testHandle returns a handle for a test node.
+//
+// The size of the node is fixed at 20 bytes.
+func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) {
+ ctx := contexttest.Context(t)
+ m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
+ n := &testInodeOperations{
+ InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}),
+ FileSize: 20,
+ }
+ d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test")
+ return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n
+}
+
+func TestHandleOps(t *testing.T) {
+ h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile)
+ defer h.DecRef()
+
+ // Make sure a write request works.
+ if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil {
+ t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err)
+ }
+ if n.writes != 1 {
+ t.Errorf("found %d writes, expected 1", n.writes)
+ }
+
+ // Make sure a read request works.
+ dst := make([]byte, 1)
+ if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) {
+ t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err)
+ }
+ if dst[0] != 'a' {
+ t.Errorf("Preadv: read %q, wanted 'a'", dst[0])
+ }
+ if n.reads != 1 {
+ t.Errorf("found %d reads, expected 1", n.reads)
+ }
+}
+
+type seekTest struct {
+ whence fs.SeekWhence
+ offset int64
+ result int64
+ err error
+}
+
+type seekSuite struct {
+ nodeType fs.InodeType
+ cases []seekTest
+}
+
+// FIXME: This is currently missing fs.SeekEnd tests due to the
+// fact that NullInodeOperations returns an error on stat.
+func TestHandleSeek(t *testing.T) {
+ ts := []seekSuite{
+ {
+ nodeType: fs.RegularFile,
+ cases: []seekTest{
+ {fs.SeekSet, 0, 0, nil},
+ {fs.SeekSet, 10, 10, nil},
+ {fs.SeekSet, -5, 10, syscall.EINVAL},
+ {fs.SeekCurrent, -1, 9, nil},
+ {fs.SeekCurrent, 2, 11, nil},
+ {fs.SeekCurrent, -12, 11, syscall.EINVAL},
+ {fs.SeekEnd, -1, 19, nil},
+ {fs.SeekEnd, 0, 20, nil},
+ {fs.SeekEnd, 2, 22, nil},
+ },
+ },
+ {
+ nodeType: fs.Directory,
+ cases: []seekTest{
+ {fs.SeekSet, 0, 0, nil},
+ {fs.SeekSet, 10, 0, syscall.EINVAL},
+ {fs.SeekSet, -5, 0, syscall.EINVAL},
+ {fs.SeekCurrent, 0, 0, nil},
+ {fs.SeekCurrent, 11, 0, syscall.EINVAL},
+ {fs.SeekCurrent, -6, 0, syscall.EINVAL},
+ {fs.SeekEnd, 0, 0, syscall.EINVAL},
+ {fs.SeekEnd, -1, 0, syscall.EINVAL},
+ {fs.SeekEnd, 2, 0, syscall.EINVAL},
+ },
+ },
+ {
+ nodeType: fs.Symlink,
+ cases: []seekTest{
+ {fs.SeekSet, 5, 0, syscall.EINVAL},
+ {fs.SeekSet, -5, 0, syscall.EINVAL},
+ {fs.SeekSet, 0, 0, syscall.EINVAL},
+ {fs.SeekCurrent, 5, 0, syscall.EINVAL},
+ {fs.SeekCurrent, -5, 0, syscall.EINVAL},
+ {fs.SeekCurrent, 0, 0, syscall.EINVAL},
+ {fs.SeekEnd, 5, 0, syscall.EINVAL},
+ {fs.SeekEnd, -5, 0, syscall.EINVAL},
+ {fs.SeekEnd, 0, 0, syscall.EINVAL},
+ },
+ },
+ {
+ nodeType: fs.Pipe,
+ cases: []seekTest{
+ {fs.SeekSet, 5, 0, syscall.ESPIPE},
+ {fs.SeekSet, -5, 0, syscall.ESPIPE},
+ {fs.SeekSet, 0, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, 5, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, -5, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, 0, 0, syscall.ESPIPE},
+ {fs.SeekEnd, 5, 0, syscall.ESPIPE},
+ {fs.SeekEnd, -5, 0, syscall.ESPIPE},
+ {fs.SeekEnd, 0, 0, syscall.ESPIPE},
+ },
+ },
+ {
+ nodeType: fs.Socket,
+ cases: []seekTest{
+ {fs.SeekSet, 5, 0, syscall.ESPIPE},
+ {fs.SeekSet, -5, 0, syscall.ESPIPE},
+ {fs.SeekSet, 0, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, 5, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, -5, 0, syscall.ESPIPE},
+ {fs.SeekCurrent, 0, 0, syscall.ESPIPE},
+ {fs.SeekEnd, 5, 0, syscall.ESPIPE},
+ {fs.SeekEnd, -5, 0, syscall.ESPIPE},
+ {fs.SeekEnd, 0, 0, syscall.ESPIPE},
+ },
+ },
+ {
+ nodeType: fs.CharacterDevice,
+ cases: []seekTest{
+ {fs.SeekSet, 5, 0, nil},
+ {fs.SeekSet, -5, 0, nil},
+ {fs.SeekSet, 0, 0, nil},
+ {fs.SeekCurrent, 5, 0, nil},
+ {fs.SeekCurrent, -5, 0, nil},
+ {fs.SeekCurrent, 0, 0, nil},
+ {fs.SeekEnd, 5, 0, nil},
+ {fs.SeekEnd, -5, 0, nil},
+ {fs.SeekEnd, 0, 0, nil},
+ },
+ },
+ {
+ nodeType: fs.BlockDevice,
+ cases: []seekTest{
+ {fs.SeekSet, 0, 0, nil},
+ {fs.SeekSet, 10, 10, nil},
+ {fs.SeekSet, -5, 10, syscall.EINVAL},
+ {fs.SeekCurrent, -1, 9, nil},
+ {fs.SeekCurrent, 2, 11, nil},
+ {fs.SeekCurrent, -12, 11, syscall.EINVAL},
+ {fs.SeekEnd, -1, 19, nil},
+ {fs.SeekEnd, 0, 20, nil},
+ {fs.SeekEnd, 2, 22, nil},
+ },
+ },
+ }
+
+ for _, s := range ts {
+ h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType)
+ defer h.DecRef()
+
+ for _, c := range s.cases {
+ // Try the given seek.
+ offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset)
+ if err != c.err {
+ t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err)
+ }
+ if err == nil && offset != c.result {
+ t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset)
+ }
+ }
+ }
+}
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go
new file mode 100644
index 000000000..d0a27fc1c
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/host_file_mapper.go
@@ -0,0 +1,209 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "fmt"
+ "sync"
+ "syscall"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// HostFileMapper caches mappings of an arbitrary host file descriptor. It is
+// used by implementations of memmap.Mappable that represent a host file
+// descriptor.
+type HostFileMapper struct {
+ // HostFileMapper conceptually breaks the file into pieces called chunks,
+ // of size and alignment chunkSize, and caches mappings of the file at
+ // chunk granularity.
+
+ refsMu sync.Mutex `state:"nosave"`
+
+ // refs maps chunk start offsets to the sum of reference counts for all
+ // pages in that chunk. refs is protected by refsMu.
+ refs map[uint64]int32
+
+ mapsMu sync.Mutex `state:"nosave"`
+
+ // mappings maps chunk start offsets to mappings of those chunks,
+ // obtained by calling syscall.Mmap. mappings is protected by
+ // mapsMu.
+ mappings map[uint64]mapping `state:"nosave"`
+}
+
+const (
+ chunkShift = usermem.HugePageShift
+ chunkSize = 1 << chunkShift
+ chunkMask = chunkSize - 1
+)
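+
+// For example, with usermem.HugePageShift = 21 (the x86_64 huge page shift),
+// chunkSize is 2 MiB, so each cached host mapping covers 512 4-KiB pages.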
+
+func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 {
+ return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / usermem.PageSize)
+}
+
+type mapping struct {
+ addr uintptr
+ writable bool
+}
+
+// NewHostFileMapper returns a HostFileMapper with no references or cached
+// mappings.
+func NewHostFileMapper() *HostFileMapper {
+ return &HostFileMapper{
+ refs: make(map[uint64]int32),
+ mappings: make(map[uint64]mapping),
+ }
+}
+
+// IncRefOn increments the reference count on all offsets in mr.
+//
+// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) {
+ f.refsMu.Lock()
+ defer f.refsMu.Unlock()
+ for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize {
+ refs := f.refs[chunkStart]
+ pgs := pagesInChunk(mr, chunkStart)
+ if refs+pgs < refs {
+ // Would overflow.
+ panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs))
+ }
+ f.refs[chunkStart] = refs + pgs
+ }
+}
+
+// DecRefOn decrements the reference count on all offsets in mr.
+//
+// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned.
+func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) {
+ f.refsMu.Lock()
+ defer f.refsMu.Unlock()
+ for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize {
+ refs := f.refs[chunkStart]
+ pgs := pagesInChunk(mr, chunkStart)
+ switch {
+ case refs > pgs:
+ f.refs[chunkStart] = refs - pgs
+ case refs == pgs:
+ f.mapsMu.Lock()
+ delete(f.refs, chunkStart)
+ if m, ok := f.mappings[chunkStart]; ok {
+ f.unmapAndRemoveLocked(chunkStart, m)
+ }
+ f.mapsMu.Unlock()
+ case refs < pgs:
+ panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs))
+ }
+ }
+}
+
+// MapInternal returns a mapping of offsets in fr from fd. The returned
+// safemem.BlockSeq is valid as long as at least one reference is held on all
+// offsets in fr or until the next call to UnmapAll.
+//
+// Preconditions: The caller must hold a reference on all offsets in fr.
+func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) {
+ chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
+ f.mapsMu.Lock()
+ defer f.mapsMu.Unlock()
+ if chunks == 1 {
+ // Avoid an unnecessary slice allocation.
+ var seq safemem.BlockSeq
+ err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) {
+ seq = safemem.BlockSeqOf(b)
+ })
+ return seq, err
+ }
+ blocks := make([]safemem.Block, 0, chunks)
+ err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) {
+ blocks = append(blocks, b)
+ })
+ return safemem.BlockSeqFromSlice(blocks), err
+}
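+
+// A typical caller brackets MapInternal with IncRefOn/DecRefOn on the
+// corresponding mappable range. A minimal sketch (hostFD is a hypothetical
+// host file descriptor owned by the caller):
+//
+//	mr := memmap.MappableRange{0, usermem.PageSize}
+//	f.IncRefOn(mr)
+//	defer f.DecRefOn(mr)
+//	seq, err := f.MapInternal(platform.FileRange{0, usermem.PageSize}, hostFD, false /* write */)
+//	if err != nil {
+//		return err
+//	}
+//	n, err := safemem.CopySeq(dsts, seq) // copy out through the cached mapping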
+
+// Preconditions: f.mapsMu must be locked.
+func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error {
+ prot := syscall.PROT_READ
+ if write {
+ prot |= syscall.PROT_WRITE
+ }
+ for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
+ m, ok := f.mappings[chunkStart]
+ if !ok {
+ addr, _, errno := syscall.Syscall6(
+ syscall.SYS_MMAP,
+ 0,
+ chunkSize,
+ uintptr(prot),
+ syscall.MAP_SHARED,
+ uintptr(fd),
+ uintptr(chunkStart))
+ if errno != 0 {
+ return errno
+ }
+ m = mapping{addr, write}
+ f.mappings[chunkStart] = m
+ } else if write && !m.writable {
+ addr, _, errno := syscall.Syscall6(
+ syscall.SYS_MMAP,
+ m.addr,
+ chunkSize,
+ uintptr(prot),
+ syscall.MAP_SHARED|syscall.MAP_FIXED,
+ uintptr(fd),
+ uintptr(chunkStart))
+ if errno != 0 {
+ return errno
+ }
+ m = mapping{addr, write}
+ f.mappings[chunkStart] = m
+ }
+ var startOff uint64
+ if chunkStart < fr.Start {
+ startOff = fr.Start - chunkStart
+ }
+ endOff := uint64(chunkSize)
+ if chunkStart+chunkSize > fr.End {
+ endOff = fr.End - chunkStart
+ }
+ fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff))
+ }
+ return nil
+}
+
+// UnmapAll unmaps all cached mappings. Callers are responsible for
+// synchronization with mappings returned by previous calls to MapInternal.
+func (f *HostFileMapper) UnmapAll() {
+ f.mapsMu.Lock()
+ defer f.mapsMu.Unlock()
+ for chunkStart, m := range f.mappings {
+ f.unmapAndRemoveLocked(chunkStart, m)
+ }
+}
+
+// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m.
+func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) {
+ if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 {
+ // This leaks address space and is unexpected, but is otherwise
+ // harmless, so complain but don't panic.
+ log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno)
+ }
+ delete(f.mappings, chunkStart)
+}
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go
new file mode 100644
index 000000000..57705decd
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go
@@ -0,0 +1,20 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+// afterLoad is invoked by stateify.
+func (f *HostFileMapper) afterLoad() {
+ f.mappings = make(map[uint64]mapping)
+}
diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go
new file mode 100644
index 000000000..790f3a5a6
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go
@@ -0,0 +1,27 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "unsafe"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+)
+
+func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block {
+ // We don't control the host file's length, so touching its mappings may
+ // raise SIGBUS. Thus accesses to it must use safecopy.
+ return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize)
+}
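+
+// Because the Block is constructed from an unsafe pointer, safemem routes
+// copies through safecopy, so a host fault (e.g. SIGBUS on an access past
+// EOF) surfaces as an error rather than crashing the sentry. Illustrative
+// use, given such a block b:
+//
+//	n, err := safemem.CopySeq(dsts, safemem.BlockSeqOf(b)) // err reflects any host fault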
diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go
new file mode 100644
index 000000000..e1ad07df2
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/inode.go
@@ -0,0 +1,380 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+ "gvisor.googlesource.com/gvisor/pkg/syserror"
+ "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix"
+ "gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes.
+func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations {
+ return &simpleInodeOperations{InodeSimpleAttributes: i}
+}
+
+// simpleInodeOperations is a simple implementation of Inode.
+type simpleInodeOperations struct {
+ DeprecatedFileOperations `state:"nosave"`
+ InodeNotDirectory `state:"nosave"`
+ InodeNotSocket `state:"nosave"`
+ InodeNotRenameable `state:"nosave"`
+ InodeNotOpenable `state:"nosave"`
+ InodeNotVirtual `state:"nosave"`
+ InodeNotSymlink `state:"nosave"`
+ InodeNoExtendedAttributes `state:"nosave"`
+ NoMappable `state:"nosave"`
+ NoopWriteOut `state:"nosave"`
+
+ InodeSimpleAttributes
+}
+
+// InodeSimpleAttributes implements a subset of fs.InodeOperations. It
+// provides read-only access to attributes.
+type InodeSimpleAttributes struct {
+ // FSType is the filesystem type reported by StatFS.
+ FSType uint64
+
+ // UAttr are the unstable attributes of the Inode.
+ UAttr fs.UnstableAttr
+}
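+
+// A minimal read-only pseudo-file inode might be constructed like this
+// sketch (the magic number and permissions are arbitrary):
+//
+//	iops := NewSimpleInodeOperations(InodeSimpleAttributes{
+//		FSType: 0x01021994, // e.g. tmpfs magic
+//		UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{
+//			Perms: fs.FilePermsFromMode(0444),
+//			Links: 1,
+//		}),
+//	})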
+
+// Release implements fs.InodeOperations.Release.
+func (i *InodeSimpleAttributes) Release(context.Context) {}
+
+// StatFS implements fs.InodeOperations.StatFS.
+func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) {
+ return fs.Info{Type: i.FSType}, nil
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) {
+ return i.UAttr, nil
+}
+
+// Check implements fs.InodeOperations.Check.
+func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+ return fs.ContextCanAccessFile(ctx, inode, p)
+}
+
+// AddLink implements fs.InodeOperations.AddLink.
+func (*InodeSimpleAttributes) AddLink() {}
+
+// DropLink implements fs.InodeOperations.DropLink.
+func (*InodeSimpleAttributes) DropLink() {}
+
+// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
+func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) {
+ i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx)
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool {
+ return false
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error {
+ return syserror.EINVAL
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error {
+ return syserror.EINVAL
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error {
+ return syserror.EINVAL
+}
+
+// InMemoryAttributes implements utilities for updating in-memory unstable
+// attributes and extended attributes. It is not thread-safe.
+//
+// Users need not initialize Xattrs to non-nil (it will be initialized
+// when the first extended attribute is set).
+type InMemoryAttributes struct {
+ Unstable fs.UnstableAttr
+ Xattrs map[string][]byte
+}
+
+// SetPermissions updates the permissions to p.
+func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool {
+ i.Unstable.Perms = p
+ i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx)
+ return true
+}
+
+// SetOwner updates the file owner to owner.
+func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error {
+ if owner.UID.Ok() {
+ i.Unstable.Owner.UID = owner.UID
+ }
+ if owner.GID.Ok() {
+ i.Unstable.Owner.GID = owner.GID
+ }
+ return nil
+}
+
+// SetTimestamps sets the timestamps to ts.
+func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error {
+ if ts.ATimeOmit && ts.MTimeOmit {
+ return nil
+ }
+
+ now := ktime.NowFromContext(ctx)
+ if !ts.ATimeOmit {
+ if ts.ATimeSetSystemTime {
+ i.Unstable.AccessTime = now
+ } else {
+ i.Unstable.AccessTime = ts.ATime
+ }
+ }
+ if !ts.MTimeOmit {
+ if ts.MTimeSetSystemTime {
+ i.Unstable.ModificationTime = now
+ } else {
+ i.Unstable.ModificationTime = ts.MTime
+ }
+ }
+ i.Unstable.StatusChangeTime = now
+ return nil
+}
+
+// TouchAccessTime updates access time to the current time.
+func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) {
+ i.Unstable.AccessTime = ktime.NowFromContext(ctx)
+}
+
+// TouchModificationTime updates modification and status change
+// time to the current time.
+func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) {
+ now := ktime.NowFromContext(ctx)
+ i.Unstable.ModificationTime = now
+ i.Unstable.StatusChangeTime = now
+}
+
+// TouchStatusChangeTime updates status change time to the current time.
+func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) {
+ i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx)
+}
+
+// Getxattr returns the extended attribute at name or ENOATTR if
+// it isn't set.
+func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) {
+ if value, ok := i.Xattrs[name]; ok {
+ return value, nil
+ }
+ return nil, syserror.ENOATTR
+}
+
+// Setxattr sets the extended attribute at name to value.
+func (i *InMemoryAttributes) Setxattr(name string, value []byte) error {
+ if i.Xattrs == nil {
+ i.Xattrs = make(map[string][]byte)
+ }
+ i.Xattrs[name] = value
+ return nil
+}
+
+// Listxattr returns the set of all currently set extended attributes.
+func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) {
+ names := make(map[string]struct{}, len(i.Xattrs))
+ for name := range i.Xattrs {
+ names[name] = struct{}{}
+ }
+ return names, nil
+}
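+
+// Sketch of typical extended attribute usage (the name and value are
+// arbitrary):
+//
+//	var a InMemoryAttributes
+//	_ = a.Setxattr("user.tag", []byte("v1")) // lazily allocates a.Xattrs
+//	v, err := a.Getxattr("user.tag")         // v == []byte("v1"), err == nil
+//	_, err = a.Getxattr("user.none")         // err == syserror.ENOATTR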
+
+// NoMappable can be used by Inodes that do not support memory mapping.
+type NoMappable struct{}
+
+// Mappable implements fs.InodeOperations.Mappable.
+func (NoMappable) Mappable(*fs.Inode) memmap.Mappable {
+ return nil
+}
+
+// NoopWriteOut is a no-op implementation of fs.InodeOperations.WriteOut.
+type NoopWriteOut struct{}
+
+// WriteOut is a no-op.
+func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error {
+ return nil
+}
+
+// InodeNotDirectory can be used by Inodes that are not directories.
+type InodeNotDirectory struct{}
+
+// Lookup implements fs.InodeOperations.Lookup.
+func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) {
+ return nil, syserror.ENOTDIR
+}
+
+// Create implements fs.InodeOperations.Create.
+func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) {
+ return nil, syserror.ENOTDIR
+}
+
+// CreateLink implements fs.InodeOperations.CreateLink.
+func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error {
+ return syserror.ENOTDIR
+}
+
+// CreateHardLink implements fs.InodeOperations.CreateHardLink.
+func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error {
+ return syserror.ENOTDIR
+}
+
+// CreateDirectory implements fs.InodeOperations.CreateDirectory.
+func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+ return syserror.ENOTDIR
+}
+
+// Bind implements fs.InodeOperations.Bind.
+func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error {
+ return syserror.ENOTDIR
+}
+
+// CreateFifo implements fs.InodeOperations.CreateFifo.
+func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error {
+ return syserror.ENOTDIR
+}
+
+// Remove implements fs.InodeOperations.Remove.
+func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error {
+ return syserror.ENOTDIR
+}
+
+// RemoveDirectory implements fs.InodeOperations.RemoveDirectory.
+func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error {
+ return syserror.ENOTDIR
+}
+
+// InodeNotSocket can be used by Inodes that are not sockets.
+type InodeNotSocket struct{}
+
+// BoundEndpoint implements fs.InodeOperations.BoundEndpoint.
+func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint {
+ return nil
+}
+
+// InodeNotRenameable can be used by Inodes that cannot be renamed.
+type InodeNotRenameable struct{}
+
+// Rename implements fs.InodeOperations.Rename.
+func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error {
+ return syserror.EINVAL
+}
+
+// InodeNotOpenable can be used by Inodes that cannot be opened.
+type InodeNotOpenable struct{}
+
+// GetFile implements fs.InodeOperations.GetFile.
+func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) {
+ return nil, syserror.EIO
+}
+
+// InodeNotVirtual can be used by Inodes that are not virtual.
+type InodeNotVirtual struct{}
+
+// IsVirtual implements fs.InodeOperations.IsVirtual.
+func (InodeNotVirtual) IsVirtual() bool {
+ return false
+}
+
+// InodeNotSymlink can be used by Inodes that are not symlinks.
+type InodeNotSymlink struct{}
+
+// Readlink implements fs.InodeOperations.Readlink.
+func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) {
+ return "", syserror.ENOLINK
+}
+
+// Getlink implements fs.InodeOperations.Getlink.
+func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
+ return nil, syserror.ENOLINK
+}
+
+// InodeNoExtendedAttributes can be used by Inodes that do not support
+// extended attributes.
+type InodeNoExtendedAttributes struct{}
+
+// Getxattr implements fs.InodeOperations.Getxattr.
+func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) ([]byte, error) {
+ return nil, syserror.EOPNOTSUPP
+}
+
+// Setxattr implements fs.InodeOperations.Setxattr.
+func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, []byte) error {
+ return syserror.EOPNOTSUPP
+}
+
+// Listxattr implements fs.InodeOperations.Listxattr.
+func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) {
+ return nil, syserror.EOPNOTSUPP
+}
+
+// DeprecatedFileOperations panics if any deprecated Inode method is called.
+type DeprecatedFileOperations struct{}
+
+// Readiness implements fs.InodeOperations.Waitable.Readiness.
+func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask {
+ panic("not implemented")
+}
+
+// EventRegister implements fs.InodeOperations.Waitable.EventRegister.
+func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) {
+ panic("not implemented")
+}
+
+// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister.
+func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) {
+ panic("not implemented")
+}
+
+// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv.
+func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) {
+ panic("not implemented")
+}
+
+// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev.
+func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) {
+ panic("not implemented")
+}
+
+// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir.
+func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) {
+ panic("not implemented")
+}
+
+// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync.
+func (DeprecatedFileOperations) DeprecatedFsync() error {
+ panic("not implemented")
+}
+
+// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush.
+func (DeprecatedFileOperations) DeprecatedFlush() error {
+ panic("not implemented")
+}
+
+// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable.
+func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) {
+ panic("not implemented")
+}
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
new file mode 100644
index 000000000..484668735
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -0,0 +1,845 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "fmt"
+ "io"
+ "sync"
+
+ "gvisor.googlesource.com/gvisor/pkg/log"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/platform"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usage"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+// Lock order (compare the lock order model in mm/mm.go):
+//
+// CachingInodeOperations.attrMu ("fs locks")
+// CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate")
+// CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate")
+// CachedFileObject locks
+
+// CachingInodeOperations caches the metadata and content of a CachedFileObject.
+// It implements a subset of InodeOperations. As a utility it can be used to
+// implement the full set of InodeOperations. Generally it should not be
+// embedded to avoid unexpected inherited behavior.
+//
+// CachingInodeOperations implements Mappable for the CachedFileObject:
+//
+// - If CachedFileObject.FD returns a value >= 0 and the current platform shares
+// a host fd table with the sentry, then the value of CachedFileObject.FD
+// will be memory mapped on the host.
+//
+// - Otherwise, the contents of CachedFileObject are buffered into memory
+// managed by the CachingInodeOperations.
+//
+// Implementations of FileOperations for a CachedFileObject must read and
+// write through CachingInodeOperations using Read and Write respectively.
+//
+// Implementations of InodeOperations.WriteOut must call Sync to write out
+// in-memory modifications of data and metadata to the CachedFileObject.
+type CachingInodeOperations struct {
+ // backingFile is a handle to a cached file object.
+ backingFile CachedFileObject
+
+ // platform is used to allocate memory that caches backingFile's contents.
+ platform platform.Platform
+
+ // forcePageCache indicates the sentry page cache should be used regardless
+ // of whether the platform supports host mapped I/O or not. This must not be
+ // modified after inode creation.
+ forcePageCache bool
+
+ attrMu sync.Mutex `state:"nosave"`
+
+ // attr is unstable cached metadata.
+ //
+ // attr is protected by attrMu. attr.Size is protected by both attrMu and
+ // dataMu; reading it requires locking either mutex, while mutating it
+ // requires locking both.
+ attr fs.UnstableAttr
+
+ // dirtyAttr is metadata that was updated in-place but hasn't yet
+ // been successfully written out.
+ //
+ // dirtyAttr is protected by attrMu.
+ dirtyAttr fs.AttrMask
+
+ mapsMu sync.Mutex `state:"nosave"`
+
+ // mappings tracks mappings of the cached file object into
+ // memmap.MappingSpaces.
+ //
+ // mappings is protected by mapsMu.
+ mappings memmap.MappingSet
+
+ dataMu sync.RWMutex `state:"nosave"`
+
+ // cache maps offsets into the cached file to offsets into
+ // platform.Memory() that store the file's data.
+ //
+ // cache is protected by dataMu.
+ cache FileRangeSet
+
+ // dirty tracks dirty segments in cache.
+ //
+ // dirty is protected by dataMu.
+ dirty DirtySet
+
+ // hostFileMapper caches internal mappings of backingFile.FD().
+ hostFileMapper *HostFileMapper
+
+ // refs tracks active references to data in the cache.
+ //
+ // refs is protected by dataMu.
+ refs frameRefSet
+}
+
+// CachedFileObject is a file that may require caching.
+type CachedFileObject interface {
+ // ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
+ // starting at offset, and returns the number of bytes read. ReadToBlocksAt
+ // may return a partial read without an error.
+ ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)
+
+ // WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
+ // file, starting at offset, and returns the number of bytes written.
+ // WriteFromBlocksAt may return a partial write without an error.
+ WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)
+
+ // SetMaskedAttributes sets the attributes in attr that are true in mask
+ // on the backing file.
+ //
+ // SetMaskedAttributes may be called at any point, regardless of whether
+ // the file was opened.
+ SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error
+
+ // Sync instructs the remote filesystem to sync the file to stable storage.
+ Sync(ctx context.Context) error
+
+	// FD returns a host file descriptor. Whether the returned value is -1
+	// or a valid host fd must not change for the lifetime of the
+	// CachedFileObject.
+ //
+ // FD is called iff the file has been memory mapped. This implies that
+ // the file was opened (see fs.InodeOperations.GetFile).
+ //
+ // FIXME: This interface seems to be
+	// fundamentally broken. We should clarify CachingInodeOperations'
+ // behavior with metadata.
+ FD() int
+}
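+
+// A trivial CachedFileObject that reads zeroes and discards writes might look
+// like the following sketch (compare noopBackingFile in inode_cached_test.go):
+//
+//	type zeroFileObject struct{}
+//
+//	func (zeroFileObject) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, off uint64) (uint64, error) {
+//		return safemem.ZeroSeq(dsts)
+//	}
+//
+//	func (zeroFileObject) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, off uint64) (uint64, error) {
+//		return srcs.NumBytes(), nil
+//	}
+//
+//	func (zeroFileObject) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error {
+//		return nil
+//	}
+//
+//	func (zeroFileObject) Sync(context.Context) error { return nil }
+//	func (zeroFileObject) FD() int                    { return -1 }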
+
+// NewCachingInodeOperations returns a new CachingInodeOperations backed by
+// a CachedFileObject and its initial unstable attributes.
+func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations {
+ p := platform.FromContext(ctx)
+ if p == nil {
+ panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform))
+ }
+ return &CachingInodeOperations{
+ backingFile: backingFile,
+ platform: p,
+ forcePageCache: forcePageCache,
+ attr: uattr,
+ hostFileMapper: NewHostFileMapper(),
+ }
+}
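+
+// Construction then looks like the following sketch (backingFile is any
+// CachedFileObject, e.g. the zeroFileObject sketched above):
+//
+//	uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
+//		Perms: fs.FilePermsFromMode(0644),
+//	})
+//	iops := NewCachingInodeOperations(ctx, backingFile, uattr, false /*forcePageCache*/)
+//	defer iops.Release()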
+
+// Release implements fs.InodeOperations.Release.
+func (c *CachingInodeOperations) Release() {
+ c.mapsMu.Lock()
+ defer c.mapsMu.Unlock()
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+ // The cache should be empty (something has gone terribly wrong if we're
+ // releasing an inode that is still memory-mapped).
+ if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() {
+ panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty))
+ }
+}
+
+// UnstableAttr implements fs.InodeOperations.UnstableAttr.
+func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+ return c.attr, nil
+}
+
+// SetPermissions implements fs.InodeOperations.SetPermissions.
+func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+
+ masked := fs.AttrMask{Perms: true}
+ if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil {
+ return false
+ }
+ c.attr.Perms = perms
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.Perms = true
+ c.touchStatusChangeTimeLocked(ctx)
+ return true
+}
+
+// SetOwner implements fs.InodeOperations.SetOwner.
+func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
+ if !owner.UID.Ok() && !owner.GID.Ok() {
+ return nil
+ }
+
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+
+ masked := fs.AttrMask{
+ UID: owner.UID.Ok(),
+ GID: owner.GID.Ok(),
+ }
+ if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil {
+ return err
+ }
+ if owner.UID.Ok() {
+ c.attr.Owner.UID = owner.UID
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.UID = true
+ }
+ if owner.GID.Ok() {
+ c.attr.Owner.GID = owner.GID
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.GID = true
+ }
+ c.touchStatusChangeTimeLocked(ctx)
+ return nil
+}
+
+// SetTimestamps implements fs.InodeOperations.SetTimestamps.
+func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
+ if ts.ATimeOmit && ts.MTimeOmit {
+ return nil
+ }
+
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+
+ // Replace requests to use the "system time" with the current time to
+ // ensure that cached timestamps remain consistent with the remote
+ // filesystem.
+ now := ktime.NowFromContext(ctx)
+ if ts.ATimeSetSystemTime {
+ ts.ATime = now
+ }
+ if ts.MTimeSetSystemTime {
+ ts.MTime = now
+ }
+ masked := fs.AttrMask{
+ AccessTime: !ts.ATimeOmit,
+ ModificationTime: !ts.MTimeOmit,
+ }
+ if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil {
+ return err
+ }
+ if !ts.ATimeOmit {
+ c.attr.AccessTime = ts.ATime
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.AccessTime = true
+ }
+ if !ts.MTimeOmit {
+ c.attr.ModificationTime = ts.MTime
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.ModificationTime = true
+ }
+ c.touchStatusChangeTimeLocked(ctx)
+ return nil
+}
+
+// Truncate implements fs.InodeOperations.Truncate.
+func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+
+ // c.attr.Size is protected by both c.attrMu and c.dataMu.
+ c.dataMu.Lock()
+ if err := c.backingFile.SetMaskedAttributes(ctx, fs.AttrMask{
+ Size: true,
+ }, fs.UnstableAttr{
+ Size: size,
+ }); err != nil {
+ c.dataMu.Unlock()
+ return err
+ }
+ oldSize := c.attr.Size
+ if oldSize != size {
+ c.attr.Size = size
+ // FIXME: Clarify CachingInodeOperations behavior with metadata.
+ c.dirtyAttr.Size = true
+ c.touchModificationTimeLocked(ctx)
+ }
+ // We drop c.dataMu here so that we can lock c.mapsMu and invalidate
+ // mappings below. This allows concurrent calls to Read/Translate/etc.
+ // These functions synchronize with an in-progress Truncate by refusing to
+ // use cache contents beyond the new c.attr.Size. (We are still holding
+ // c.attrMu, so we can't race with Truncate/Write.)
+ c.dataMu.Unlock()
+
+ // Nothing left to do unless shrinking the file.
+ if size >= oldSize {
+ return nil
+ }
+
+ oldpgend := fs.OffsetPageEnd(oldSize)
+ newpgend := fs.OffsetPageEnd(size)
+
+ // Invalidate past translations of truncated pages.
+ if newpgend != oldpgend {
+ c.mapsMu.Lock()
+ c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+ // Compare Linux's mm/truncate.c:truncate_setsize() =>
+ // truncate_pagecache() =>
+ // mm/memory.c:unmap_mapping_range(evencows=1).
+ InvalidatePrivate: true,
+ })
+ c.mapsMu.Unlock()
+ }
+
+ // We are now guaranteed that there are no translations of truncated pages,
+ // and can remove them from the cache. Since truncated pages have been
+ // removed from the backing file, they should be dropped without being
+ // written back.
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+ c.cache.Truncate(uint64(size), c.platform.Memory())
+ c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})
+
+ return nil
+}
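+
+// Worked example: shrinking a 5000-byte file to 100 bytes (with 4KB pages)
+// gives oldpgend == 8192 and newpgend == 4096, so mappings of [4096, 8192)
+// are invalidated, the cache is truncated at offset 100, and [100, 8192) is
+// marked clean so the discarded tail is never written back.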
+
+// WriteOut implements fs.InodeOperations.WriteOut.
+func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
+ c.attrMu.Lock()
+
+ // Write dirty pages back.
+ c.dataMu.RLock()
+ err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt)
+ c.dataMu.RUnlock()
+ if err != nil {
+ c.attrMu.Unlock()
+ return err
+ }
+
+ // Write out cached attributes.
+ if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil {
+ c.attrMu.Unlock()
+ return err
+ }
+ c.dirtyAttr = fs.AttrMask{}
+
+ c.attrMu.Unlock()
+
+ // Fsync the remote file.
+ return c.backingFile.Sync(ctx)
+}
+
+// IncLinks increases the link count and updates cached modification and
+// status change time.
+func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
+ c.attrMu.Lock()
+ c.attr.Links++
+ c.touchModificationTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// DecLinks decreases the link count and updates cached modification and
+// status change time.
+func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
+ c.attrMu.Lock()
+ c.attr.Links--
+ c.touchModificationTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// TouchAccessTime updates the cached access time in-place to the
+// current time. It does not update status change time in-place. See
+// mm/filemap.c:do_generic_file_read() => include/linux/fs.h:file_accessed().
+func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
+ if inode.MountSource.Flags.NoAtime {
+ return
+ }
+
+ c.attrMu.Lock()
+ c.touchAccessTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// touchAccessTimeLocked updates the cached access time in-place to the current
+// time.
+//
+// Preconditions: c.attrMu is locked for writing.
+func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) {
+ c.attr.AccessTime = ktime.NowFromContext(ctx)
+ c.dirtyAttr.AccessTime = true
+}
+
+// TouchModificationTime updates the cached modification and status change time
+// in-place to the current time.
+func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) {
+ c.attrMu.Lock()
+ c.touchModificationTimeLocked(ctx)
+ c.attrMu.Unlock()
+}
+
+// touchModificationTimeLocked updates the cached modification and status
+// change time in-place to the current time.
+//
+// Preconditions: c.attrMu is locked for writing.
+func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context) {
+ now := ktime.NowFromContext(ctx)
+ c.attr.ModificationTime = now
+ c.dirtyAttr.ModificationTime = true
+ c.attr.StatusChangeTime = now
+ c.dirtyAttr.StatusChangeTime = true
+}
+
+// touchStatusChangeTimeLocked updates the cached status change time
+// in-place to the current time.
+//
+// Preconditions: c.attrMu is locked for writing.
+func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context) {
+ now := ktime.NowFromContext(ctx)
+ c.attr.StatusChangeTime = now
+ c.dirtyAttr.StatusChangeTime = true
+}
+
+// Read reads from cached frames where possible, and otherwise directly
+// from the backing file, into dst starting at offset, until dst is full,
+// EOF is reached, or an error is encountered.
+//
+// Read may partially fill dst and return a nil error.
+func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
+ if dst.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ // Have we reached EOF? We check for this again in
+ // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would
+ // serialize reads) or c.dataMu (which would violate lock ordering), but
+ // check here first (before calling into MM) since reading at EOF is
+ // common: getting a return value of 0 from a read syscall is the only way
+ // to detect EOF.
+ //
+ // TODO: Separate out c.attr.Size and use atomics instead of
+ // c.dataMu.
+ c.dataMu.RLock()
+ size := c.attr.Size
+ c.dataMu.RUnlock()
+ if offset >= size {
+ return 0, io.EOF
+ }
+
+ n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset})
+ // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+ c.TouchAccessTime(ctx, file.Dirent.Inode)
+ return n, err
+}
+
+// Write writes to cached frames where possible, and otherwise directly
+// to the backing file, from src starting at offset, until src is empty
+// or an error is encountered.
+//
+// If Write only partially consumes src, it returns a non-nil error.
+func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+ if src.NumBytes() == 0 {
+ return 0, nil
+ }
+
+ c.attrMu.Lock()
+ defer c.attrMu.Unlock()
+ // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time().
+ c.touchModificationTimeLocked(ctx)
+ return src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset})
+}
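+
+// A read/write round trip through these methods looks roughly like the
+// following sketch (iops, file, and ctx are assumed to be owned by the
+// caller):
+//
+//	src := usermem.BytesIOSequence([]byte("hello"))
+//	if _, err := iops.Write(ctx, src, 0); err != nil {
+//		return err
+//	}
+//	rbuf := make([]byte, 5)
+//	n, err := iops.Read(ctx, file, usermem.BytesIOSequence(rbuf), 0)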
+
+type inodeReadWriter struct {
+ ctx context.Context
+ c *CachingInodeOperations
+ offset int64
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ rw.c.dataMu.RLock()
+ defer rw.c.dataMu.RUnlock()
+
+ // Compute the range to read.
+ if rw.offset >= rw.c.attr.Size {
+ return 0, io.EOF
+ }
+ end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
+ if end == rw.offset { // dsts.NumBytes() == 0?
+ return 0, nil
+ }
+
+ mem := rw.c.platform.Memory()
+ var done uint64
+ seg, gap := rw.c.cache.Find(uint64(rw.offset))
+ for rw.offset < end {
+ mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
+ switch {
+ case seg.Ok():
+ // Get internal mappings from the cache.
+ ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+ if err != nil {
+ return done, err
+ }
+
+ // Copy from internal mappings.
+ n, err := safemem.CopySeq(dsts, ims)
+ done += n
+ rw.offset += int64(n)
+ dsts = dsts.DropFirst64(n)
+ if err != nil {
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok():
+ // Read directly from the backing file.
+ gapmr := gap.Range().Intersect(mr)
+ dst := dsts.TakeFirst64(gapmr.Length())
+ n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start)
+ done += n
+ rw.offset += int64(n)
+ dsts = dsts.DropFirst64(n)
+ // Partial reads are fine. But we must stop reading.
+ if n != dst.NumBytes() || err != nil {
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), FileRangeGapIterator{}
+
+		default:
+			// Neither a segment nor a gap overlaps the remaining range;
+			// there is nothing left to read.
+			return done, nil
+ }
+ }
+ return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: rw.c.attrMu must be locked.
+func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+ rw.c.dataMu.Lock()
+ defer rw.c.dataMu.Unlock()
+
+ // Compute the range to write.
+ end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
+ if end == rw.offset { // srcs.NumBytes() == 0?
+ return 0, nil
+ }
+
+ defer func() {
+ // If the write ends beyond the file's previous size, it causes the
+ // file to grow.
+ if rw.offset > rw.c.attr.Size {
+ rw.c.attr.Size = rw.offset
+ rw.c.dirtyAttr.Size = true
+ }
+ if rw.offset > rw.c.attr.Usage {
+ // This is incorrect if CachingInodeOperations is caching a sparse
+ // file. (In Linux, keeping inode::i_blocks up to date is the
+ // filesystem's responsibility.)
+ rw.c.attr.Usage = rw.offset
+ rw.c.dirtyAttr.Usage = true
+ }
+ }()
+
+ mem := rw.c.platform.Memory()
+ var done uint64
+ seg, gap := rw.c.cache.Find(uint64(rw.offset))
+ for rw.offset < end {
+ mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
+ switch {
+ case seg.Ok() && seg.Start() < mr.End:
+ // Get internal mappings from the cache.
+ segMR := seg.Range().Intersect(mr)
+ ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+ if err != nil {
+ return done, err
+ }
+
+ // Copy to internal mappings.
+ n, err := safemem.CopySeq(ims, srcs)
+ done += n
+ rw.offset += int64(n)
+ srcs = srcs.DropFirst64(n)
+ rw.c.dirty.MarkDirty(segMR)
+ if err != nil {
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = seg.NextNonEmpty()
+
+ case gap.Ok() && gap.Start() < mr.End:
+ // Write directly to the backing file.
+ gapmr := gap.Range().Intersect(mr)
+ src := srcs.TakeFirst64(gapmr.Length())
+ n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
+ done += n
+ rw.offset += int64(n)
+ srcs = srcs.DropFirst64(n)
+ // Partial writes are fine. But we must stop writing.
+ if n != src.NumBytes() || err != nil {
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), FileRangeGapIterator{}
+
+		default:
+			// Neither a segment nor a gap overlaps the remaining range;
+			// there is nothing left to write.
+			return done, nil
+ }
+ }
+ return done, nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error {
+ c.mapsMu.Lock()
+ defer c.mapsMu.Unlock()
+ mapped := c.mappings.AddMapping(ms, ar, offset)
+ // Do this unconditionally since whether we have c.backingFile.FD() >= 0
+ // can change across save/restore.
+ for _, r := range mapped {
+ c.hostFileMapper.IncRefOn(r)
+ }
+ if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 {
+ for _, r := range mapped {
+ usage.MemoryAccounting.Inc(r.Length(), usage.Mapped)
+ }
+ }
+ return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) {
+ c.mapsMu.Lock()
+ defer c.mapsMu.Unlock()
+ unmapped := c.mappings.RemoveMapping(ms, ar, offset)
+ for _, r := range unmapped {
+ c.hostFileMapper.DecRefOn(r)
+ }
+ if !c.forcePageCache && c.backingFile.FD() >= 0 {
+ if !usage.IncrementalMappedAccounting {
+ for _, r := range unmapped {
+ usage.MemoryAccounting.Dec(r.Length(), usage.Mapped)
+ }
+ }
+ return
+ }
+
+ // Writeback dirty mapped memory now that there are no longer any
+ // mappings that reference it. This is our naive memory eviction
+ // strategy.
+ mem := c.platform.Memory()
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+ for _, r := range unmapped {
+		if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), mem, c.backingFile.WriteFromBlocksAt); err != nil {
+ log.Warningf("Failed to writeback cached data %v: %v", r, err)
+ }
+ c.cache.Drop(r, mem)
+ c.dirty.KeepClean(r)
+ }
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error {
+ return c.AddMapping(ctx, ms, dstAR, offset)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+ if !c.forcePageCache && c.backingFile.FD() >= 0 {
+ return []memmap.Translation{
+ {
+ Source: optional,
+ File: c,
+ Offset: optional.Start,
+ },
+ }, nil
+ }
+
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+
+ // Constrain translations to c.attr.Size (rounded up) to prevent
+ // translation to pages that may be concurrently truncated.
+ pgend := fs.OffsetPageEnd(c.attr.Size)
+ var beyondEOF bool
+ if required.End > pgend {
+ if required.Start >= pgend {
+ return nil, &memmap.BusError{io.EOF}
+ }
+ beyondEOF = true
+ required.End = pgend
+ }
+ if optional.End > pgend {
+ optional.End = pgend
+ }
+
+ mem := c.platform.Memory()
+ cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt)
+
+ var ts []memmap.Translation
+ var translatedEnd uint64
+ for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+ segMR := seg.Range().Intersect(optional)
+ ts = append(ts, memmap.Translation{
+ Source: segMR,
+ File: mem,
+ Offset: seg.FileRangeOf(segMR).Start,
+ })
+ if at.Write {
+ // From this point forward, this memory can be dirtied through the
+ // mapping at any time.
+ c.dirty.KeepDirty(segMR)
+ }
+ translatedEnd = segMR.End
+ }
+
+ // Don't return the error returned by c.cache.Fill if it occurred outside
+ // of required.
+ if translatedEnd < required.End && cerr != nil {
+ return ts, &memmap.BusError{cerr}
+ }
+ if beyondEOF {
+ return ts, &memmap.BusError{io.EOF}
+ }
+ return ts, nil
+}
+
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+ const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+ if required.Length() >= maxReadahead {
+ return required
+ }
+ if optional.Length() <= maxReadahead {
+ return optional
+ }
+ optional.Start = required.Start
+ if optional.Length() <= maxReadahead {
+ return optional
+ }
+ optional.End = optional.Start + maxReadahead
+ return optional
+}
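+
+// For example, with required = [0, 4096) and optional = [0, 1<<20),
+// maxFillRange returns [0, 65536): the fill is expanded up to 64KB of
+// readahead, but never beyond what the caller declared optional.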
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
+ // Whether we have a host fd (and consequently what platform.File is
+ // mapped) can change across save/restore, so invalidate all translations
+ // unconditionally.
+ c.mapsMu.Lock()
+ defer c.mapsMu.Unlock()
+ c.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+ // Sync the cache's contents so that if we have a host fd after restore,
+ // the remote file's contents are coherent.
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+ if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil {
+ return err
+ }
+
+ // Discard the cache so that it's not stored in saved state. This is safe
+ // because per InvalidateUnsavable invariants, no new translations can have
+ // been returned after we invalidated all existing translations above.
+ c.cache.DropAll(c.platform.Memory())
+ c.dirty.RemoveAll()
+
+ return nil
+}
+
+// MapInto implements platform.File.MapInto. This is used when we directly map
+// an underlying host fd and CachingInodeOperations is used as the platform.File
+// during translation.
+func (c *CachingInodeOperations) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error {
+ return as.MapFile(addr, c.backingFile.FD(), fr, at, precommit)
+}
+
+// MapInternal implements platform.File.MapInternal. This is used when we
+// directly map an underlying host fd and CachingInodeOperations is used as the
+// platform.File during translation.
+func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+ return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
+}
+
+// IncRef implements platform.File.IncRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the platform.File
+// during translation.
+func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+
+ seg, gap := c.refs.Find(fr.Start)
+ for {
+ switch {
+ case seg.Ok() && seg.Start() < fr.End:
+ seg = c.refs.Isolate(seg, fr)
+ seg.SetValue(seg.Value() + 1)
+ seg, gap = seg.NextNonEmpty()
+ case gap.Ok() && gap.Start() < fr.End:
+ newRange := gap.Range().Intersect(fr)
+ if usage.IncrementalMappedAccounting {
+ usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
+ }
+ seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
+ default:
+ c.refs.MergeAdjacent(fr)
+ return
+ }
+ }
+}
+
+// DecRef implements platform.File.DecRef. This is used when we directly map an
+// underlying host fd and CachingInodeOperations is used as the platform.File
+// during translation.
+func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
+ c.dataMu.Lock()
+ defer c.dataMu.Unlock()
+
+ seg := c.refs.FindSegment(fr.Start)
+
+ for seg.Ok() && seg.Start() < fr.End {
+ seg = c.refs.Isolate(seg, fr)
+ if old := seg.Value(); old == 1 {
+ if usage.IncrementalMappedAccounting {
+ usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
+ }
+ seg = c.refs.Remove(seg).NextSegment()
+ } else {
+ seg.SetValue(old - 1)
+ seg = seg.NextSegment()
+ }
+ }
+ c.refs.MergeAdjacent(fr)
+}
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
new file mode 100644
index 000000000..996c91849
--- /dev/null
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -0,0 +1,403 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fsutil
+
+import (
+ "bytes"
+ "io"
+ "reflect"
+ "testing"
+
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+ ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
+ "gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+)
+
+type noopBackingFile struct{}
+
+func (noopBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+ return dsts.NumBytes(), nil
+}
+
+func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+ return srcs.NumBytes(), nil
+}
+
+func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error {
+ return nil
+}
+
+func (noopBackingFile) Sync(context.Context) error {
+ return nil
+}
+
+func (noopBackingFile) FD() int {
+ return -1
+}
+
+func TestSetPermissions(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
+ Perms: fs.FilePermsFromMode(0444),
+ })
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ defer iops.Release()
+
+ perms := fs.FilePermsFromMode(0777)
+ if !iops.SetPermissions(ctx, nil, perms) {
+ t.Fatalf("SetPermissions failed, want success")
+ }
+
+ // Did permissions change?
+ if !iops.dirtyAttr.Perms {
+ t.Fatalf("got perms not dirty, want dirty")
+ }
+ if iops.attr.Perms != perms {
+		t.Fatalf("got perms %+v, want %+v", iops.attr.Perms, perms)
+ }
+
+ // Did status change time change?
+ if !iops.dirtyAttr.StatusChangeTime {
+ t.Fatalf("got status change time not dirty, want dirty")
+ }
+ if iops.attr.StatusChangeTime.Equal(uattr.StatusChangeTime) {
+ t.Fatalf("got status change time unchanged")
+ }
+}
+
+func TestSetTimestamps(t *testing.T) {
+ ctx := contexttest.Context(t)
+ for _, test := range []struct {
+ desc string
+ ts fs.TimeSpec
+ wantDirty fs.AttrMask
+ }{
+ {
+ desc: "noop",
+ ts: fs.TimeSpec{
+ ATimeOmit: true,
+ MTimeOmit: true,
+ },
+ wantDirty: fs.AttrMask{},
+ },
+ {
+ desc: "access time only",
+ ts: fs.TimeSpec{
+ ATime: ktime.NowFromContext(ctx),
+ MTimeOmit: true,
+ },
+ wantDirty: fs.AttrMask{
+ AccessTime: true,
+ StatusChangeTime: true,
+ },
+ },
+ {
+ desc: "modification time only",
+ ts: fs.TimeSpec{
+ ATimeOmit: true,
+ MTime: ktime.NowFromContext(ctx),
+ },
+ wantDirty: fs.AttrMask{
+ ModificationTime: true,
+ StatusChangeTime: true,
+ },
+ },
+ {
+ desc: "access and modification time",
+ ts: fs.TimeSpec{
+ ATime: ktime.NowFromContext(ctx),
+ MTime: ktime.NowFromContext(ctx),
+ },
+ wantDirty: fs.AttrMask{
+ AccessTime: true,
+ ModificationTime: true,
+ StatusChangeTime: true,
+ },
+ },
+ {
+ desc: "system time access and modification time",
+ ts: fs.TimeSpec{
+ ATimeSetSystemTime: true,
+ MTimeSetSystemTime: true,
+ },
+ wantDirty: fs.AttrMask{
+ AccessTime: true,
+ ModificationTime: true,
+ StatusChangeTime: true,
+ },
+ },
+ } {
+ t.Run(test.desc, func(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ epoch := ktime.ZeroTime
+ uattr := fs.UnstableAttr{
+ AccessTime: epoch,
+ ModificationTime: epoch,
+ StatusChangeTime: epoch,
+ }
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ defer iops.Release()
+
+ if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil {
+ t.Fatalf("SetTimestamps got error %v, want nil", err)
+ }
+ if !reflect.DeepEqual(iops.dirtyAttr, test.wantDirty) {
+ t.Fatalf("dirty got %+v, want %+v", iops.dirtyAttr, test.wantDirty)
+ }
+ if iops.dirtyAttr.AccessTime {
+ if !iops.attr.AccessTime.After(uattr.AccessTime) {
+					t.Fatalf("dirtied access time did not advance, want %v > %v", iops.attr.AccessTime, uattr.AccessTime)
+ }
+ if !iops.dirtyAttr.StatusChangeTime {
+ t.Fatalf("dirty access time requires dirty status change time")
+ }
+ if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) {
+ t.Fatalf("dirtied status change time did not advance")
+ }
+ }
+ if iops.dirtyAttr.ModificationTime {
+ if !iops.attr.ModificationTime.After(uattr.ModificationTime) {
+					t.Fatalf("dirtied modification time did not advance")
+ }
+ if !iops.dirtyAttr.StatusChangeTime {
+ t.Fatalf("dirty modification time requires dirty status change time")
+ }
+ if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) {
+ t.Fatalf("dirtied status change time did not advance")
+ }
+ }
+ })
+ }
+}
+
+func TestTruncate(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ uattr := fs.UnstableAttr{
+ Size: 0,
+ }
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ defer iops.Release()
+
+ if err := iops.Truncate(ctx, nil, uattr.Size); err != nil {
+ t.Fatalf("Truncate got error %v, want nil", err)
+ }
+ if iops.dirtyAttr.Size {
+ t.Fatalf("Truncate caused size to be dirtied")
+ }
+ var size int64 = 4096
+ if err := iops.Truncate(ctx, nil, size); err != nil {
+ t.Fatalf("Truncate got error %v, want nil", err)
+ }
+ if !iops.dirtyAttr.Size {
+ t.Fatalf("Truncate caused size to not be dirtied")
+ }
+ if iops.attr.Size != size {
+ t.Fatalf("Truncate got %d, want %d", iops.attr.Size, size)
+ }
+ if !iops.dirtyAttr.ModificationTime || !iops.dirtyAttr.StatusChangeTime {
+ t.Fatalf("Truncate did not dirty modification and status change time")
+ }
+ if !iops.attr.ModificationTime.After(uattr.ModificationTime) {
+ t.Fatalf("dirtied modification time did not change")
+ }
+ if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) {
+ t.Fatalf("dirtied status change time did not change")
+ }
+}
+
+type sliceBackingFile struct {
+ data []byte
+}
+
+func newSliceBackingFile(data []byte) *sliceBackingFile {
+ return &sliceBackingFile{data}
+}
+
+func (f *sliceBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+ r := safemem.BlockSeqReader{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)}
+ return r.ReadToBlocks(dsts)
+}
+
+func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+ w := safemem.BlockSeqWriter{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)}
+ return w.WriteFromBlocks(srcs)
+}
+
+func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error {
+ return nil
+}
+
+func (*sliceBackingFile) Sync(context.Context) error {
+ return nil
+}
+
+func (*sliceBackingFile) FD() int {
+ return -1
+}
+
+type noopMappingSpace struct{}
+
+// Invalidate implements memmap.MappingSpace.Invalidate.
+func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) {
+}
+
+func anonInode(ctx context.Context) *fs.Inode {
+ return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{
+ UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{
+ Owner: fs.FileOwnerFromContext(ctx),
+ Perms: fs.FilePermissions{
+ User: fs.PermMask{Read: true, Write: true},
+ },
+ Links: 1,
+ }),
+ }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{
+ Type: fs.Anonymous,
+ BlockSize: usermem.PageSize,
+ })
+}
+
+func pagesOf(bs ...byte) []byte {
+ buf := make([]byte, 0, len(bs)*usermem.PageSize)
+ for _, b := range bs {
+ buf = append(buf, bytes.Repeat([]byte{b}, usermem.PageSize)...)
+ }
+ return buf
+}
+
+func TestRead(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ // Construct a 3-page file.
+ buf := pagesOf('a', 'b', 'c')
+ file := fs.NewFile(ctx, fs.NewDirent(anonInode(ctx), "anon"), fs.FileFlags{}, nil)
+ uattr := fs.UnstableAttr{
+ Size: int64(len(buf)),
+ }
+ iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+ defer iops.Release()
+
+ // Expect the cache to be initially empty.
+ if cached := iops.cache.Span(); cached != 0 {
+ t.Errorf("Span got %d, want 0", cached)
+ }
+
+ // Create a memory mapping of the second page (as CachingInodeOperations
+ // expects to only cache mapped pages), then call Translate to force it to
+ // be cached.
+ var ms noopMappingSpace
+ ar := usermem.AddrRange{usermem.PageSize, 2 * usermem.PageSize}
+ if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil {
+ t.Fatalf("AddMapping got %v, want nil", err)
+ }
+ mr := memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize}
+ if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil {
+ t.Fatalf("Translate got %v, want nil", err)
+ }
+ if cached := iops.cache.Span(); cached != usermem.PageSize {
+ t.Errorf("SpanRange got %d, want %d", cached, usermem.PageSize)
+ }
+
+ // Try to read 4 pages. The first and third pages should be read directly
+ // from the "file", the second page should be read from the cache, and only
+ // 3 pages (the size of the file) should be readable.
+ rbuf := make([]byte, 4*usermem.PageSize)
+ dst := usermem.BytesIOSequence(rbuf)
+ n, err := iops.Read(ctx, file, dst, 0)
+ if n != 3*usermem.PageSize || (err != nil && err != io.EOF) {
+ t.Fatalf("Read got (%d, %v), want (%d, nil or EOF)", n, err, 3*usermem.PageSize)
+ }
+ rbuf = rbuf[:3*usermem.PageSize]
+
+ // Did we get the bytes we expect?
+ if !bytes.Equal(rbuf, buf) {
+ t.Errorf("Read back bytes %v, want %v", rbuf, buf)
+ }
+
+ // Delete the memory mapping and expect it to cause the cached page to be
+ // uncached.
+ iops.RemoveMapping(ctx, ms, ar, usermem.PageSize)
+ if cached := iops.cache.Span(); cached != 0 {
+ t.Fatalf("Span got %d, want 0", cached)
+ }
+}
+
+func TestWrite(t *testing.T) {
+ ctx := contexttest.Context(t)
+
+ // Construct a 4-page file.
+ buf := pagesOf('a', 'b', 'c', 'd')
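+ // Keep a pristine copy of the original contents to build expectations from.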
+ orig := append([]byte(nil), buf...)
+ inode := anonInode(ctx)
+ uattr := fs.UnstableAttr{
+ Size: int64(len(buf)),
+ }
+ iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+ defer iops.Release()
+
+ // Expect the cache to be initially empty.
+ if cached := iops.cache.Span(); cached != 0 {
+ t.Errorf("Span got %d, want 0", cached)
+ }
+
+ // Create a memory mapping of the second and third pages (as
+ // CachingInodeOperations expects to only cache mapped pages), then call
+ // Translate to force them to be cached.
+ var ms noopMappingSpace
+ ar := usermem.AddrRange{usermem.PageSize, 3 * usermem.PageSize}
+ if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil {
+ t.Fatalf("AddMapping got %v, want nil", err)
+ }
+ defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize)
+ mr := memmap.MappableRange{usermem.PageSize, 3 * usermem.PageSize}
+ if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil {
+ t.Fatalf("Translate got %v, want nil", err)
+ }
+ if cached := iops.cache.Span(); cached != 2*usermem.PageSize {
+ t.Errorf("SpanRange got %d, want %d", cached, 2*usermem.PageSize)
+ }
+
+ // Write to the first 2 pages.
+ wbuf := pagesOf('e', 'f')
+ src := usermem.BytesIOSequence(wbuf)
+ n, err := iops.Write(ctx, src, 0)
+ if n != 2*usermem.PageSize || err != nil {
+ t.Fatalf("Write got (%d, %v), want (%d, nil)", n, err, 2*usermem.PageSize)
+ }
+
+ // The first page should have been written directly, since it was not cached.
+ want := append([]byte(nil), orig...)
+ copy(want, pagesOf('e'))
+ if !bytes.Equal(buf, want) {
+ t.Errorf("File contents are %v, want %v", buf, want)
+ }
+
+ // Sync back to the "backing file".
+ if err := iops.WriteOut(ctx, inode); err != nil {
+ t.Errorf("Sync got %v, want nil", err)
+ }
+
+ // Now the second page should have been written as well.
+ copy(want[usermem.PageSize:], pagesOf('f'))
+ if !bytes.Equal(buf, want) {
+ t.Errorf("File contents are %v, want %v", buf, want)
+ }
+}