diff options
Diffstat (limited to 'pkg/sentry/fs/fsutil')
-rw-r--r-- | pkg/sentry/fs/fsutil/BUILD | 149 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/README.md | 207 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/dirty_set.go | 213 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/dirty_set_test.go | 38 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/file.go | 267 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/file_range_set.go | 208 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/frame_ref_set.go | 50 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/fsutil.go | 26 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/handle.go | 126 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/handle_test.go | 227 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper.go | 209 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper_state.go | 20 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go | 27 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/inode.go | 380 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/inode_cached.go | 845 | ||||
-rw-r--r-- | pkg/sentry/fs/fsutil/inode_cached_test.go | 403 |
16 files changed, 3395 insertions, 0 deletions
diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD new file mode 100644 index 000000000..4fa6395f7 --- /dev/null +++ b/pkg/sentry/fs/fsutil/BUILD @@ -0,0 +1,149 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "fsutil_state", + srcs = [ + "dirty_set_impl.go", + "file.go", + "file_range_set_impl.go", + "frame_ref_set_impl.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "inode.go", + "inode_cached.go", + ], + out = "fsutil_state.go", + package = "fsutil", +) + +go_template_instance( + name = "dirty_set_impl", + out = "dirty_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "Dirty", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "DirtyInfo", + "Functions": "dirtySetFunctions", + }, +) + +go_template_instance( + name = "frame_ref_set_impl", + out = "frame_ref_set_impl.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "frameRef", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "uint64", + "Functions": "frameRefSetFunctions", + }, +) + +go_template_instance( + name = "file_range_set_impl", + out = "file_range_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "FileRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "uint64", + "Functions": "fileRangeSetFunctions", + }, +) + +go_library( + name = "fsutil", + srcs = [ + "dirty_set.go", + "dirty_set_impl.go", + "file.go", + "file_range_set.go", + "file_range_set_impl.go", + "frame_ref_set.go", + "frame_ref_set_impl.go", + "fsutil.go", + "fsutil_state.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "host_file_mapper_unsafe.go", + "inode.go", + "inode_cached.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "fsutil_x_test", + size = "small", + srcs = ["handle_test.go"], + deps = [ + ":fsutil", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/usermem", + ], +) + +go_test( + name = "fsutil_test", + size = "small", + srcs = [ + "dirty_set_test.go", + "inode_cached_test.go", + ], + embed = [":fsutil"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md new file mode 100644 index 000000000..d3780e9fa --- /dev/null +++ b/pkg/sentry/fs/fsutil/README.md @@ -0,0 +1,207 @@ +This package provides utilities for implementing virtual filesystem objects. + +[TOC] + +## Page cache + +`CachingInodeOperations` implements a page cache for files that cannot use the +host page cache. Normally these are files that store their data in a remote +filesystem. This also applies to files that are accessed on a platform that does +not support directly memory mapping host file descriptors (e.g. the ptrace +platform). + +An `CachingInodeOperations` buffers regions of a single file into memory. It is +owned by an `fs.Inode`, the in-memory representation of a file (all open file +descriptors are backed by an `fs.Inode`). The `fs.Inode` provides operations for +reading memory into an `CachingInodeOperations`, to represent the contents of +the file in-memory, and for writing memory out, to relieve memory pressure on +the kernel and to synchronize in-memory changes to filesystems. + +An `CachingInodeOperations` enables readable and/or writable memory access to +file content. Files can be mapped shared or private, see mmap(2). When a file is +mapped shared, changes to the file via write(2) and truncate(2) are reflected in +the shared memory region. Conversely, when the shared memory region is modified, +changes to the file are visible via read(2). Multiple shared mappings of the +same file are coherent with each other. This is consistent with Linux. + +When a file is mapped private, updates to the mapped memory are not visible to +other memory mappings. Updates to the mapped memory are also not reflected in +the file content as seen by read(2). If the file is changed after a private +mapping is created, for instance by write(2), the change to the file may or may +not be reflected in the private mapping. This is consistent with Linux. + +An `CachingInodeOperations` keeps track of ranges of memory that were modified +(or "dirtied"). When the file is explicitly synced via fsync(2), only the dirty +ranges are written out to the filesystem. Any error returned indicates a failure +to write all dirty memory of an `CachingInodeOperations` to the filesystem. In +this case the filesystem may be in an inconsistent state. The same operation can +be performed on the shared memory itself using msync(2). If neither fsync(2) nor +msync(2) is performed, then the dirty memory is written out in accordance with +the `CachingInodeOperations` eviction strategy (see below) and there is no +guarantee that memory will be written out successfully in full. + +### Memory allocation and eviction + +An `CachingInodeOperations` implements the following allocation and eviction +strategy: + +- Memory is allocated and brought up to date with the contents of a file when + a region of mapped memory is accessed (or "faulted on"). + +- Dirty memory is written out to filesystems when an fsync(2) or msync(2) + operation is performed on a memory mapped file, for all memory mapped files + when saved, and/or when there are no longer any memory mappings of a range + of a file, see munmap(2). As the latter implies, in the absence of a panic + or SIGKILL, dirty memory is written out for all memory mapped files when an + application exits. + +- Memory is freed when there are no longer any memory mappings of a range of a + file (e.g. when an application exits). This behavior is consistent with + Linux for shared memory that has been locked via mlock(2). + +Notably, memory is not allocated for read(2) or write(2) operations. This means +that reads and writes to the file are only accelerated by an +`CachingInodeOperations` if the file being read or written has been memory +mapped *and* if the shared memory has been accessed at the region being read or +written. This diverges from Linux which buffers memory into a page cache on +read(2) proactively (i.e. readahead) and delays writing it out to filesystems on +write(2) (i.e. writeback). The absence of these optimizations is not visible to +applications beyond less than optimal performance when repeatedly reading and/or +writing to same region of a file. See [Future Work](#future-work) for plans to +implement these optimizations. + +Additionally, memory held by `CachingInodeOperationss` is currently unbounded in +size. An `CachingInodeOperations` does not write out dirty memory and free it +under system memory pressure. This can cause pathological memory usage. + +When memory is written back, an `CachingInodeOperations` may write regions of +shared memory that were never modified. This is due to the strategy of +minimizing page faults (see below) and handling only a subset of memory write +faults. In the absence of an application or sentry crash, it is guaranteed that +if a region of shared memory was written to, it is written back to a filesystem. + +### Life of a shared memory mapping + +A file is memory mapped via mmap(2). For example, if `A` is an address, an +application may execute: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +This creates a shared mapping of fd that reflects 4k of the contents of fd +starting at offset 0, accessible at address `A`. This in turn creates a virtual +memory area region ("vma") which indicates that [`A`, `A`+0x1000) is now a valid +address range for this application to access. + +At this point, memory has not been allocated in the file's +`CachingInodeOperations`. It is also the case that the address range [`A`, +`A`+0x1000) has not been mapped on the host on behalf of the application. If the +application then tries to modify 8 bytes of the shared memory: + +``` +char buffer[] = "aaaaaaaa"; +memcpy(A, buffer, 8); +``` + +The host then sends a `SIGSEGV` to the sentry because the address range [`A`, +`A`+8) is not mapped on the host. The `SIGSEGV` indicates that the memory was +accessed writable. The sentry looks up the vma associated with [`A`, `A`+8), +finds the file that was mapped and its `CachingInodeOperations`. It then calls +`CachingInodeOperations.MapInto` which allocates memory to back [`A`, `A`+8). It +may choose to allocate more memory (i.e. do "readahead") to minimize subsequent +faults. + +Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). +The host tmpfs file memory is brought up to date with the contents of the mapped +file on its filesystem. The region of the host tmpfs file that reflects the +mapped file is then mapped into the host address space of the application so +that subsequent memory accesses do not repeatedly generate a `SIGSEGV`. + +The range that was allocated, including any extra memory allocation to minimize +faults, is marked dirty due to the write fault. This overcounts dirty memory if +the extra memory allocated is never modified. + +To make the scenario more interesting, imagine that this application spawns +another process and maps the same file in the exact same way: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +Imagine that this process then tries to modify the file again but with only 4 +bytes: + +``` +char buffer[] = "bbbb"; +memcpy(A, buffer, 4); +``` + +Since the first process has already mapped and accessed the same region of the +file writable, `CachingInodeOperations.MapInto` is called but re-maps the memory +that has already been allocated (because the host mapping can be invalidated at +any time) rather than allocating new memory. The address range [`A`, `A`+0x1000) +reflects the same cached view of the file as the first process sees. For +example, reading 8 bytes from the file from either process via read(2) starting +at offset 0 returns a consistent "bbbbaaaa". + +When this process no longer needs the shared memory, it may do: + +``` +munmap(A, 0x1000); +``` + +At this point, the modified memory cached by the `CachingInodeOperations` is not +written back to the file because it is still in use by the first process that +mapped it. When the first process also does: + +``` +munmap(A, 0x1000); +``` + +Then the last memory mapping of the file at the range [0, 0x1000) is gone. The +file's `CachingInodeOperations` then starts writing back memory marked dirty to +the file on its filesystem. Once writing completes, regardless of whether it was +successful, the `CachingInodeOperations` frees the memory cached at the range +[0, 0x1000). + +Subsequent read(2) or write(2) operations on the file go directly to the +filesystem since there no longer exists memory for it in its +`CachingInodeOperations`. + +## Future Work + +### Page cache + +The sentry does not yet implement the readahead and writeback optimizations for +read(2) and write(2) respectively. To do so, on read(2) and/or write(2) the +sentry must ensure that memory is allocated in a page cache to read or write +into. However, the sentry cannot boundlessly allocate memory. If it did, the +host would eventually OOM-kill the sentry+application process. This means that +the sentry must implement a page cache memory allocation strategy that is +bounded by a global user or container imposed limit. When this limit is +approached, the sentry must decide from which page cache memory should be freed +so that it can allocate more memory. If it makes a poor decision, the sentry may +end up freeing and re-allocating memory to back regions of files that are +frequently used, nullifying the optimization (and in some cases causing worse +performance due to the overhead of memory allocation and general management). +This is a form of "cache thrashing". + +In Linux, much research has been done to select and implement a lightweight but +optimal page cache eviction algorithm. Linux makes use of hardware page bits to +keep track of whether memory has been accessed. The sentry does not have direct +access to hardware. Implementing a similarly lightweight and optimal page cache +eviction algorithm will need to either introduce a kernel interface to obtain +these page bits or find a suitable alternative proxy for access events. + +In Linux, readahead happens by default but is not always ideal. For instance, +for files that are not read sequentially, it would be more ideal to simply read +from only those regions of the file rather than to optimistically cache some +number of bytes ahead of the read (up to 2MB in Linux) if the bytes cached won't +be accessed. Linux implements the fadvise64(2) system call for applications to +specify that a range of a file will not be accessed sequentially. The advice bit +FADV_RANDOM turns off the readahead optimization for the given range in the +given file. However fadvise64 is rarely used by applications so Linux implements +a readahead backoff strategy if reads are not sequential. To ensure that +application performance is not degraded, the sentry must implement a similar +backoff strategy. diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go new file mode 100644 index 000000000..9c6c98542 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to +// implement Mappables that cache data from another source. +// +// type DirtySet <generated by go_generics> + +// DirtyInfo is the value type of DirtySet, and represents information about a +// Mappable offset that is dirty (the cached data for that offset is newer than +// its source). +type DirtyInfo struct { + // Keep is true if the represented offset is concurrently writable, such + // that writing the data for that offset back to the source does not + // guarantee that the offset is clean (since it may be concurrently + // rewritten after the writeback). + Keep bool +} + +// dirtySetFunctions implements segment.Functions for DirtySet. +type dirtySetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (dirtySetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (dirtySetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (dirtySetFunctions) ClearValue(val *DirtyInfo) { +} + +// Merge implements segment.Functions.Merge. +func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) { + if val1 != val2 { + return DirtyInfo{}, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) { + return val, val +} + +// MarkClean marks all offsets in mr as not dirty, except for those to which +// KeepDirty has been applied. +func (ds *DirtySet) MarkClean(mr memmap.MappableRange) { + seg := ds.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + if seg.Value().Keep { + seg = seg.NextSegment() + continue + } + seg = ds.Isolate(seg, mr) + seg = ds.Remove(seg).NextSegment() + } +} + +// KeepClean marks all offsets in mr as not dirty, even those that were +// previously kept dirty by KeepDirty. +func (ds *DirtySet) KeepClean(mr memmap.MappableRange) { + ds.RemoveRange(mr) +} + +// MarkDirty marks all offsets in mr as dirty. +func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) { + ds.setDirty(mr, false) +} + +// KeepDirty marks all offsets in mr as dirty and prevents them from being +// marked as clean by MarkClean. +func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) { + ds.setDirty(mr, true) +} + +func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { + var changedAny bool + defer func() { + if changedAny { + ds.MergeRange(mr) + } + }() + seg, gap := ds.Find(mr.Start) + for { + switch { + case seg.Ok() && seg.Start() < mr.End: + if keep && !seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = true + } + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + changedAny = true + seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep}) + seg, gap = seg.NextNonEmpty() + + default: + return + } + } +} + +// SyncDirty passes pages in the range mr that are stored in cache and +// identified as dirty to writeAt, updating dirty to reflect successful writes. +// If writeAt returns a successful partial write, SyncDirty will call it +// repeatedly until all bytes have been written. max is the true size of the +// cached object; offsets beyond max will not be passed to writeAt, even if +// they are marked dirty. +func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + var changedDirty bool + defer func() { + if changedDirty { + dirty.MergeRange(mr) + } + }() + dseg := dirty.LowerBoundSegment(mr.Start) + for dseg.Ok() && dseg.Start() < mr.End { + var dr memmap.MappableRange + if dseg.Value().Keep { + dr = dseg.Range().Intersect(mr) + } else { + changedDirty = true + dseg = dirty.Isolate(dseg, mr) + dr = dseg.Range() + } + if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// SyncDirtyAll passes all pages stored in cache identified as dirty to +// writeAt, updating dirty to reflect successful writes. If writeAt returns a +// successful partial write, SyncDirtyAll will call it repeatedly until all +// bytes have been written. max is the true size of the cached object; offsets +// beyond max will not be passed to writeAt, even if they are marked dirty. +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + dseg := dirty.FirstSegment() + for dseg.Ok() { + if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// Preconditions: mr must be page-aligned. +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { + wbr := cseg.Range().Intersect(mr) + if max < wbr.Start { + break + } + ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), usermem.Read) + if err != nil { + return err + } + if max < wbr.End { + ims = ims.TakeFirst64(max - wbr.Start) + } + offset := wbr.Start + for !ims.IsEmpty() { + n, err := writeAt(ctx, ims, offset) + if err != nil { + return err + } + offset += n + ims = ims.DropFirst64(n) + } + } + return nil +} diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go new file mode 100644 index 000000000..f7693cb19 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -0,0 +1,38 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestDirtySet(t *testing.T) { + var set DirtySet + set.MarkDirty(memmap.MappableRange{0, 2 * usermem.PageSize}) + set.KeepDirty(memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize}) + set.MarkClean(memmap.MappableRange{0, 2 * usermem.PageSize}) + want := &DirtySegmentDataSlices{ + Start: []uint64{usermem.PageSize}, + End: []uint64{2 * usermem.PageSize}, + Values: []DirtyInfo{{Keep: true}}, + } + if got := set.ExportSortedSlices(); !reflect.DeepEqual(got, want) { + t.Errorf("set:\n\tgot %v,\n\twant %v", got, want) + } +} diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go new file mode 100644 index 000000000..a7329f1c9 --- /dev/null +++ b/pkg/sentry/fs/fsutil/file.go @@ -0,0 +1,267 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NoopRelease implements FileOperations.Release for files that have no +// resources to release. +type NoopRelease struct{} + +// Release is a no-op. +func (NoopRelease) Release() {} + +// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor +// is not nil and the seek was on a directory, the cursor will be updated. +// +// Currenly only seeking to 0 on a directory is supported. +// +// FIXME: Lift directory seeking limitations. +func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { + inode := file.Dirent.Inode + current := file.Offset() + + // Does the Inode represents a non-seekable type? + if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) { + return current, syserror.ESPIPE + } + + // Does the Inode represent a character device? + if fs.IsCharDevice(inode.StableAttr) { + // Ignore seek requests. + // + // FIXME: This preserves existing + // behavior but is not universally correct. + return 0, nil + } + + // Otherwise compute the new offset. + switch whence { + case fs.SeekSet: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if offset < 0 { + return current, syserror.EINVAL + } + return offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_SET to 0 moves the directory "cursor" to the beginning. + if dirCursor != nil { + *dirCursor = "" + } + return 0, nil + default: + return current, syserror.EINVAL + } + case fs.SeekCurrent: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if current+offset < 0 { + return current, syserror.EINVAL + } + return current + offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + return current, nil + default: + return current, syserror.EINVAL + } + case fs.SeekEnd: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.BlockDevice: + // Allow the file to determine the end. + uattr, err := inode.UnstableAttr(ctx) + if err != nil { + return current, err + } + sz := uattr.Size + if sz+offset < 0 { + return current, syserror.EINVAL + } + return sz + offset, nil + // FIXME: This is not universally correct. + // Remove SpecialDirectory. + case fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_END to 0 moves the directory "cursor" to the end. + // + // FIXME: The ensures that after the seek, + // reading on the directory will get EOF. But it is not + // correct in general because the directory can grow in + // size; attempting to read those new entries will be + // futile (EOF will always be the result). + return fs.FileMaxOffset, nil + default: + return current, syserror.EINVAL + } + } + + // Not a valid seek request. + return current, syserror.EINVAL +} + +// GenericSeek implements FileOperations.Seek for files that use a generic +// seek implementation. +type GenericSeek struct{} + +// Seek implements fs.FileOperations.Seek. +func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return SeekWithDirCursor(ctx, file, whence, offset, nil) +} + +// ZeroSeek implements FileOperations.Seek for files that maintain a constant +// zero-value offset and require a no-op Seek. +type ZeroSeek struct{} + +// Seek implements FileOperations.Seek. +func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, nil +} + +// PipeSeek implements FileOperations.Seek and can be used for files that behave +// like pipes (seeking is not supported). +type PipeSeek struct{} + +// Seek implements FileOperations.Seek. +func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, syserror.ESPIPE +} + +// NotDirReaddir implements FileOperations.Readdir for non-directories. +type NotDirReaddir struct{} + +// Readdir implements FileOperations.NotDirReaddir. +func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { + return 0, syserror.ENOTDIR +} + +// NoFsync implements FileOperations.Fsync for files that don't support syncing. +type NoFsync struct{} + +// Fsync implements FileOperations.Fsync. +func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { + return syserror.EINVAL +} + +// NoopFsync implements FileOperations.Fsync for files that don't need to synced. +type NoopFsync struct{} + +// Fsync implements FileOperations.Fsync. +func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { + return nil +} + +// NoopFlush implements FileOperations.Flush as a no-op. +type NoopFlush struct{} + +// Flush implements FileOperations.Flush. +func (NoopFlush) Flush(context.Context, *fs.File) error { + return nil +} + +// NoMMap implements fs.FileOperations.Mappable for files that cannot +// be memory mapped. +type NoMMap struct{} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most +// filesystems that support memory mapping. +func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error { + opts.Mappable = m + opts.MappingIdentity = file + file.IncRef() + return nil +} + +// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement +// the ioctl syscall. +type NoIoctl struct{} + +// Ioctl implements fs.FileOperations.Ioctl. +func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// DirFileOperations implements FileOperations for directories. +type DirFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + NoopRelease `state:"nosave"` + GenericSeek `state:"nosave"` + NoFsync `state:"nosave"` + NoopFlush `state:"nosave"` + NoMMap `state:"nosave"` + NoIoctl `state:"nosave"` + + // dentryMap is a SortedDentryMap used to implement Readdir. + dentryMap *fs.SortedDentryMap + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string +} + +// NewDirFileOperations returns a new DirFileOperations that will iterate the +// given denty map. +func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations { + return &DirFileOperations{ + dentryMap: dentries, + } +} + +// IterateDir implements DirIterator.IterateDir. +func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap) + return offset + n, err +} + +// Readdir implements FileOperations.Readdir. +func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &dfo.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) +} + +// Read implements FileOperations.Read +func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileOperations.Write. +func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go new file mode 100644 index 000000000..da6949ccb --- /dev/null +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -0,0 +1,208 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// FileRangeSet maps offsets into a memmap.Mappable to offsets into a +// platform.File. It is used to implement Mappables that store data in +// sparsely-allocated memory. +// +// type FileRangeSet <generated by go_generics> + +// fileRangeSetFunctions implements segment.Functions for FileRangeSet. +type fileRangeSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (fileRangeSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (fileRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (fileRangeSetFunctions) ClearValue(_ *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { + if frstart1+mr1.Length() != frstart2 { + return 0, false + } + return frstart1, true +} + +// Split implements segment.Functions.Split. +func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { + return frstart, frstart + (split - mr.Start) +} + +// FileRange returns the FileRange mapped by seg. +func (seg FileRangeIterator) FileRange() platform.FileRange { + return seg.FileRangeOf(seg.Range()) +} + +// FileRangeOf returns the FileRange mapped by mr. +// +// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { + frstart := seg.Value() + (mr.Start - seg.Start()) + return platform.FileRange{frstart, frstart + mr.Length()} +} + +// Fill attempts to ensure that all memmap.Mappable offsets in required are +// mapped to a platform.File offset, by allocating from mem with the given +// memory usage kind and invoking readAt to store data into memory. (If readAt +// returns a successful partial read, Fill will call it repeatedly until all +// bytes have been read.) EOF is handled consistently with the requirements of +// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are +// invalid. +// +// Fill may read offsets outside of required, but will never read offsets +// outside of optional. It returns a non-nil error if any error occurs, even +// if the error only affects offsets in optional, but not in required. +// +// Preconditions: required.Length() > 0. optional.IsSupersetOf(required). +// required and optional must be page-aligned. +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { + gap := frs.LowerBoundGap(required.Start) + for gap.Ok() && gap.Start() < required.End { + if gap.Range().Length() == 0 { + gap = gap.NextGap() + continue + } + gr := gap.Range().Intersect(optional) + + // Read data into the gap. + fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := readAt(ctx, dsts, gr.Start+done) + done += n + dsts = dsts.DropFirst64(n) + if err != nil { + if err == io.EOF { + // platform.AllocateAndFill truncates down to a page + // boundary, but FileRangeSet.Fill is supposed to + // zero-fill to the end of the page in this case. + donepgaddr, ok := usermem.Addr(done).RoundUp() + if donepg := uint64(donepgaddr); ok && donepg != done { + dsts.DropFirst64(donepg - done) + done = donepg + if dsts.IsEmpty() { + return done, nil + } + } + } + return done, err + } + } + return done, nil + })) + + // Store anything we managed to read into the cache. + if done := fr.Length(); done != 0 { + gr.End = gr.Start + done + gap = frs.Insert(gap, gr, fr.Start).NextGap() + } + + if err != nil { + return err + } + } + return nil +} + +// Drop removes segments for memmap.Mappable offsets in mr, freeing the +// corresponding platform.FileRanges. +// +// Preconditions: mr must be page-aligned. +func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) { + seg := frs.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + seg = frs.Isolate(seg, mr) + mem.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } +} + +// DropAll removes all segments in mr, freeing the corresponding +// platform.FileRanges. +func (frs *FileRangeSet) DropAll(mem platform.Memory) { + for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + mem.DecRef(seg.FileRange()) + } + frs.RemoveAll() +} + +// Truncate updates frs to reflect Mappable truncation to the given length: +// bytes after the new EOF on the same page are zeroed, and pages after the new +// EOF are freed. +func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { + pgendaddr, ok := usermem.Addr(end).RoundUp() + if ok { + pgend := uint64(pgendaddr) + + // Free truncated pages. + frs.SplitAt(pgend) + seg := frs.LowerBoundSegment(pgend) + for seg.Ok() { + mem.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } + + if end == pgend { + return + } + } + + // Here we know end < end.RoundUp(). If the new EOF lands in the + // middle of a page that we have, zero out its contents beyond the new + // length. + seg := frs.FindSegment(end) + if seg.Ok() { + fr := seg.FileRange() + fr.Start += end - seg.Start() + ims, err := mem.MapInternal(fr, usermem.Write) + if err != nil { + // There's no good recourse from here. This means + // that we can't keep cached memory consistent with + // the new end of file. The caller may have already + // updated the file size on their backing file system. + // + // We don't want to risk blindly continuing onward, + // so in the extremely rare cases this does happen, + // we abandon ship. + panic(fmt.Sprintf("Failed to map %v: %v", fr, err)) + } + if _, err := safemem.ZeroSeq(ims); err != nil { + panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err)) + } + } +} diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go new file mode 100644 index 000000000..14dece315 --- /dev/null +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +type frameRefSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (frameRefSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (frameRefSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (frameRefSetFunctions) ClearValue(val *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { + if val1 != val2 { + return 0, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { + return val, val +} diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go new file mode 100644 index 000000000..6fe4ef13d --- /dev/null +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -0,0 +1,26 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsutil provides utilities for implementing fs.InodeOperations +// and fs.FileOperations: +// +// - For embeddable utilities, see inode.go and file.go. +// +// - For fs.Inodes that require a page cache to be memory mapped, see +// inode_cache.go. +// +// - For fs.Files that implement fs.HandleOps, see handle.go. +// +// - For anon fs.Inodes, see anon.go. +package fsutil diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go new file mode 100644 index 000000000..149c0f84a --- /dev/null +++ b/pkg/sentry/fs/fsutil/handle.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Handle implements FileOperations. +// +// FIXME: Remove Handle entirely in favor of individual fs.File +// implementations using simple generic utilities. +type Handle struct { + NoopRelease `state:"nosave"` + NoIoctl `state:"nosave"` + HandleOperations fs.HandleOperations + + // dirCursor is the directory cursor. + dirCursor string +} + +// NewHandle returns a File backed by the Dirent and FileFlags. +func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File { + if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) { + // Allow reading/writing at an arbitrary offset for non-pipes + // and non-sockets. + flags.Pread = true + flags.Pwrite = true + } + + return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops}) +} + +// Readiness implements waiter.Waitable.Readiness. +func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask { + return h.HandleOperations.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + h.HandleOperations.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (h *Handle) EventUnregister(e *waiter.Entry) { + h.HandleOperations.EventUnregister(e) +} + +// Readdir implements FileOperations.Readdir. +func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &h.dirCursor, + } + n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset()) + return n, err +} + +// Seek implements FileOperations.Seek. +func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor) +} + +// IterateDir implements DirIterator.IterateDir. +func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset) +} + +// Read implements FileOperations.Read. +func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset) +} + +// Write implements FileOperations.Write. +func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return h.HandleOperations.DeprecatedPwritev(ctx, src, offset) +} + +// Fsync implements FileOperations.Fsync. +func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + // Write out metadata. + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + // Use DeprecatedFsync to sync disks. + return h.HandleOperations.DeprecatedFsync() + } + panic("invalid sync type") +} + +// Flush implements FileOperations.Flush. +func (h *Handle) Flush(context.Context, *fs.File) error { + return h.HandleOperations.DeprecatedFlush() +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + mappable := file.Dirent.Inode.Mappable() + if mappable == nil { + return syserror.ENODEV + } + return GenericConfigureMMap(file, mappable, opts) +} diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go new file mode 100644 index 000000000..d94c3eb0d --- /dev/null +++ b/pkg/sentry/fs/fsutil/handle_test.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil_test + +import ( + "io" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type testInodeOperations struct { + fs.InodeOperations + fs.InodeType + FileSize int64 + writes uint + reads uint +} + +func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + return fs.UnstableAttr{Size: t.FileSize}, nil +} + +// Check implements InodeOperations.Check. +func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + t.reads++ + return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset) +} + +func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + t.writes++ + return t.InodeOperations.DeprecatedPwritev(ctx, src, offset) +} + +// testHandle returns a handle for a test node. +// +// The size of the node is fixed at 20 bytes. +func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) { + ctx := contexttest.Context(t) + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + n := &testInodeOperations{ + InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}), + FileSize: 20, + } + d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test") + return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n +} + +func TestHandleOps(t *testing.T) { + h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile) + defer h.DecRef() + + // Make sure a write request works. + if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err) + } + if n.writes != 1 { + t.Errorf("found %d writes, expected 1", n.writes) + } + + // Make sure a read request works. + dst := make([]byte, 1) + if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) { + t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if dst[0] != 'a' { + t.Errorf("Preadv: read %q, wanted 'a'", dst[0]) + } + if n.reads != 1 { + t.Errorf("found %d reads, expected 1", n.reads) + } +} + +type seekTest struct { + whence fs.SeekWhence + offset int64 + result int64 + err error +} + +type seekSuite struct { + nodeType fs.InodeType + cases []seekTest +} + +// FIXME: This is currently missing fs.SeekEnd tests due to the +// fact that NullInodeOperations returns an error on stat. +func TestHandleSeek(t *testing.T) { + ts := []seekSuite{ + { + nodeType: fs.RegularFile, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + {fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + { + nodeType: fs.Directory, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekCurrent, 11, 0, syscall.EINVAL}, + {fs.SeekCurrent, -6, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, -1, 0, syscall.EINVAL}, + {fs.SeekEnd, 2, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Symlink, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekSet, 0, 0, syscall.EINVAL}, + {fs.SeekCurrent, 5, 0, syscall.EINVAL}, + {fs.SeekCurrent, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, 5, 0, syscall.EINVAL}, + {fs.SeekEnd, -5, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Pipe, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.Socket, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.CharacterDevice, + cases: []seekTest{ + {fs.SeekSet, 5, 0, nil}, + {fs.SeekSet, -5, 0, nil}, + {fs.SeekSet, 0, 0, nil}, + {fs.SeekCurrent, 5, 0, nil}, + {fs.SeekCurrent, -5, 0, nil}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekEnd, 5, 0, nil}, + {fs.SeekEnd, -5, 0, nil}, + {fs.SeekEnd, 0, 0, nil}, + }, + }, + { + nodeType: fs.BlockDevice, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + {fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + } + + for _, s := range ts { + h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType) + defer h.DecRef() + + for _, c := range s.cases { + // Try the given seek. + offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset) + if err != c.err { + t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err) + } + if err == nil && offset != c.result { + t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset) + } + } + } +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go new file mode 100644 index 000000000..d0a27fc1c --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -0,0 +1,209 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostFileMapper caches mappings of an arbitrary host file descriptor. It is +// used by implementations of memmap.Mappable that represent a host file +// descriptor. +type HostFileMapper struct { + // HostFile conceptually breaks the file into pieces called chunks, of + // size and alignment chunkSize, and caches mappings of the file on a chunk + // granularity. + + refsMu sync.Mutex `state:"nosave"` + + // refs maps chunk start offsets to the sum of reference counts for all + // pages in that chunk. refs is protected by refsMu. + refs map[uint64]int32 + + mapsMu sync.Mutex `state:"nosave"` + + // mappings maps chunk start offsets to mappings of those chunks, + // obtained by calling syscall.Mmap. mappings is protected by + // mapsMu. + mappings map[uint64]mapping `state:"nosave"` +} + +const ( + chunkShift = usermem.HugePageShift + chunkSize = 1 << chunkShift + chunkMask = chunkSize - 1 +) + +func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 { + return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / usermem.PageSize) +} + +type mapping struct { + addr uintptr + writable bool +} + +// NewHostFileMapper returns a HostFileMapper with no references or cached +// mappings. +func NewHostFileMapper() *HostFileMapper { + return &HostFileMapper{ + refs: make(map[uint64]int32), + mappings: make(map[uint64]mapping), + } +} + +// IncRefOn increments the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + if refs+pgs < refs { + // Would overflow. + panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + f.refs[chunkStart] = refs + pgs + } +} + +// DecRefOn decrements the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + switch { + case refs > pgs: + f.refs[chunkStart] = refs - pgs + case refs == pgs: + f.mapsMu.Lock() + delete(f.refs, chunkStart) + if m, ok := f.mappings[chunkStart]; ok { + f.unmapAndRemoveLocked(chunkStart, m) + } + f.mapsMu.Unlock() + case refs < pgs: + panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + } +} + +// MapInternal returns a mapping of offsets in fr from fd. The returned +// safemem.BlockSeq is valid as long as at least one reference is held on all +// offsets in fr or until the next call to UnmapAll. +// +// Preconditions: The caller must hold a reference on all offsets in fr. +func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + seq = safemem.BlockSeqOf(b) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + blocks = append(blocks, b) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// Preconditions: f.mapsMu must be locked. +func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { + prot := syscall.PROT_READ + if write { + prot |= syscall.PROT_WRITE + } + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + m, ok := f.mappings[chunkStart] + if !ok { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } else if write && !m.writable { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + m.addr, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED|syscall.MAP_FIXED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } + var startOff uint64 + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + } + return nil +} + +// UnmapAll unmaps all cached mappings. Callers are responsible for +// synchronization with mappings returned by previous calls to MapInternal. +func (f *HostFileMapper) UnmapAll() { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + for chunkStart, m := range f.mappings { + f.unmapAndRemoveLocked(chunkStart, m) + } +} + +// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { + if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { + // This leaks address space and is unexpected, but is otherwise + // harmless, so complain but don't panic. + log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) + } + delete(f.mappings, chunkStart) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go new file mode 100644 index 000000000..57705decd --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. +func (f *HostFileMapper) afterLoad() { + f.mappings = make(map[uint64]mapping) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go new file mode 100644 index 000000000..790f3a5a6 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { + // We don't control the host file's length, so touching its mappings may + // raise SIGBUS. Thus accesses to it must use safecopy. + return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize) +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go new file mode 100644 index 000000000..e1ad07df2 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes. +func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { + return &simpleInodeOperations{InodeSimpleAttributes: i} +} + +// simpleInodeOperations is a simple implementation of Inode. +type simpleInodeOperations struct { + DeprecatedFileOperations `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotRenameable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + NoMappable `state:"nosave"` + NoopWriteOut `state:"nosave"` + + InodeSimpleAttributes +} + +// InodeSimpleAttributes implements a subset of the Inode interface. It provides +// read-only access to attributes. +type InodeSimpleAttributes struct { + // FSType is the filesystem type reported by StatFS. + FSType uint64 + + // UAttr are the unstable attributes of the Inode. + UAttr fs.UnstableAttr +} + +// Release implements fs.InodeOperations.Release. +func (i *InodeSimpleAttributes) Release(context.Context) {} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + return fs.Info{Type: i.FSType}, nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) { + return i.UAttr, nil +} + +// Check implements fs.InodeOperations.Check. +func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// AddLink implements fs.InodeOperations.AddLink. +func (*InodeSimpleAttributes) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +func (*InodeSimpleAttributes) DropLink() {} + +// NotifyStatusChange implements fs.fs.InodeOperations. +func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { + i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { + return false +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EINVAL +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error { + return syserror.EINVAL +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EINVAL +} + +// InMemoryAttributes implements utilities for updating in-memory unstable +// attributes and extended attributes. It is not thread-safe. +// +// Users need not initialize Xattrs to non-nil (it will be initialized +// when the first extended attribute is set. +type InMemoryAttributes struct { + Unstable fs.UnstableAttr + Xattrs map[string][]byte +} + +// SetPermissions updates the permissions to p. +func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool { + i.Unstable.Perms = p + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner updates the file owner to owner. +func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error { + if owner.UID.Ok() { + i.Unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + i.Unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps sets the timestamps to ts. +func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + i.Unstable.AccessTime = now + } else { + i.Unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + i.Unstable.ModificationTime = now + } else { + i.Unstable.ModificationTime = ts.MTime + } + } + i.Unstable.StatusChangeTime = now + return nil +} + +// TouchAccessTime updates access time to the current time. +func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) { + i.Unstable.AccessTime = ktime.NowFromContext(ctx) +} + +// TouchModificationTime updates modification and status change +// time to the current time. +func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) { + now := ktime.NowFromContext(ctx) + i.Unstable.ModificationTime = now + i.Unstable.StatusChangeTime = now +} + +// TouchStatusChangeTime updates status change time to the current time. +func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) { + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// Getxattr returns the extended attribute at name or ENOATTR if +// it isn't set. +func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) { + if value, ok := i.Xattrs[name]; ok { + return value, nil + } + return nil, syserror.ENOATTR +} + +// Setxattr sets the extended attribute at name to value. +func (i *InMemoryAttributes) Setxattr(name string, value []byte) error { + if i.Xattrs == nil { + i.Xattrs = make(map[string][]byte) + } + i.Xattrs[name] = value + return nil +} + +// Listxattr returns the set of all currently set extended attributes. +func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) { + names := make(map[string]struct{}, len(i.Xattrs)) + for name := range i.Xattrs { + names[name] = struct{}{} + } + return names, nil +} + +// NoMappable returns a nil memmap.Mappable. +type NoMappable struct{} + +// Mappable implements fs.InodeOperations.Mappable. +func (NoMappable) Mappable(*fs.Inode) memmap.Mappable { + return nil +} + +// NoopWriteOut is a no-op implementation of Inode.WriteOut. +type NoopWriteOut struct{} + +// WriteOut is a no-op. +func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error { + return nil +} + +// InodeNotDirectory can be used by Inodes that are not directories. +type InodeNotDirectory struct{} + +// Lookup implements fs.InodeOperations.Lookup. +func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { + return nil, syserror.ENOTDIR +} + +// Create implements fs.InodeOperations.Create. +func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { + return nil, syserror.ENOTDIR +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error { + return syserror.ENOTDIR +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Bind implements fs.InodeOperations.Bind. +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Remove implements fs.InodeOperations.Remove. +func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// InodeNotSocket can be used by Inodes that are not sockets. +type InodeNotSocket struct{} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { + return nil +} + +// InodeNotRenameable can be used by Inodes that cannot be renamed. +type InodeNotRenameable struct{} + +// Rename implements fs.InodeOperations.Rename. +func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { + return syserror.EINVAL +} + +// InodeNotOpenable can be used by Inodes that cannot be opened. +type InodeNotOpenable struct{} + +// GetFile implements fs.InodeOperations.GetFile. +func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { + return nil, syserror.EIO +} + +// InodeNotVirtual can be used by Inodes that are not virtual. +type InodeNotVirtual struct{} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (InodeNotVirtual) IsVirtual() bool { + return false +} + +// InodeNotSymlink can be used by Inodes that are not symlinks. +type InodeNotSymlink struct{} + +// Readlink implements fs.InodeOperations.Readlink. +func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) { + return "", syserror.ENOLINK +} + +// Getlink implements fs.InodeOperations.Getlink. +func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, syserror.ENOLINK +} + +// InodeNoExtendedAttributes can be used by Inodes that do not support +// extended attributes. +type InodeNoExtendedAttributes struct{} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) ([]byte, error) { + return nil, syserror.EOPNOTSUPP +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, []byte) error { + return syserror.EOPNOTSUPP +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) { + return nil, syserror.EOPNOTSUPP +} + +// DeprecatedFileOperations panics if any deprecated Inode method is called. +type DeprecatedFileOperations struct{} + +// Readiness implements fs.InodeOperations.Waitable.Readiness. +func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask { + panic("not implemented") +} + +// EventRegister implements fs.InodeOperations.Waitable.EventRegister. +func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) { + panic("not implemented") +} + +// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister. +func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) { + panic("not implemented") +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. +func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { + panic("not implemented") +} + +// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync. +func (DeprecatedFileOperations) DeprecatedFsync() error { + panic("not implemented") +} + +// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush. +func (DeprecatedFileOperations) DeprecatedFlush() error { + panic("not implemented") +} + +// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. +func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { + panic("not implemented") +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go new file mode 100644 index 000000000..484668735 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -0,0 +1,845 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Lock order (compare the lock order model in mm/mm.go): +// +// CachingInodeOperations.attrMu ("fs locks") +// CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate") +// CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate") +// CachedFileObject locks + +// CachingInodeOperations caches the metadata and content of a CachedFileObject. +// It implements a subset of InodeOperations. As a utility it can be used to +// implement the full set of InodeOperations. Generally it should not be +// embedded to avoid unexpected inherited behavior. +// +// CachingInodeOperations implements Mappable for the CachedFileObject: +// +// - If CachedFileObject.FD returns a value >= 0 and the current platform shares +// a host fd table with the sentry, then the value of CachedFileObject.FD +// will be memory mapped on the host. +// +// - Otherwise, the contents of CachedFileObject are buffered into memory +// managed by the CachingInodeOperations. +// +// Implementations of FileOperations for a CachedFileObject must read and +// write through CachingInodeOperations using Read and Write respectively. +// +// Implementations of InodeOperations.WriteOut must call Sync to write out +// in-memory modifications of data and metadata to the CachedFileObject. +type CachingInodeOperations struct { + // backingFile is a handle to a cached file object. + backingFile CachedFileObject + + // platform is used to allocate memory that caches backingFile's contents. + platform platform.Platform + + // forcePageCache indicates the sentry page cache should be used regardless + // of whether the platform supports host mapped I/O or not. This must not be + // modified after inode creation. + forcePageCache bool + + attrMu sync.Mutex `state:"nosave"` + + // attr is unstable cached metadata. + // + // attr is protected by attrMu. attr.Size is protected by both attrMu and + // dataMu; reading it requires locking either mutex, while mutating it + // requires locking both. + attr fs.UnstableAttr + + // dirtyAttr is metadata that was updated in-place but hasn't yet + // been successfully written out. + // + // dirtyAttr is protected by attrMu. + dirtyAttr fs.AttrMask + + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces. + // + // mappings is protected by mapsMu. + mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // cache maps offsets into the cached file to offsets into + // platform.Memory() that store the file's data. + // + // cache is protected by dataMu. + cache FileRangeSet + + // dirty tracks dirty segments in cache. + // + // dirty is protected by dataMu. + dirty DirtySet + + // hostFileMapper caches internal mappings of backingFile.FD(). + hostFileMapper *HostFileMapper + + // refs tracks active references to data in the cache. + // + // refs is protected by dataMu. + refs frameRefSet +} + +// CachedFileObject is a file that may require caching. +type CachedFileObject interface { + // ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts, + // starting at offset, and returns the number of bytes read. ReadToBlocksAt + // may return a partial read without an error. + ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) + + // WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the + // file, starting at offset, and returns the number of bytes written. + // WriteFromBlocksAt may return a partial write without an error. + WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) + + // SetMaskedAttributes sets the attributes in attr that are true in mask + // on the backing file. + // + // SetMaskedAttributes may be called at any point, regardless of whether + // the file was opened. + SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + + // Sync instructs the remote filesystem to sync the file to stable storage. + Sync(ctx context.Context) error + + // FD returns a host file descriptor. Return value must be -1 or not -1 + // for the lifetime of the CachedFileObject. + // + // FD is called iff the file has been memory mapped. This implies that + // the file was opened (see fs.InodeOperations.GetFile). + // + // FIXME: This interface seems to be + // fundamentally broken. We should clarify CachingInodeOperation's + // behavior with metadata. + FD() int +} + +// NewCachingInodeOperations returns a new CachingInodeOperations backed by +// a CachedFileObject and its initial unstable attributes. +func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations { + p := platform.FromContext(ctx) + if p == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + } + return &CachingInodeOperations{ + backingFile: backingFile, + platform: p, + forcePageCache: forcePageCache, + attr: uattr, + hostFileMapper: NewHostFileMapper(), + } +} + +// Release implements fs.InodeOperations.Release. +func (c *CachingInodeOperations) Release() { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + // The cache should be empty (something has gone terribly wrong if we're + // releasing an inode that is still memory-mapped). + if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + } +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + c.attrMu.Lock() + defer c.attrMu.Unlock() + return c.attr, nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{Perms: true} + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil { + return false + } + c.attr.Perms = perms + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Perms = true + c.touchStatusChangeTimeLocked(ctx) + return true + +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{ + UID: owner.UID.Ok(), + GID: owner.GID.Ok(), + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil { + return err + } + if owner.UID.Ok() { + c.attr.Owner.UID = owner.UID + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.UID = true + } + if owner.GID.Ok() { + c.attr.Owner.GID = owner.GID + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.GID = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // Replace requests to use the "system time" with the current time to + // ensure that cached timestamps remain consistent with the remote + // filesystem. + now := ktime.NowFromContext(ctx) + if ts.ATimeSetSystemTime { + ts.ATime = now + } + if ts.MTimeSetSystemTime { + ts.MTime = now + } + masked := fs.AttrMask{ + AccessTime: !ts.ATimeOmit, + ModificationTime: !ts.MTimeOmit, + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil { + return err + } + if !ts.ATimeOmit { + c.attr.AccessTime = ts.ATime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.AccessTime = true + } + if !ts.MTimeOmit { + c.attr.ModificationTime = ts.MTime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.ModificationTime = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// Truncate implements fs.InodeOperations.Truncate. +func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // c.attr.Size is protected by both c.attrMu and c.dataMu. + c.dataMu.Lock() + if err := c.backingFile.SetMaskedAttributes(ctx, fs.AttrMask{ + Size: true, + }, fs.UnstableAttr{ + Size: size, + }); err != nil { + c.dataMu.Unlock() + return err + } + oldSize := c.attr.Size + if oldSize != size { + c.attr.Size = size + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Size = true + c.touchModificationTimeLocked(ctx) + } + // We drop c.dataMu here so that we can lock c.mapsMu and invalidate + // mappings below. This allows concurrent calls to Read/Translate/etc. + // These functions synchronize with an in-progress Truncate by refusing to + // use cache contents beyond the new c.attr.Size. (We are still holding + // c.attrMu, so we can't race with Truncate/Write.) + c.dataMu.Unlock() + + // Nothing left to do unless shrinking the file. + if size >= oldSize { + return nil + } + + oldpgend := fs.OffsetPageEnd(oldSize) + newpgend := fs.OffsetPageEnd(size) + + // Invalidate past translations of truncated pages. + if newpgend != oldpgend { + c.mapsMu.Lock() + c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + c.mapsMu.Unlock() + } + + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them from the cache. Since truncated pages have been + // removed from the backing file, they should be dropped without being + // written back. + c.dataMu.Lock() + defer c.dataMu.Unlock() + c.cache.Truncate(uint64(size), c.platform.Memory()) + c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend}) + + return nil +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + c.attrMu.Lock() + + // Write dirty pages back. + c.dataMu.RLock() + err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt) + c.dataMu.RUnlock() + if err != nil { + c.attrMu.Unlock() + return err + } + + // Write out cached attributes. + if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil { + c.attrMu.Unlock() + return err + } + c.dirtyAttr = fs.AttrMask{} + + c.attrMu.Unlock() + + // Fsync the remote file. + return c.backingFile.Sync(ctx) +} + +// IncLinks increases the link count and updates cached access time. +func (c *CachingInodeOperations) IncLinks(ctx context.Context) { + c.attrMu.Lock() + c.attr.Links++ + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// DecLinks decreases the link count and updates cached access time. +func (c *CachingInodeOperations) DecLinks(ctx context.Context) { + c.attrMu.Lock() + c.attr.Links-- + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// TouchAccessTime updates the cached access time in-place to the +// current time. It does not update status change time in-place. See +// mm/filemap.c:do_generic_file_read -> include/linux/h:file_accessed. +func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) { + if inode.MountSource.Flags.NoAtime { + return + } + + c.attrMu.Lock() + c.touchAccessTimeLocked(ctx) + c.attrMu.Unlock() +} + +// touchAccesstimeLocked updates the cached access time in-place to the current +// time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) { + c.attr.AccessTime = ktime.NowFromContext(ctx) + c.dirtyAttr.AccessTime = true +} + +// TouchModificationTime updates the cached modification and status change time +// in-place to the current time. +func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { + c.attrMu.Lock() + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// touchModificationTimeLocked updates the cached modification and status +// change time in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.ModificationTime = now + c.dirtyAttr.ModificationTime = true + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// touchStatusChangeTimeLocked updates the cached status change time +// in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// Read reads from frames and otherwise directly from the backing file +// into dst starting at offset until dst is full, EOF is reached, or an +// error is encountered. +// +// Read may partially fill dst and return a nil error. +func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? We check for this again in + // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would + // serialize reads) or c.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO: Separate out c.attr.Size and use atomics instead of + // c.dataMu. + c.dataMu.RLock() + size := c.attr.Size + c.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + c.TouchAccessTime(ctx, file.Dirent.Inode) + return n, err +} + +// Write writes to frames and otherwise directly to the backing file +// from src starting at offset and until src is empty or an error is +// encountered. +// +// If Write partially fills src, a non-nil error is returned. +func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + c.touchModificationTimeLocked(ctx) + return src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) +} + +type inodeReadWriter struct { + ctx context.Context + c *CachingInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.RLock() + defer rw.c.dataMu.RUnlock() + + // Compute the range to read. + if rw.offset >= rw.c.attr.Size { + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) + if end == rw.offset { // dsts.NumBytes() == 0? + return 0, nil + } + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Read directly from the backing file. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != dst.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.c.attrMu must be locked. +func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.Lock() + defer rw.c.dataMu.Unlock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + return 0, nil + } + + defer func() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.c.attr.Size { + rw.c.attr.Size = rw.offset + rw.c.dirtyAttr.Size = true + } + if rw.offset > rw.c.attr.Usage { + // This is incorrect if CachingInodeOperations is caching a sparse + // file. (In Linux, keeping inode::i_blocks up to date is the + // filesystem's responsibility.) + rw.c.attr.Usage = rw.offset + rw.c.dirtyAttr.Usage = true + } + }() + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok() && seg.Start() < mr.End: + // Get internal mappings from the cache. + segMR := seg.Range().Intersect(mr) + ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + rw.c.dirty.MarkDirty(segMR) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + // Write directly to the backing file. + gapmr := gap.Range().Intersect(mr) + src := srcs.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != src.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + mapped := c.mappings.AddMapping(ms, ar, offset) + // Do this unconditionally since whether we have c.backingFile.FD() >= 0 + // can change across save/restore. + for _, r := range mapped { + c.hostFileMapper.IncRefOn(r) + } + if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + for _, r := range mapped { + usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) + } + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + unmapped := c.mappings.RemoveMapping(ms, ar, offset) + for _, r := range unmapped { + c.hostFileMapper.DecRefOn(r) + } + if !c.forcePageCache && c.backingFile.FD() >= 0 { + if !usage.IncrementalMappedAccounting { + for _, r := range unmapped { + usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) + } + } + return + } + + // Writeback dirty mapped memory now that there are no longer any + // mappings that reference it. This is our naive memory eviction + // strategy. + mem := c.platform.Memory() + c.dataMu.Lock() + defer c.dataMu.Unlock() + for _, r := range unmapped { + if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", r, err) + } + c.cache.Drop(r, mem) + c.dirty.KeepClean(r) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return c.AddMapping(ctx, ms, dstAR, offset) +} + +// Translate implements memmap.Mappable.Translate. +func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + if !c.forcePageCache && c.backingFile.FD() >= 0 { + return []memmap.Translation{ + { + Source: optional, + File: c, + Offset: optional.Start, + }, + }, nil + } + + c.dataMu.Lock() + defer c.dataMu.Unlock() + + // Constrain translations to c.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(c.attr.Size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mem := c.platform.Memory() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mem, + Offset: seg.FileRangeOf(segMR).Start, + }) + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + c.dirty.KeepDirty(segMR) + } + translatedEnd = segMR.End + } + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. + if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Sync the cache's contents so that if we have a host fd after restore, + // the remote file's contents are coherent. + c.dataMu.Lock() + defer c.dataMu.Unlock() + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + c.cache.DropAll(c.platform.Memory()) + c.dirty.RemoveAll() + + return nil +} + +// MapInto implements platform.File.MapInto. This is used when we directly map +// an underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + return as.MapFile(addr, c.backingFile.FD(), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. This is used when we +// directly map an underlying host fd and CachingInodeOperations is used as the +// platform.File during translation. +func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) +} + +// IncRef implements platform.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg, gap := c.refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = c.refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + } + seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + c.refs.MergeAdjacent(fr) + return + } + } +} + +// DecRef implements platform.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg := c.refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = c.refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + } + seg = c.refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + c.refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go new file mode 100644 index 000000000..996c91849 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -0,0 +1,403 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "bytes" + "io" + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type noopBackingFile struct{} + +func (noopBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + return dsts.NumBytes(), nil +} + +func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return srcs.NumBytes(), nil +} + +func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (noopBackingFile) Sync(context.Context) error { + return nil +} + +func (noopBackingFile) FD() int { + return -1 +} + +func TestSetPermissions(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Perms: fs.FilePermsFromMode(0444), + }) + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + perms := fs.FilePermsFromMode(0777) + if !iops.SetPermissions(ctx, nil, perms) { + t.Fatalf("SetPermissions failed, want success") + } + + // Did permissions change? + if !iops.dirtyAttr.Perms { + t.Fatalf("got perms not dirty, want dirty") + } + if iops.attr.Perms != perms { + t.Fatalf("got perms +%v, want +%v", iops.attr.Perms, perms) + } + + // Did status change time change? + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("got status change time not dirty, want dirty") + } + if iops.attr.StatusChangeTime.Equal(uattr.StatusChangeTime) { + t.Fatalf("got status change time unchanged") + } +} + +func TestSetTimestamps(t *testing.T) { + ctx := contexttest.Context(t) + for _, test := range []struct { + desc string + ts fs.TimeSpec + wantDirty fs.AttrMask + }{ + { + desc: "noop", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{}, + }, + { + desc: "access time only", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "modification time only", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "access and modification time", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "system time access and modification time", + ts: fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + } { + t.Run(test.desc, func(t *testing.T) { + ctx := contexttest.Context(t) + + epoch := ktime.ZeroTime + uattr := fs.UnstableAttr{ + AccessTime: epoch, + ModificationTime: epoch, + StatusChangeTime: epoch, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil { + t.Fatalf("SetTimestamps got error %v, want nil", err) + } + if !reflect.DeepEqual(iops.dirtyAttr, test.wantDirty) { + t.Fatalf("dirty got %+v, want %+v", iops.dirtyAttr, test.wantDirty) + } + if iops.dirtyAttr.AccessTime { + if !iops.attr.AccessTime.After(uattr.AccessTime) { + t.Fatalf("diritied access time did not advance, want %v > %v", iops.attr.AccessTime, uattr.AccessTime) + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty access time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + if iops.dirtyAttr.ModificationTime { + if !iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("diritied modification time did not advance") + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty modification time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + }) + } +} + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.UnstableAttr{ + Size: 0, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.Truncate(ctx, nil, uattr.Size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to be dirtied") + } + var size int64 = 4096 + if err := iops.Truncate(ctx, nil, size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if !iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to not be dirtied") + } + if iops.attr.Size != size { + t.Fatalf("Truncate got %d, want %d", iops.attr.Size, size) + } + if !iops.dirtyAttr.ModificationTime || !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("Truncate did not dirty modification and status change time") + } + if !iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("dirtied modification time did not change") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not change") + } +} + +type sliceBackingFile struct { + data []byte +} + +func newSliceBackingFile(data []byte) *sliceBackingFile { + return &sliceBackingFile{data} +} + +func (f *sliceBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + r := safemem.BlockSeqReader{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return r.ReadToBlocks(dsts) +} + +func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + w := safemem.BlockSeqWriter{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return w.WriteFromBlocks(srcs) +} + +func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (*sliceBackingFile) Sync(context.Context) error { + return nil +} + +func (*sliceBackingFile) FD() int { + return -1 +} + +type noopMappingSpace struct{} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { +} + +func anonInode(ctx context.Context) *fs.Inode { + return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{ + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + BlockSize: usermem.PageSize, + }) +} + +func pagesOf(bs ...byte) []byte { + buf := make([]byte, 0, len(bs)*usermem.PageSize) + for _, b := range bs { + buf = append(buf, bytes.Repeat([]byte{b}, usermem.PageSize)...) + } + return buf +} + +func TestRead(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 3-page file. + buf := pagesOf('a', 'b', 'c') + file := fs.NewFile(ctx, fs.NewDirent(anonInode(ctx), "anon"), fs.FileFlags{}, nil) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second page (as CachingInodeOperations + // expects to only cache mapped pages), then call Translate to force it to + // be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 2 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + mr := memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, usermem.PageSize) + } + + // Try to read 4 pages. The first and third pages should be read directly + // from the "file", the second page should be read from the cache, and only + // 3 pages (the size of the file) should be readable. + rbuf := make([]byte, 4*usermem.PageSize) + dst := usermem.BytesIOSequence(rbuf) + n, err := iops.Read(ctx, file, dst, 0) + if n != 3*usermem.PageSize || (err != nil && err != io.EOF) { + t.Fatalf("Read got (%d, %v), want (%d, nil or EOF)", n, err, 3*usermem.PageSize) + } + rbuf = rbuf[:3*usermem.PageSize] + + // Did we get the bytes we expect? + if !bytes.Equal(rbuf, buf) { + t.Errorf("Read back bytes %v, want %v", rbuf, buf) + } + + // Delete the memory mapping and expect it to cause the cached page to be + // uncached. + iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + if cached := iops.cache.Span(); cached != 0 { + t.Fatalf("Span got %d, want 0", cached) + } +} + +func TestWrite(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 4-page file. + buf := pagesOf('a', 'b', 'c', 'd') + orig := append([]byte(nil), buf...) + inode := anonInode(ctx) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second and third pages (as + // CachingInodeOperations expects to only cache mapped pages), then call + // Translate to force them to be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 3 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + mr := memmap.MappableRange{usermem.PageSize, 3 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != 2*usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, 2*usermem.PageSize) + } + + // Write to the first 2 pages. + wbuf := pagesOf('e', 'f') + src := usermem.BytesIOSequence(wbuf) + n, err := iops.Write(ctx, src, 0) + if n != 2*usermem.PageSize || err != nil { + t.Fatalf("Write got (%d, %v), want (%d, nil)", n, err, 2*usermem.PageSize) + } + + // The first page should have been written directly, since it was not cached. + want := append([]byte(nil), orig...) + copy(want, pagesOf('e')) + if !bytes.Equal(buf, want) { + t.Errorf("File contents are %v, want %v", buf, want) + } + + // Sync back to the "backing file". + if err := iops.WriteOut(ctx, inode); err != nil { + t.Errorf("Sync got %v, want nil", err) + } + + // Now the second page should have been written as well. + copy(want[usermem.PageSize:], pagesOf('f')) + if !bytes.Equal(buf, want) { + t.Errorf("File contents are %v, want %v", buf, want) + } +} |