summaryrefslogtreecommitdiffhomepage
path: root/pkg/sentry/fs
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r-- pkg/sentry/fs/fsutil/frame_ref_set.go 40
-rw-r--r-- pkg/sentry/fs/g3doc/fuse.md 218
2 files changed, 150 insertions, 108 deletions
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go
index 6564fd0c6..dd6f5aba6 100644
--- a/pkg/sentry/fs/fsutil/frame_ref_set.go
+++ b/pkg/sentry/fs/fsutil/frame_ref_set.go
@@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/sentry/platform"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
)
// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
@@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
return val, val
}
+
+// IncRefAndAccount adds a reference on the range fr: existing segments in fr are incremented, gaps in fr get new segments with value 1.
+// Each newly inserted segment's length is added to usage.MemoryAccounting as usage.Mapped (host page cache memory mappings).
+func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
+ seg, gap := refs.Find(fr.Start) // locate fr.Start; the cases below handle whichever of seg/gap is Ok
+ for {
+ switch {
+ case seg.Ok() && seg.Start() < fr.End: // an existing segment starts before fr.End
+ seg = refs.Isolate(seg, fr) // presumably trims seg so it lies within fr — confirm against the segment set API
+ seg.SetValue(seg.Value() + 1) // one more reference on this part of fr
+ seg, gap = seg.NextNonEmpty()
+ case gap.Ok() && gap.Start() < fr.End: // an untracked gap starts before fr.End
+ newRange := gap.Range().Intersect(fr) // only the part of the gap that is inside fr
+ usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) // first reference: start accounting these bytes
+ seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() // continue from the inserted segment, not the stale gap
+ default: // no segment or gap starting before fr.End is left
+ refs.MergeAdjacent(fr) // presumably re-coalesces segments left unmerged above — confirm
+ return
+ }
+ }
+}
+
+// DecRefAndAccount removes a reference on the range fr, visiting segments from refs.FindSegment(fr.Start) while they start before fr.End.
+// Segments whose value was 1 are removed from the set and their length is subtracted from usage.MemoryAccounting as usage.Mapped.
+func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
+ seg := refs.FindSegment(fr.Start) // NOTE(review): if this is not Ok the loop is skipped; presumably callers hold references on all of fr — confirm
+
+ for seg.Ok() && seg.Start() < fr.End {
+ seg = refs.Isolate(seg, fr) // presumably trims seg so it lies within fr — confirm against the segment set API
+ if old := seg.Value(); old == 1 {
+ usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) // last reference: stop accounting these bytes
+ seg = refs.Remove(seg).NextSegment() // continue from what Remove returns, not the removed segment
+ } else {
+ seg.SetValue(old - 1)
+ seg = seg.NextSegment()
+ }
+ }
+ refs.MergeAdjacent(fr) // presumably re-coalesces segments left unmerged above — confirm
+}
diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md
index c3988aa43..635cc009b 100644
--- a/pkg/sentry/fs/g3doc/fuse.md
+++ b/pkg/sentry/fs/g3doc/fuse.md
@@ -7,20 +7,20 @@ currently incomplete and the document will be updated as things progress.
# FUSE: Filesystem in Userspace
-The sentry supports dispatching filesystem operations to a FUSE server,
-allowing FUSE filesystem to be used with a sandbox.
+The sentry supports dispatching filesystem operations to a FUSE server, allowing
+FUSE filesystems to be used with a sandbox.
## Overview
FUSE has two main components:
-1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards
- filesystem operations (usually initiated by syscalls) to the server.
+1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards
+ filesystem operations (usually initiated by syscalls) to the server.
-2. A server, which is a userspace daemon that implements the actual filesystem.
+2. A server, which is a userspace daemon that implements the actual filesystem.
-The sentry implements the client component, which allows a server daemon
-running within the sandbox to implement a filesystem within the sandbox.
+The sentry implements the client component, which allows a server daemon running
+within the sandbox to implement a filesystem within the sandbox.
A FUSE filesystem is initialized with `mount(2)`, typically with the help of a
utility like `fusermount(1)`. Various mount options exist for establishing
@@ -30,43 +30,43 @@ and server.
The FUSE device FD is obtained by opening `/dev/fuse`. During regular operation,
the client and server use the FUSE protocol described in `fuse(4)` to service
-filesystem operations. See the "Protocol" section below for more
-information about this protocol. The core of the sentry support for FUSE is the
-client-side implementation of this protocol.
+filesystem operations. See the "Protocol" section below for more information
+about this protocol. The core of the sentry support for FUSE is the client-side
+implementation of this protocol.
## FUSE in the Sentry
The sentry's FUSE client targets VFS2 and has the following components:
-- An implementation of `/dev/fuse`.
+- An implementation of `/dev/fuse`.
-- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
- VFS2, one point of contention may be the lack of inodes in VFS2. We can
- tentatively implement a kernfs-based filesystem to bridge the gap in APIs. The
- kernfs base functionality can serve the role of the Linux inode cache and, the
- filesystem can map VFS2 syscalls to kernfs inode operations; see the
- `kernfs.Inode` interface.
+- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting
+ VFS2, one point of contention may be the lack of inodes in VFS2. We can
+ tentatively implement a kernfs-based filesystem to bridge the gap in APIs.
+ The kernfs base functionality can serve the role of the Linux inode cache,
+ and the filesystem can map VFS2 syscalls to kernfs inode operations; see
+ the `kernfs.Inode` interface.
-The FUSE protocol lends itself well to marshaling with `go_marshal`. The
-various request and response packets can be defined in the ABI package and
-converted to and from the wire format using `go_marshal`.
+The FUSE protocol lends itself well to marshaling with `go_marshal`. The various
+request and response packets can be defined in the ABI package and converted to
+and from the wire format using `go_marshal`.
### Design Goals
-- While filesystem performance is always important, the sentry's FUSE support is
- primarily concerned with compatibility, with performance as a secondary
- concern.
+- While filesystem performance is always important, the sentry's FUSE support
+ is primarily concerned with compatibility, with performance as a secondary
+ concern.
-- Avoiding deadlocks from a hung server daemon.
+- Avoiding deadlocks from a hung server daemon.
-- Consider the potential for denial of service from a malicious server
- daemon. Protecting itself from userspace is already a design goal for the
- sentry, but needs additional consideration for FUSE. Normally, an operating
- system doesn't rely on userspace to make progress with filesystem
- operations. Since this changes with FUSE, it opens up the possibility of
- creating a chain of dependencies controlled by userspace, which could affect
- an entire sandbox. For example: a FUSE op can block a syscall, which could be
- holding a subsystem lock, which can then block another task goroutine.
+- Consider the potential for denial of service from a malicious server daemon.
+ Protecting itself from userspace is already a design goal for the sentry,
+ but needs additional consideration for FUSE. Normally, an operating system
+ doesn't rely on userspace to make progress with filesystem operations. Since
+ this changes with FUSE, it opens up the possibility of creating a chain of
+ dependencies controlled by userspace, which could affect an entire sandbox.
+ For example: a FUSE op can block a syscall, which could be holding a
+ subsystem lock, which can then block another task goroutine.
### Milestones
@@ -76,23 +76,23 @@ ops can be implemented in parallel.
#### Minimal client that can mount a trivial FUSE filesystem.
-- Implement `/dev/fuse`.
+- Implement `/dev/fuse`.
-- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
+- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`.
#### Read-only mount with basic file operations
-- Implement the majority of file, directory and file descriptor FUSE ops. For
- this milestone, we can skip uncommon or complex operations like mmap, mknod,
- file locking, poll, and extended attributes. We can stub these out along with
- any ops that modify the filesystem. The exact list of required ops are to be
- determined, but the goal is to mount a real filesystem as read-only, and be
- able to read contents from the filesystem in the sentry.
+- Implement the majority of file, directory and file descriptor FUSE ops. For
+ this milestone, we can skip uncommon or complex operations like mmap, mknod,
+ file locking, poll, and extended attributes. We can stub these out along
+ with any ops that modify the filesystem. The exact list of required ops is
+ to be determined, but the goal is to mount a real filesystem as read-only,
+ and be able to read contents from the filesystem in the sentry.
#### Full read-write support
-- Implement the remaining FUSE ops and decide if we can omit rarely used
- operations like ioctl.
+- Implement the remaining FUSE ops and decide if we can omit rarely used
+ operations like ioctl.
# Appendix
@@ -145,19 +145,19 @@ operations map to the sentry virtual filesystem.
These operations are specific to FUSE and don't have a corresponding action in a
generic filesystem.
-- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
- first message sent by the client after mount. This is used for version and
- feature negotiation. This is related to `mount(2)`.
-- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`.
-- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
- `fuse_in_header.unique` value provided in the corresponding request
- header. The client can send at most one of these per request, and will enter
- an uninterruptible wait for a reply. The server is expected to reply promptly.
-- `FUSE_FORGET`: A hint to the server that server should evict the indicate node
- from any caches. This is wired up to `(struct super_operations).evict_inode`
- in Linux, which is in turned hooked as the inode cache shrinker which is
- typically triggered by system memory pressure.
-- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
+- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the
+ first message sent by the client after mount. This is used for version and
+ feature negotiation. This is related to `mount(2)`.
+- `FUSE_DESTROY`: Tears down a FUSE filesystem; related to `umount(2)`.
+- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the
+ `fuse_in_header.unique` value provided in the corresponding request header.
+ The client can send at most one of these per request, and will enter an
+ uninterruptible wait for a reply. The server is expected to reply promptly.
+- `FUSE_FORGET`: A hint to the server that it should evict the indicated
+ node from any caches. This is wired up to `(struct
+ super_operations).evict_inode` in Linux, which is in turn hooked as the
+ inode cache shrinker which is typically triggered by system memory pressure.
+- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`.
#### Filesystem Syscalls
@@ -167,92 +167,94 @@ otherwise noted.
Node creation:
-- `FUSE_MKNOD`
-- `FUSE_MKDIR`
-- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
- atomically creates and opens a node.
+- `FUSE_MKNOD`
+- `FUSE_MKDIR`
+- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which
+ atomically creates and opens a node.
Node attributes and extended attributes:
-- `FUSE_GETATTR`
-- `FUSE_SETATTR`
-- `FUSE_SETXATTR`
-- `FUSE_GETXATTR`
-- `FUSE_LISTXATTR`
-- `FUSE_REMOVEXATTR`
+- `FUSE_GETATTR`
+- `FUSE_SETATTR`
+- `FUSE_SETXATTR`
+- `FUSE_GETXATTR`
+- `FUSE_LISTXATTR`
+- `FUSE_REMOVEXATTR`
Node link manipulation:
-- `FUSE_READLINK`
-- `FUSE_LINK`
-- `FUSE_SYMLINK`
-- `FUSE_UNLINK`
+- `FUSE_READLINK`
+- `FUSE_LINK`
+- `FUSE_SYMLINK`
+- `FUSE_UNLINK`
Directory operations:
-- `FUSE_RMDIR`
-- `FUSE_RENAME`
-- `FUSE_RENAME2`
-- `FUSE_OPENDIR`: `open(2)` for directories.
-- `FUSE_RELEASEDIR`: `close(2)` for directories.
-- `FUSE_READDIR`
-- `FUSE_READDIRPLUS`
-- `FUSE_FSYNCDIR`: `fsync(2)` for directories.
-- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
- reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
- component to a node. However the returned identifier is opaque to the
- client. The server must remember this mapping, as this is how the client will
- reference the node in the future.
+- `FUSE_RMDIR`
+- `FUSE_RENAME`
+- `FUSE_RENAME2`
+- `FUSE_OPENDIR`: `open(2)` for directories.
+- `FUSE_RELEASEDIR`: `close(2)` for directories.
+- `FUSE_READDIR`
+- `FUSE_READDIRPLUS`
+- `FUSE_FSYNCDIR`: `fsync(2)` for directories.
+- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is
+ reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path
+ component to a node. However, the returned identifier is opaque to the
+ client. The server must remember this mapping, as this is how the client
+ will reference the node in the future.
File operations:
-- `FUSE_OPEN`: `open(2)` for files.
-- `FUSE_RELEASE`: `close(2)` for files.
-- `FUSE_FSYNC`
-- `FUSE_FALLOCATE`
-- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
-- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
+- `FUSE_OPEN`: `open(2)` for files.
+- `FUSE_RELEASE`: `close(2)` for files.
+- `FUSE_FSYNC`
+- `FUSE_FALLOCATE`
+- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`.
+- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`.
File locking:
-- `FUSE_GETLK`
-- `FUSE_SETLK`
-- `FUSE_SETLKW`
-- `FUSE_COPY_FILE_RANGE`
+- `FUSE_GETLK`
+- `FUSE_SETLK`
+- `FUSE_SETLKW`
+- `FUSE_COPY_FILE_RANGE`
File descriptor operations:
-- `FUSE_IOCTL`
-- `FUSE_POLL`
-- `FUSE_LSEEK`
+- `FUSE_IOCTL`
+- `FUSE_POLL`
+- `FUSE_LSEEK`
Filesystem operations:
-- `FUSE_STATFS`
+- `FUSE_STATFS`
#### Permissions
-- `FUSE_ACCESS` is used to check if a node is accessible, as part of many
- syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt`
- in the sentry.
+- `FUSE_ACCESS` is used to check if a node is accessible, as part of many
+ syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the
+ sentry.
#### I/O Operations
These ops are used to read and write file pages. They're used to implement both
I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`.
-- `FUSE_READ`
-- `FUSE_WRITE`
+- `FUSE_READ`
+- `FUSE_WRITE`
#### Miscellaneous
-- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
- closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` syscall
- from the user. Maps to `vfs.FileDescriptorImpl.Release` in the sentry.
-- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
-- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
+- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is
+ closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)`
+ syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the
+ sentry.
+- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed.
+- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?]
# References
-- `fuse(4)` manpage.
-- Linux kernel FUSE documentation: https://www.kernel.org/doc/html/latest/filesystems/fuse.html
+- `fuse(4)` manpage.
+- Linux kernel FUSE documentation:
+ https://www.kernel.org/doc/html/latest/filesystems/fuse.html