diff options
Diffstat (limited to 'pkg/sentry/fs')
-rw-r--r-- | pkg/sentry/fs/fsutil/frame_ref_set.go | 40 | ||||
-rw-r--r-- | pkg/sentry/fs/g3doc/fuse.md | 218 |
2 files changed, 150 insertions, 108 deletions
diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go index 6564fd0c6..dd6f5aba6 100644 --- a/pkg/sentry/fs/fsutil/frame_ref_set.go +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -18,6 +18,7 @@ import ( "math" "gvisor.dev/gvisor/pkg/sentry/platform" + "gvisor.dev/gvisor/pkg/sentry/usage" ) // FrameRefSetFunctions implements segment.Functions for FrameRefSet. @@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform. func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { return val, val } + +// IncRefAndAccount adds a reference on the range fr. All newly inserted segments +// are accounted as host page cache memory mappings. +func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) { + seg, gap := refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + refs.MergeAdjacent(fr) + return + } + } +} + +// DecRefAndAccount removes a reference on the range fr and untracks segments +// that are removed from memory accounting. 
+func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) { + seg := refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + seg = refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/g3doc/fuse.md b/pkg/sentry/fs/g3doc/fuse.md index c3988aa43..635cc009b 100644 --- a/pkg/sentry/fs/g3doc/fuse.md +++ b/pkg/sentry/fs/g3doc/fuse.md @@ -7,20 +7,20 @@ currently incomplete and the document will be updated as things progress. # FUSE: Filesystem in Userspace -The sentry supports dispatching filesystem operations to a FUSE server, -allowing FUSE filesystem to be used with a sandbox. +The sentry supports dispatching filesystem operations to a FUSE server, allowing +FUSE filesystem to be used with a sandbox. ## Overview FUSE has two main components: -1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards - filesystem operations (usually initiated by syscalls) to the server. +1. A client kernel driver (canonically `fuse.ko` in Linux), which forwards + filesystem operations (usually initiated by syscalls) to the server. -2. A server, which is a userspace daemon that implements the actual filesystem. +2. A server, which is a userspace daemon that implements the actual filesystem. -The sentry implements the client component, which allows a server daemon -running within the sandbox to implement a filesystem within the sandbox. +The sentry implements the client component, which allows a server daemon running +within the sandbox to implement a filesystem within the sandbox. A FUSE filesystem is initialized with `mount(2)`, typically with the help of a utility like `fusermount(1)`. Various mount options exist for establishing @@ -30,43 +30,43 @@ and server. The FUSE device FD is obtained by opening `/dev/fuse`. 
During regular operation, the client and server use the FUSE protocol described in `fuse(4)` to service -filesystem operations. See the "Protocol" section below for more -information about this protocol. The core of the sentry support for FUSE is the -client-side implementation of this protocol. +filesystem operations. See the "Protocol" section below for more information +about this protocol. The core of the sentry support for FUSE is the client-side +implementation of this protocol. ## FUSE in the Sentry The sentry's FUSE client targets VFS2 and has the following components: -- An implementation of `/dev/fuse`. +- An implementation of `/dev/fuse`. -- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting - VFS2, one point of contention may be the lack of inodes in VFS2. We can - tentatively implement a kernfs-based filesystem to bridge the gap in APIs. The - kernfs base functionality can serve the role of the Linux inode cache, and the - filesystem can map VFS2 syscalls to kernfs inode operations; see the - `kernfs.Inode` interface. +- A VFS2 filesystem for mapping syscalls to FUSE ops. Since we're targeting + VFS2, one point of contention may be the lack of inodes in VFS2. We can + tentatively implement a kernfs-based filesystem to bridge the gap in APIs. + The kernfs base functionality can serve the role of the Linux inode cache, + and the filesystem can map VFS2 syscalls to kernfs inode operations; see + the `kernfs.Inode` interface. -The FUSE protocol lends itself well to marshaling with `go_marshal`. The -various request and response packets can be defined in the ABI package and -converted to and from the wire format using `go_marshal`. +The FUSE protocol lends itself well to marshaling with `go_marshal`. The various +request and response packets can be defined in the ABI package and converted to +and from the wire format using `go_marshal`. 
### Design Goals -- While filesystem performance is always important, the sentry's FUSE support is - primarily concerned with compatibility, with performance as a secondary - concern. +- While filesystem performance is always important, the sentry's FUSE support + is primarily concerned with compatibility, with performance as a secondary + concern. -- Avoiding deadlocks from a hung server daemon. +- Avoiding deadlocks from a hung server daemon. -- Consider the potential for denial of service from a malicious server - daemon. Protecting itself from userspace is already a design goal for the - sentry, but needs additional consideration for FUSE. Normally, an operating - system doesn't rely on userspace to make progress with filesystem - operations. Since this changes with FUSE, it opens up the possibility of - creating a chain of dependencies controlled by userspace, which could affect - an entire sandbox. For example: a FUSE op can block a syscall, which could be - holding a subsystem lock, which can then block another task goroutine. +- Consider the potential for denial of service from a malicious server daemon. + Protecting itself from userspace is already a design goal for the sentry, + but needs additional consideration for FUSE. Normally, an operating system + doesn't rely on userspace to make progress with filesystem operations. Since + this changes with FUSE, it opens up the possibility of creating a chain of + dependencies controlled by userspace, which could affect an entire sandbox. + For example: a FUSE op can block a syscall, which could be holding a + subsystem lock, which can then block another task goroutine. ### Milestones @@ -76,23 +76,23 @@ ops can be implemented in parallel. #### Minimal client that can mount a trivial FUSE filesystem. -- Implement `/dev/fuse`. +- Implement `/dev/fuse`. -- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. +- Implement basic FUSE ops like `FUSE_INIT`, `FUSE_DESTROY`. 
#### Read-only mount with basic file operations -- Implement the majority of file, directory and file descriptor FUSE ops. For - this milestone, we can skip uncommon or complex operations like mmap, mknod, - file locking, poll, and extended attributes. We can stub these out along with - any ops that modify the filesystem. The exact list of required ops is to be - determined, but the goal is to mount a real filesystem as read-only, and be - able to read contents from the filesystem in the sentry. +- Implement the majority of file, directory and file descriptor FUSE ops. For + this milestone, we can skip uncommon or complex operations like mmap, mknod, + file locking, poll, and extended attributes. We can stub these out along + with any ops that modify the filesystem. The exact list of required ops is + to be determined, but the goal is to mount a real filesystem as read-only, + and be able to read contents from the filesystem in the sentry. #### Full read-write support -- Implement the remaining FUSE ops and decide if we can omit rarely used - operations like ioctl. +- Implement the remaining FUSE ops and decide if we can omit rarely used + operations like ioctl. # Appendix @@ -145,19 +145,19 @@ operations map to the sentry virtual filesystem. These operations are specific to FUSE and don't have a corresponding action in a generic filesystem. -- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the - first message sent by the client after mount. This is used for version and - feature negotiation. This is related to `mount(2)`. -- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`. -- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the - `fuse_in_header.unique` value provided in the corresponding request - header. The client can send at most one of these per request, and will enter - an uninterruptible wait for a reply. The server is expected to reply promptly. 
-- `FUSE_FORGET`: A hint to the server that the server should evict the indicated node - from any caches. This is wired up to `(struct super_operations).evict_inode` - in Linux, which is in turn hooked as the inode cache shrinker which is - typically triggered by system memory pressure. -- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`. +- `FUSE_INIT`: This operation initializes a new FUSE filesystem, and is the + first message sent by the client after mount. This is used for version and + feature negotiation. This is related to `mount(2)`. +- `FUSE_DESTROY`: Teardown a FUSE filesystem, related to `unmount(2)`. +- `FUSE_INTERRUPT`: Interrupts an in-flight operation, specified by the + `fuse_in_header.unique` value provided in the corresponding request header. + The client can send at most one of these per request, and will enter an + uninterruptible wait for a reply. The server is expected to reply promptly. +- `FUSE_FORGET`: A hint to the server that the server should evict the indicated + node from any caches. This is wired up to `(struct + super_operations).evict_inode` in Linux, which is in turn hooked as the + inode cache shrinker which is typically triggered by system memory pressure. +- `FUSE_BATCH_FORGET`: Batch version of `FUSE_FORGET`. #### Filesystem Syscalls @@ -167,92 +167,94 @@ otherwise noted. Node creation: -- `FUSE_MKNOD` -- `FUSE_MKDIR` -- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which - atomically creates and opens a node. +- `FUSE_MKNOD` +- `FUSE_MKDIR` +- `FUSE_CREATE`: This is equivalent to `open(2)` and `creat(2)`, which + atomically creates and opens a node. 
Node attributes and extended attributes: -- `FUSE_GETATTR` -- `FUSE_SETATTR` -- `FUSE_SETXATTR` -- `FUSE_GETXATTR` -- `FUSE_LISTXATTR` -- `FUSE_REMOVEXATTR` +- `FUSE_GETATTR` +- `FUSE_SETATTR` +- `FUSE_SETXATTR` +- `FUSE_GETXATTR` +- `FUSE_LISTXATTR` +- `FUSE_REMOVEXATTR` Node link manipulation: -- `FUSE_READLINK` -- `FUSE_LINK` -- `FUSE_SYMLINK` -- `FUSE_UNLINK` +- `FUSE_READLINK` +- `FUSE_LINK` +- `FUSE_SYMLINK` +- `FUSE_UNLINK` Directory operations: -- `FUSE_RMDIR` -- `FUSE_RENAME` -- `FUSE_RENAME2` -- `FUSE_OPENDIR`: `open(2)` for directories. -- `FUSE_RELEASEDIR`: `close(2)` for directories. -- `FUSE_READDIR` -- `FUSE_READDIRPLUS` -- `FUSE_FSYNCDIR`: `fsync(2)` for directories. -- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is - reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path - component to a node. However the returned identifier is opaque to the - client. The server must remember this mapping, as this is how the client will - reference the node in the future. +- `FUSE_RMDIR` +- `FUSE_RENAME` +- `FUSE_RENAME2` +- `FUSE_OPENDIR`: `open(2)` for directories. +- `FUSE_RELEASEDIR`: `close(2)` for directories. +- `FUSE_READDIR` +- `FUSE_READDIRPLUS` +- `FUSE_FSYNCDIR`: `fsync(2)` for directories. +- `FUSE_LOOKUP`: Establishes a unique identifier for a FS node. This is + reminiscent of `VirtualFilesystem.GetDentryAt` in that it resolves a path + component to a node. However the returned identifier is opaque to the + client. The server must remember this mapping, as this is how the client + will reference the node in the future. File operations: -- `FUSE_OPEN`: `open(2)` for files. -- `FUSE_RELEASE`: `close(2)` for files. -- `FUSE_FSYNC` -- `FUSE_FALLOCATE` -- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`. -- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`. +- `FUSE_OPEN`: `open(2)` for files. +- `FUSE_RELEASE`: `close(2)` for files. 
+- `FUSE_FSYNC` +- `FUSE_FALLOCATE` +- `FUSE_SETUPMAPPING`: Creates a memory map on a file for `mmap(2)`. +- `FUSE_REMOVEMAPPING`: Removes a memory map for `munmap(2)`. File locking: -- `FUSE_GETLK` -- `FUSE_SETLK` -- `FUSE_SETLKW` -- `FUSE_COPY_FILE_RANGE` +- `FUSE_GETLK` +- `FUSE_SETLK` +- `FUSE_SETLKW` +- `FUSE_COPY_FILE_RANGE` File descriptor operations: -- `FUSE_IOCTL` -- `FUSE_POLL` -- `FUSE_LSEEK` +- `FUSE_IOCTL` +- `FUSE_POLL` +- `FUSE_LSEEK` Filesystem operations: -- `FUSE_STATFS` +- `FUSE_STATFS` #### Permissions -- `FUSE_ACCESS` is used to check if a node is accessible, as part of many - syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` - in the sentry. +- `FUSE_ACCESS` is used to check if a node is accessible, as part of many + syscall implementations. Maps to `vfs.FilesystemImpl.AccessAt` in the + sentry. #### I/O Operations These ops are used to read and write file pages. They're used to implement both I/O syscalls like `read(2)`, `write(2)` and `mmap(2)`. -- `FUSE_READ` -- `FUSE_WRITE` +- `FUSE_READ` +- `FUSE_WRITE` #### Miscellaneous -- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is - closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` syscall - from the user. Maps to `vfs.FileDescriptorImpl.Release` in the sentry. -- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed. -- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?] +- `FUSE_FLUSH`: Used by the client to indicate when a file descriptor is + closed. Distinct from `FUSE_FSYNC`, which corresponds to an `fsync(2)` + syscall from the user. Maps to `vfs.FileDescriptorImpl.Release` in the + sentry. +- `FUSE_BMAP`: Old address space API for block defrag. Probably not needed. +- `FUSE_NOTIFY_REPLY`: [TODO: what does this do?] # References -- `fuse(4)` manpage. -- Linux kernel FUSE documentation: https://www.kernel.org/doc/html/latest/filesystems/fuse.html +- `fuse(4)` manpage. 
+- Linux kernel FUSE documentation: + https://www.kernel.org/doc/html/latest/filesystems/fuse.html |