Diffstat (limited to 'pkg/sentry/fs')
185 files changed, 33876 insertions, 0 deletions
diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD new file mode 100644 index 000000000..9b7264753 --- /dev/null +++ b/pkg/sentry/fs/BUILD @@ -0,0 +1,154 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "fs_state", + srcs = [ + "attr.go", + "dentry.go", + "dirent.go", + "dirent_cache.go", + "dirent_list.go", + "dirent_state.go", + "file.go", + "file_overlay.go", + "file_state.go", + "filesystems.go", + "flags.go", + "inode.go", + "inode_inotify.go", + "inode_operations.go", + "inode_overlay.go", + "inotify.go", + "inotify_event.go", + "inotify_watch.go", + "mock.go", + "mount.go", + "mount_overlay.go", + "mount_state.go", + "mounts.go", + "overlay.go", + "path.go", + ], + out = "fs_state.go", + package = "fs", +) + +go_library( + name = "fs", + srcs = [ + "attr.go", + "context.go", + "copy_up.go", + "dentry.go", + "dirent.go", + "dirent_cache.go", + "dirent_list.go", + "dirent_state.go", + "file.go", + "file_operations.go", + "file_overlay.go", + "file_state.go", + "filesystems.go", + "flags.go", + "fs.go", + "fs_state.go", + "inode.go", + "inode_inotify.go", + "inode_operations.go", + "inode_overlay.go", + "inotify.go", + "inotify_event.go", + "inotify_watch.go", + "mock.go", + "mount.go", + "mount_overlay.go", + "mount_state.go", + "mounts.go", + "offset.go", + "overlay.go", + "path.go", + "restore.go", + "save.go", + "seek.go", + "sync.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/ilist", + "//pkg/log", + "//pkg/p9", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/limits", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_template_instance( + name = "dirent_list", + out = "dirent_list.go", + package = "fs", + prefix = "dirent", + template = "//pkg/ilist:generic_list", + types = { + "Linker": "*Dirent", + }, +) + +go_test( + name = "fs_x_test", + size = "small", + srcs = [ + "copy_up_test.go", + "file_overlay_test.go", + "inode_overlay_test.go", + "mounts_test.go", + ], + deps = [ + ":fs", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/usermem", + "//pkg/syserror", + ], +) + +go_test( + name = "fs_test", + size = "small", + srcs = [ + "dirent_cache_test.go", + "dirent_refs_test.go", + "file_test.go", + "mount_test.go", + "path_test.go", + ], + embed = [":fs"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + ], +) diff --git a/pkg/sentry/fs/README.md b/pkg/sentry/fs/README.md new file mode 100644 index 000000000..898271ee8 --- /dev/null +++ b/pkg/sentry/fs/README.md @@ -0,0 +1,217 @@ +This package provides an implementation of the Linux virtual filesystem. + +[TOC] + +## Overview + +- An `fs.Dirent` caches an `fs.Inode` in memory at a path in the VFS, giving + the `fs.Inode` a relative position with respect to other `fs.Inode`s. 
- If an `fs.Dirent` is referenced by two file descriptors, then those file
  descriptors are coherent with each other: they depend on the same
  `fs.Inode`.

- A mount point is an `fs.Dirent` for which `fs.Dirent.mounted` is true. It
  exposes the root of a mounted filesystem.

- The `fs.Inode` produced by a registered filesystem on mount(2) owns an
  `fs.MountedFilesystem` from which other `fs.Inode`s will be looked up. For a
  remote filesystem, the `fs.MountedFilesystem` owns the connection to that
  remote filesystem.

- In general:

```
fs.Inode <------------------------------
   |                                   |
   |                                   |
produced by                            |
exactly one                            |
   |                   responsible for the
   |                   virtual identity of
   v                                   |
fs.MountedFilesystem -------------------
```

Glossary:

- VFS: virtual filesystem.

- inode: a virtual file object holding a cached view of a file on a backing
  filesystem (includes metadata and page caches).

- superblock: the virtual state of a mounted filesystem (e.g. the virtual
  inode number set).

- mount namespace: a view of the mounts under a root (during path traversal,
  the VFS makes visible/follows the mount point that is in the current task's
  mount namespace).

## Save and restore

An application's hard dependencies on filesystem state can be broken down into
two categories:

- The state necessary to execute a traversal on or view the *virtual*
  filesystem hierarchy, regardless of what files an application has open.

- The state necessary to represent open files.

The first is always necessary to save and restore. An application may never
have any open file descriptors, but across save and restore it should see a
coherent view of any mount namespace. NOTE: Currently only one "initial" mount
namespace is supported.

The second is so that system calls across save and restore are coherent with
each other (e.g. so that unintended re-reads or overwrites do not occur).

Specifically this state is:

- An `fs.MountManager` containing mount points.

- A `kernel.FDMap` containing pointers to open files.

Anything else managed by the VFS that can be easily loaded into memory from a
filesystem is synced back to those filesystems and is not saved. Examples are
pages in page caches used for optimizations (i.e. readahead and writeback), and
directory entries used to accelerate path lookups.

### Mount points

Saving and restoring a mount point means saving and restoring:

- The root of the mounted filesystem.

- Mount flags, which control how the VFS interacts with the mounted
  filesystem.

- Any relevant metadata about the mounted filesystem.

- All `fs.Inode`s referenced by the application that reside under the mount
  point.

`fs.MountedFilesystem` is metadata about a filesystem that is mounted. It is
referenced by every `fs.Inode` loaded into memory under the mount point,
including the `fs.Inode` of the mount point itself. The `fs.MountedFilesystem`
maps file objects on the filesystem to a virtualized `fs.Inode` number and vice
versa.

To restore all `fs.Inode`s under a given mount point, each `fs.Inode` leverages
its dependency on an `fs.MountedFilesystem`. Since the `fs.MountedFilesystem`
knows how an `fs.Inode` maps to a file object on a backing filesystem, this
mapping can be trivially consulted by each `fs.Inode` when the `fs.Inode` is
restored.
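In other words, once a mount's root has been re-established, recovering an
inode's backing file reduces to two lookups: the mount root by saved device
name, then the inode's saved path relative to that root. A minimal sketch of
that idea, using hypothetical stand-in types (`savedMount`, `restoreEnv`, and
`reopenPath` are illustrative only, not the sentry's actual API):

```go
package main

import (
	"fmt"
	"path/filepath"
)

// savedMount stands in for the state an fs.MountedFilesystem might save:
// the device name it was mounted from and the inode-number-to-path mappings
// installed before state.Save.
type savedMount struct {
	DeviceName string
	InodePaths map[uint64]string // virtual inode number -> path relative to the mount root
}

// restoreEnv stands in for a restore environment that resolves a saved
// device name back to the root of the re-mounted filesystem.
type restoreEnv struct {
	Roots map[string]string // device name -> restored mount root
}

// reopenPath shows the two lookups a restored inode performs: its mount's
// root via the restore environment, then its own relative path via the
// saved inode mappings.
func reopenPath(env restoreEnv, m savedMount, ino uint64) (string, error) {
	root, ok := env.Roots[m.DeviceName]
	if !ok {
		return "", fmt.Errorf("no mount for device %q in the restore environment", m.DeviceName)
	}
	rel, ok := m.InodePaths[ino]
	if !ok {
		return "", fmt.Errorf("inode %d has no saved path", ino)
	}
	return filepath.Join(root, rel), nil
}

func main() {
	env := restoreEnv{Roots: map[string]string{"vda": "/restored/mount/root"}}
	m := savedMount{
		DeviceName: "vda",
		InodePaths: map[uint64]string{42: "dir0/dir1/file"},
	}
	fmt.Println(reopenPath(env, m, 42)) // /restored/mount/root/dir0/dir1/file <nil>
}
```

In the real package these lookups are driven by the `fs.MountedFilesystem` and
a global `fs.RestoreEnvironment` during state.Load, as described next.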
In detail, a mount point is saved in two steps:

- First, after the kernel is paused but before state.Save, we walk all mount
  namespaces and install a mapping from `fs.Inode` numbers to file paths
  relative to the root of the mounted filesystem in each
  `fs.MountedFilesystem`. This is subsequently called the set of `fs.Inode`
  mappings.

- Second, during state.Save, each `fs.MountedFilesystem` decides whether to
  save the set of `fs.Inode` mappings. In-memory filesystems, like tmpfs, have
  no need to save a set of `fs.Inode` mappings, since the `fs.Inode`s can be
  entirely encoded in the state file. Each `fs.MountedFilesystem` also
  optionally saves the device name from when the filesystem was originally
  mounted. Each `fs.Inode` saves its virtual identifier and a reference to a
  `fs.MountedFilesystem`.

A mount point is restored in two steps:

- First, before state.Load, all mount configurations are stored in a global
  `fs.RestoreEnvironment`. This tells us what mount points the user wants to
  restore and how to re-establish pointers to backing filesystems.

- Second, during state.Load, each `fs.MountedFilesystem` optionally searches
  for a mount in the `fs.RestoreEnvironment` that matches its saved device
  name. The `fs.MountedFilesystem` then re-establishes a pointer to the root
  of the mounted filesystem. For example, the mount specification provides the
  network connection for a mounted remote filesystem client to communicate
  with its remote file server. The `fs.MountedFilesystem` also trivially loads
  its set of `fs.Inode` mappings. When an `fs.Inode` is encountered, the
  `fs.Inode` loads its virtual identifier and its reference to a
  `fs.MountedFilesystem`. It uses the `fs.MountedFilesystem` to obtain the
  root of the mounted filesystem and the `fs.Inode` mappings to obtain the
  relative file path to its data. With these, the `fs.Inode` re-establishes a
  pointer to its file object.

A mount point can trivially restore its `fs.Inode`s in parallel since
`fs.Inode`s have a restore dependency on their `fs.MountedFilesystem` and not
on each other.

### Open files

An `fs.File` references the following filesystem objects:

```go
fs.File -> fs.Dirent -> fs.Inode -> fs.MountedFilesystem
```

The `fs.Inode` is restored using its `fs.MountedFilesystem`. The [Mount
points](#mount-points) section above describes how this happens in detail. The
`fs.Dirent` restores its pointer to an `fs.Inode`, pointers to parent and
children `fs.Dirent`s, and the basename of the file.

Otherwise an `fs.File` restores flags, an offset, and a unique identifier (only
used internally).

It may use the `fs.Inode`, which it indirectly holds a reference on through the
`fs.Dirent`, to re-establish an open file handle on the backing filesystem
(e.g. to continue reading and writing).

## Overlay

The overlay implementation in the fs package takes Linux overlayfs as a frame
of reference but corrects for several POSIX consistency errors.

In Linux overlayfs, the `struct inode` used for reading and writing to the same
file may be different. This is because the `struct inode` is dissociated from
the process of copying up the file from the lower to the upper directory. Since
flock(2) and fcntl(2) locks, inotify(7) watches, page caches, and a file's
identity are all stored directly or indirectly off the `struct inode`, these
properties of the `struct inode` may be stale after the first modification.
This +can lead to file locking bugs, missed inotify events, and inconsistent data in +shared memory mappings of files, to name a few problems. + +The fs package maintains a single `fs.Inode` to represent a directory entry in +an overlay and defines operations on this `fs.Inode` which synchronize with the +copy up process. This achieves several things: + ++ File locks, inotify watches, and the identity of the file need not be copied + at all. + ++ Memory mappings of files coordinate with the copy up process so that if a + file in the lower directory is memory mapped, all references to it are + invalidated, forcing the application to re-fault on memory mappings of the + file under the upper directory. + +The `fs.Inode` holds metadata about files in the upper and/or lower directories +via an `fs.overlayEntry`. The `fs.overlayEntry` implements the `fs.Mappable` +interface. It multiplexes between upper and lower directory memory mappings and +stores a copy of memory references so they can be transferred to the upper +directory `fs.Mappable` when the file is copied up. + +The `fs.Inode` also holds a reference to a `fs.MountedFilesystem` that +normalizes across the mounted filesystem state of the upper and lower +directories. + +When a file is copied from the lower to the upper directory, attempts to +interact with the file block until the copy completes. All copying synchronizes +with rename(2). + +## Future Work + +### Overlay + +When a file is copied from a lower directory to an upper directory, several +locks are taken: the global renamuMu and the copyMu of the `fs.Inode` being +copied. This blocks operations on the file, including fault handling of memory +mappings. Performance could be improved by copying files into a temporary +directory that resides on the same filesystem as the upper directory and doing +an atomic rename, holding locks only during the rename operation. + +Additionally files are copied up synchronously. For large files, this causes a +noticeable latency. Performance could be improved by pipelining copies at +non-overlapping file offsets. diff --git a/pkg/sentry/fs/anon/BUILD b/pkg/sentry/fs/anon/BUILD new file mode 100644 index 000000000..6b18aee47 --- /dev/null +++ b/pkg/sentry/fs/anon/BUILD @@ -0,0 +1,21 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "anon", + srcs = [ + "anon.go", + "device.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/anon/anon.go b/pkg/sentry/fs/anon/anon.go new file mode 100644 index 000000000..ddc2c0985 --- /dev/null +++ b/pkg/sentry/fs/anon/anon.go @@ -0,0 +1,46 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package anon implements an anonymous inode, useful for implementing +// inodes for pseudo filesystems. +package anon + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// NewInode constructs an anonymous Inode that is not associated +// with any real filesystem. Some types depend on completely pseudo +// "anon" inodes (eventfds, epollfds, etc). +func NewInode(ctx context.Context) *fs.Inode { + return fs.NewInode(fsutil.NewSimpleInodeOperations(fsutil.InodeSimpleAttributes{ + FSType: linux.ANON_INODE_FS_MAGIC, + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + DeviceID: PseudoDevice.DeviceID(), + InodeID: PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + }) +} diff --git a/pkg/sentry/fs/anon/device.go b/pkg/sentry/fs/anon/device.go new file mode 100644 index 000000000..1c666729c --- /dev/null +++ b/pkg/sentry/fs/anon/device.go @@ -0,0 +1,22 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package anon + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// PseudoDevice is the device on which all anonymous inodes reside. 
+var PseudoDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/ashmem/BUILD b/pkg/sentry/fs/ashmem/BUILD new file mode 100644 index 000000000..e20e22a0f --- /dev/null +++ b/pkg/sentry/fs/ashmem/BUILD @@ -0,0 +1,83 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") +load("//tools/go_generics:defs.bzl", "go_template_instance") + +go_stateify( + name = "ashmem_state", + srcs = [ + "area.go", + "device.go", + "pin_board.go", + "uint64_range.go", + "uint64_set.go", + ], + out = "ashmem_state.go", + package = "ashmem", +) + +go_library( + name = "ashmem", + srcs = [ + "area.go", + "ashmem_state.go", + "device.go", + "pin_board.go", + "uint64_range.go", + "uint64_set.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + ], +) + +go_test( + name = "ashmem_test", + size = "small", + srcs = ["pin_board_test.go"], + embed = [":ashmem"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/usermem", + ], +) + +go_template_instance( + name = "uint64_range", + out = "uint64_range.go", + package = "ashmem", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "uint64_set", + out = "uint64_set.go", + package = "ashmem", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "Range", + "Value": "noValue", + "Functions": "setFunctions", + }, +) diff --git a/pkg/sentry/fs/ashmem/area.go b/pkg/sentry/fs/ashmem/area.go new file mode 100644 index 000000000..e4f76f0d0 --- /dev/null +++ b/pkg/sentry/fs/ashmem/area.go @@ -0,0 +1,313 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ashmem + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + // namePrefix is the name prefix assumed and forced by the Linux implementation. + namePrefix = "dev/ashmem" + + // nameLen is the maximum name length. + nameLen = 256 +) + +// Area implements fs.FileOperations. 
+type Area struct { + fsutil.NoFsync + fsutil.DeprecatedFileOperations + fsutil.NotDirReaddir + + ad *Device + + // mu protects fields below. + mu sync.Mutex `state:"nosave"` + tmpfsFile *fs.File + name string + size uint64 + perms usermem.AccessType + pb *PinBoard +} + +// Release implements fs.FileOperations.Release. +func (a *Area) Release() { + a.mu.Lock() + defer a.mu.Unlock() + if a.tmpfsFile != nil { + a.tmpfsFile.DecRef() + a.tmpfsFile = nil + } +} + +// Seek implements fs.FileOperations.Seek. +func (a *Area) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return 0, syserror.EINVAL + } + if a.tmpfsFile == nil { + return 0, syserror.EBADF + } + return a.tmpfsFile.FileOperations.Seek(ctx, file, whence, offset) +} + +// Read implements fs.FileOperations.Read. +func (a *Area) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return 0, nil + } + if a.tmpfsFile == nil { + return 0, syserror.EBADF + } + return a.tmpfsFile.FileOperations.Read(ctx, file, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (a *Area) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return 0, syserror.ENOSYS +} + +// Flush implements fs.FileOperations.Flush. +func (a *Area) Flush(ctx context.Context, file *fs.File) error { + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (a *Area) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + a.mu.Lock() + defer a.mu.Unlock() + if a.size == 0 { + return syserror.EINVAL + } + + if !a.perms.SupersetOf(opts.Perms) { + return syserror.EPERM + } + opts.MaxPerms = opts.MaxPerms.Intersect(a.perms) + + if a.tmpfsFile == nil { + p := platform.FromContext(ctx) + if p == nil { + return syserror.ENOMEM + } + tmpfsInodeOps := tmpfs.NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{}, p) + // This is not backed by a real filesystem, so we pass in nil. + tmpfsInode := fs.NewInode(tmpfsInodeOps, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{}) + dirent := fs.NewDirent(tmpfsInode, namePrefix+"/"+a.name) + tmpfsFile, err := tmpfsInode.GetFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}) + // Drop the extra reference on the Dirent. + dirent.DecRef() + + if err != nil { + return err + } + + // Truncate to the size set by ASHMEM_SET_SIZE ioctl. + err = tmpfsInodeOps.Truncate(ctx, tmpfsInode, int64(a.size)) + if err != nil { + return err + } + a.tmpfsFile = tmpfsFile + a.pb = NewPinBoard() + } + + return a.tmpfsFile.ConfigureMMap(ctx, opts) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (a *Area) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch args[1].Uint() { + case linux.AshmemSetNameIoctl: + name, err := usermem.CopyStringIn(ctx, io, args[2].Pointer(), nameLen-1, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, err + } + + a.mu.Lock() + defer a.mu.Unlock() + + // Cannot set name for already mapped ashmem. 
+ if a.tmpfsFile != nil { + return 0, syserror.EINVAL + } + a.name = name + return 0, nil + + case linux.AshmemGetNameIoctl: + a.mu.Lock() + var local []byte + if a.name != "" { + nameLen := len([]byte(a.name)) + local = make([]byte, nameLen, nameLen+1) + copy(local, []byte(a.name)) + local = append(local, 0) + } else { + nameLen := len([]byte(namePrefix)) + local = make([]byte, nameLen, nameLen+1) + copy(local, []byte(namePrefix)) + local = append(local, 0) + } + a.mu.Unlock() + + if _, err := io.CopyOut(ctx, args[2].Pointer(), local, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, syserror.EFAULT + } + return 0, nil + + case linux.AshmemSetSizeIoctl: + a.mu.Lock() + defer a.mu.Unlock() + + // Cannot set size for already mapped ashmem. + if a.tmpfsFile != nil { + return 0, syserror.EINVAL + } + a.size = uint64(args[2].SizeT()) + return 0, nil + + case linux.AshmemGetSizeIoctl: + return uintptr(a.size), nil + + case linux.AshmemPinIoctl, linux.AshmemUnpinIoctl, linux.AshmemGetPinStatusIoctl: + // Locking and unlocking is ok since once tmpfsFile is set, it won't be nil again + // even after unmapping! Unlocking is needed in order to avoid a deadlock on + // usermem.CopyObjectIn. + + // Cannot execute pin-related ioctls before mapping. + a.mu.Lock() + if a.tmpfsFile == nil { + a.mu.Unlock() + return 0, syserror.EINVAL + } + a.mu.Unlock() + + var pin linux.AshmemPin + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pin, usermem.IOOpts{ + AddressSpaceActive: true, + }) + if err != nil { + return 0, syserror.EFAULT + } + + a.mu.Lock() + defer a.mu.Unlock() + return a.pinOperation(pin, args[1].Uint()) + + case linux.AshmemPurgeAllCachesIoctl: + return 0, nil + + case linux.AshmemSetProtMaskIoctl: + prot := uint64(args[2].ModeT()) + perms := usermem.AccessType{ + Read: prot&linux.PROT_READ != 0, + Write: prot&linux.PROT_WRITE != 0, + Execute: prot&linux.PROT_EXEC != 0, + } + + a.mu.Lock() + defer a.mu.Unlock() + + // Can only narrow prot mask. + if !a.perms.SupersetOf(perms) { + return 0, syserror.EINVAL + } + + // TODO: If personality flag + // READ_IMPLIES_EXEC is set, set PROT_EXEC if PORT_READ is set. + + a.perms = perms + return 0, nil + + case linux.AshmemGetProtMaskIoctl: + return uintptr(a.perms.Prot()), nil + default: + // Ioctls irrelevant to Ashmem. + return 0, syserror.EINVAL + } +} + +// pinOperation should only be called while holding a.mu. +func (a *Area) pinOperation(pin linux.AshmemPin, op uint32) (uintptr, error) { + // Page-align a.size for checks. + pageAlignedSize, ok := usermem.Addr(a.size).RoundUp() + if !ok { + return 0, syserror.EINVAL + } + // Len 0 means everything onward. + if pin.Len == 0 { + pin.Len = uint32(pageAlignedSize) - pin.Offset + } + // Both Offset and Len have to be page-aligned. + if pin.Offset%uint32(usermem.PageSize) != 0 { + return 0, syserror.EINVAL + } + if pin.Len%uint32(usermem.PageSize) != 0 { + return 0, syserror.EINVAL + } + // Adding Offset and Len must not cause an uint32 overflow. + if end := pin.Offset + pin.Len; end < pin.Offset { + return 0, syserror.EINVAL + } + // Pin range must not exceed a's size. + if uint32(pageAlignedSize) < pin.Offset+pin.Len { + return 0, syserror.EINVAL + } + // Handle each operation. + r := RangeFromAshmemPin(pin) + switch op { + case linux.AshmemPinIoctl: + if a.pb.PinRange(r) { + return linux.AshmemWasPurged, nil + } + return linux.AshmemNotPurged, nil + + case linux.AshmemUnpinIoctl: + // TODO: Implement purge on unpin. 
+ a.pb.UnpinRange(r) + return 0, nil + + case linux.AshmemGetPinStatusIoctl: + if a.pb.RangePinnedStatus(r) { + return linux.AshmemIsPinned, nil + } + return linux.AshmemIsUnpinned, nil + + default: + panic("unreachable") + } + +} diff --git a/pkg/sentry/fs/ashmem/device.go b/pkg/sentry/fs/ashmem/device.go new file mode 100644 index 000000000..c5b51d4a7 --- /dev/null +++ b/pkg/sentry/fs/ashmem/device.go @@ -0,0 +1,169 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ashmem implements Android ashmem module (Anonymus Shared Memory). +package ashmem + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Device implements fs.InodeOperations. +type Device struct { + fsutil.DeprecatedFileOperations + fsutil.InodeNoExtendedAttributes + fsutil.InodeNotDirectory + fsutil.InodeNotRenameable + fsutil.InodeNotSocket + fsutil.InodeNotSymlink + fsutil.NoFsync + fsutil.NoMappable + fsutil.NoopWriteOut + fsutil.NotDirReaddir + + mu sync.Mutex `state:"nosave"` + unstable fs.UnstableAttr +} + +// NewDevice creates and intializes a Device structure. +func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { + return &Device{ + unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: fp, + Links: 1, + }), + } +} + +// Release implements fs.InodeOperations.Release. +func (ad *Device) Release(context.Context) {} + +// GetFile implements fs.InodeOperations.GetFile. +func (ad *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &Area{ + ad: ad, + tmpfsFile: nil, + perms: usermem.AnyAccess, + }), nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (ad *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + ad.mu.Lock() + defer ad.mu.Unlock() + return ad.unstable, nil +} + +// Check implements fs.InodeOperations.Check. +func (ad *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (ad *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { + ad.mu.Lock() + defer ad.mu.Unlock() + ad.unstable.Perms = fp + ad.unstable.StatusChangeTime = time.NowFromContext(ctx) + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. 
+func (ad *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + ad.mu.Lock() + defer ad.mu.Unlock() + if owner.UID.Ok() { + ad.unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + ad.unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (ad *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + ad.mu.Lock() + defer ad.mu.Unlock() + + now := time.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + ad.unstable.AccessTime = now + } else { + ad.unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + ad.unstable.ModificationTime = now + } else { + ad.unstable.ModificationTime = ts.MTime + } + } + ad.unstable.StatusChangeTime = now + return nil +} + +// Truncate implements fs.InodeOperations.WriteOut. +// +// Ignored by ashmem. +func (ad *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return nil +} + +// AddLink implements fs.InodeOperations.AddLink. +// +// Ashmem doesn't support links, no-op. +func (ad *Device) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// +// Ashmem doesn't support links, no-op. +func (ad *Device) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (ad *Device) NotifyStatusChange(ctx context.Context) { + ad.mu.Lock() + defer ad.mu.Unlock() + now := time.NowFromContext(ctx) + ad.unstable.ModificationTime = now + ad.unstable.StatusChangeTime = now +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +// +// Ashmem is virtual. +func (ad *Device) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +// +// Ashmem doesn't support querying for filesystem info. +func (ad *Device) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} diff --git a/pkg/sentry/fs/ashmem/pin_board.go b/pkg/sentry/fs/ashmem/pin_board.go new file mode 100644 index 000000000..c7fb3822c --- /dev/null +++ b/pkg/sentry/fs/ashmem/pin_board.go @@ -0,0 +1,125 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ashmem + +import "gvisor.googlesource.com/gvisor/pkg/abi/linux" + +const maxUint64 = ^uint64(0) + +// setFunctions implements segment.Functions generated from segment.Functions for +// uint64 Key and noValue Value. For more information, see the build file and +// segment set implementation at pkg/segment/set.go. +type setFunctions struct{} + +// noValue is a type of range attached value, which is irrelevant here. +type noValue struct{} + +// MinKey implements segment.Functions.MinKey. +func (setFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (setFunctions) MaxKey() uint64 { + return maxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. 
+func (setFunctions) ClearValue(*noValue) { + return +} + +// Merge implements segment.Functions.Merge. +func (setFunctions) Merge(Range, noValue, Range, noValue) (noValue, bool) { + return noValue{}, true +} + +// Split implements segment.Functions.Split. +func (setFunctions) Split(Range, noValue, uint64) (noValue, noValue) { + return noValue{}, noValue{} +} + +// PinBoard represents a set of pinned ranges in ashmem. +// +// segment.Set is used for implementation where segments represent +// ranges of pinned bytes, while gaps represent ranges of unpinned +// bytes. All ranges are page-aligned. +type PinBoard struct { + Set +} + +// NewPinBoard creates a new pin board with all pages pinned. +func NewPinBoard() *PinBoard { + var pb PinBoard + pb.PinRange(Range{0, maxUint64}) + return &pb +} + +// PinRange pins all pages in the specified range and returns true +// if there are any newly pinned pages. +func (pb *PinBoard) PinRange(r Range) bool { + pinnedPages := false + for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { + common := gap.Range().Intersect(r) + if common.Length() == 0 { + gap = gap.NextGap() + continue + } + pinnedPages = true + gap = pb.Insert(gap, common, noValue{}).NextGap() + } + return pinnedPages +} + +// UnpinRange unpins all pages in the specified range. +func (pb *PinBoard) UnpinRange(r Range) { + for seg := pb.LowerBoundSegment(r.Start); seg.Ok() && seg.Start() < r.End; { + common := seg.Range().Intersect(r) + if common.Length() == 0 { + seg = seg.NextSegment() + continue + } + seg = pb.RemoveRange(common).NextSegment() + } +} + +// RangePinnedStatus returns false if there's at least one unpinned page in the +// specified range. +func (pb *PinBoard) RangePinnedStatus(r Range) bool { + for gap := pb.LowerBoundGap(r.Start); gap.Ok() && gap.Start() < r.End; { + common := gap.Range().Intersect(r) + if common.Length() == 0 { + gap = gap.NextGap() + continue + } + return false + } + return true +} + +// RangeFromAshmemPin converts ashmem's original pin structure +// to Range. +func RangeFromAshmemPin(ap linux.AshmemPin) Range { + if ap.Len == 0 { + return Range{ + uint64(ap.Offset), + maxUint64, + } + } + return Range{ + uint64(ap.Offset), + uint64(ap.Offset) + uint64(ap.Len), + } +} diff --git a/pkg/sentry/fs/ashmem/pin_board_test.go b/pkg/sentry/fs/ashmem/pin_board_test.go new file mode 100644 index 000000000..f4ea5de6d --- /dev/null +++ b/pkg/sentry/fs/ashmem/pin_board_test.go @@ -0,0 +1,130 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ashmem + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestPinBoard(t *testing.T) { + pb := NewPinBoard() + + // Confirm that all pages are pinned. 
+ if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { + t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") + } + + // Unpin pages [1, 11) (counting from 0) + pb.UnpinRange(RangeFromAshmemPin(linux.AshmemPin{ + usermem.PageSize, + usermem.PageSize * 10, + })) + + // Confirm that pages [1, 11) are unpinned and that page 0 and pages + // larger than 10 are pinned. + pinned := []linux.AshmemPin{ + { + 0, + usermem.PageSize, + }, { + usermem.PageSize * 11, + 0, + }, + } + + for _, pin := range pinned { + if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", + pin.Offset, pin.Len) + } + } + + unpinned := []linux.AshmemPin{ + { + usermem.PageSize, + usermem.PageSize * 10, + }, + } + + for _, pin := range unpinned { + if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", + pin.Offset, pin.Len) + } + } + + // Pin pages [2, 6). + pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{ + usermem.PageSize * 2, + usermem.PageSize * 4, + })) + + // Confirm that pages 0, [2, 6) and pages larger than 10 are pinned + // while others remain unpinned. + pinned = []linux.AshmemPin{ + { + 0, + usermem.PageSize, + }, + { + usermem.PageSize * 2, + usermem.PageSize * 4, + }, + { + usermem.PageSize * 11, + 0, + }, + } + + for _, pin := range pinned { + if !pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned false (unpinned).", + pin.Offset, pin.Len) + } + } + + unpinned = []linux.AshmemPin{ + { + usermem.PageSize, + usermem.PageSize, + }, { + usermem.PageSize * 6, + usermem.PageSize * 5, + }, + } + + for _, pin := range unpinned { + if pb.RangePinnedStatus(RangeFromAshmemPin(pin)) { + t.Errorf("RangePinnedStatus(AshmemPin{offset (pages): %v, len (pages): %v}) returned true (pinned).", + pin.Offset, pin.Len) + } + } + + // Status of a partially pinned range is unpinned. + if pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { + t.Errorf("RangePinnedStatus(all pages) returned true (pinned).") + } + + // Pin the whole range again. + pb.PinRange(RangeFromAshmemPin(linux.AshmemPin{0, 0})) + + // Confirm that all pages are pinned. + if !pb.RangePinnedStatus(RangeFromAshmemPin(linux.AshmemPin{0, 0})) { + t.Errorf("RangePinnedStatus(all pages) returned false (unpinned) at start.") + } +} diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go new file mode 100644 index 000000000..56a2ad6f7 --- /dev/null +++ b/pkg/sentry/fs/attr.go @@ -0,0 +1,382 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "fmt" + "os" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +// InodeType enumerates types of Inodes. +type InodeType int + +const ( + // RegularFile is a regular file. + RegularFile InodeType = iota + + // SpecialFile is a file that doesn't support SeekEnd. It is used for + // things like proc files. + SpecialFile + + // Directory is a directory. + Directory + + // SpecialDirectory is a directory that *does* support SeekEnd. It's + // the opposite of the SpecialFile scenario above. It similarly + // supports proc files. + SpecialDirectory + + // Symlink is a symbolic link. + Symlink + + // Pipe is a pipe (named or regular). + Pipe + + // Socket is a socket. + Socket + + // CharacterDevice is a character device. + CharacterDevice + + // BlockDevice is a block device. + BlockDevice + + // Anonymous is an anonymous type when none of the above apply. + // Epoll fds and event-driven fds fit this category. + Anonymous +) + +// String returns a human-readable representation of the InodeType. +func (n InodeType) String() string { + switch n { + case RegularFile, SpecialFile: + return "file" + case Directory, SpecialDirectory: + return "directory" + case Symlink: + return "symlink" + case Pipe: + return "pipe" + case Socket: + return "socket" + case CharacterDevice: + return "character-device" + case BlockDevice: + return "block-device" + case Anonymous: + return "anonymous" + default: + return "unknown" + } +} + +// StableAttr contains Inode attributes that will be stable throughout the +// lifetime of the Inode. +type StableAttr struct { + // Type is the InodeType of a InodeOperations. + Type InodeType + + // DeviceID is the device on which a InodeOperations resides. + DeviceID uint64 + + // InodeID uniquely identifies InodeOperations on its device. + InodeID uint64 + + // BlockSize is the block size of data backing this InodeOperations. + BlockSize int64 + + // DeviceFileMajor is the major device number of this Node, if it is a + // device file. + DeviceFileMajor uint16 + + // DeviceFileMinor is the minor device number of this Node, if it is a + // device file. + DeviceFileMinor uint32 +} + +// IsRegular returns true if StableAttr.Type matches a regular file. +func IsRegular(s StableAttr) bool { + return s.Type == RegularFile +} + +// IsFile returns true if StableAttr.Type matches any type of file. +func IsFile(s StableAttr) bool { + return s.Type == RegularFile || s.Type == SpecialFile +} + +// IsDir returns true if StableAttr.Type matches any type of directory. +func IsDir(s StableAttr) bool { + return s.Type == Directory || s.Type == SpecialDirectory +} + +// IsSymlink returns true if StableAttr.Type matches a symlink. +func IsSymlink(s StableAttr) bool { + return s.Type == Symlink +} + +// IsPipe returns true if StableAttr.Type matches any type of pipe. +func IsPipe(s StableAttr) bool { + return s.Type == Pipe +} + +// IsSocket returns true if StableAttr.Type matches any type of socket. +func IsSocket(s StableAttr) bool { + return s.Type == Socket +} + +// IsCharDevice returns true if StableAttr.Type matches a character device. +func IsCharDevice(s StableAttr) bool { + return s.Type == CharacterDevice +} + +// UnstableAttr contains Inode attributes that may change over the lifetime +// of the Inode. 
+type UnstableAttr struct { + // Size is the file size in bytes. + Size int64 + + // Usage is the actual data usage in bytes. + Usage int64 + + // Perms is the protection (read/write/execute for user/group/other). + Perms FilePermissions + + // Owner describes the ownership of this file. + Owner FileOwner + + // AccessTime is the time of last access + AccessTime ktime.Time + + // ModificationTime is the time of last modification. + ModificationTime ktime.Time + + // StatusChangeTime is the time of last attribute modification. + StatusChangeTime ktime.Time + + // Links is the number of hard links. + Links uint64 +} + +// WithCurrentTime returns u with AccessTime == ModificationTime == current time. +func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { + t := ktime.NowFromContext(ctx) + u.AccessTime = t + u.ModificationTime = t + u.StatusChangeTime = t + return u +} + +// AttrMask contains fields to mask StableAttr and UnstableAttr. +type AttrMask struct { + Type bool + DeviceID bool + InodeID bool + BlockSize bool + Size bool + Usage bool + Perms bool + UID bool + GID bool + AccessTime bool + ModificationTime bool + StatusChangeTime bool + Links bool +} + +// Empty returns true if all fields in AttrMask are false. +func (a AttrMask) Empty() bool { + return a == AttrMask{} +} + +// Union returns an AttrMask containing the inclusive disjunction of fields in a and b. +func (a AttrMask) Union(b AttrMask) AttrMask { + return AttrMask{ + Type: a.Type || b.Type, + DeviceID: a.DeviceID || b.DeviceID, + InodeID: a.InodeID || b.InodeID, + BlockSize: a.BlockSize || b.BlockSize, + Size: a.Size || b.Size, + Usage: a.Usage || b.Usage, + Perms: a.Perms || b.Perms, + UID: a.UID || b.UID, + GID: a.GID || b.GID, + AccessTime: a.AccessTime || b.AccessTime, + ModificationTime: a.ModificationTime || b.ModificationTime, + StatusChangeTime: a.StatusChangeTime || b.StatusChangeTime, + Links: a.Links || b.Links, + } +} + +// PermMask are file access permissions. +type PermMask struct { + // Read indicates reading is permitted. + Read bool + + // Write indicates writing is permitted. + Write bool + + // Execute indicates execution is permitted. + Execute bool +} + +// OnlyRead returns true when only the read bit is set. +func (p PermMask) OnlyRead() bool { + return p.Read && !p.Write && !p.Execute +} + +// String implements the fmt.Stringer interface for PermMask. +func (p PermMask) String() string { + return fmt.Sprintf("PermMask{Read: %v, Write: %v, Execute: %v}", p.Read, p.Write, p.Execute) +} + +// Mode returns the system mode (syscall.S_IXOTH, etc.) for these permissions +// in the "other" bits. +func (p PermMask) Mode() (mode os.FileMode) { + if p.Read { + mode |= syscall.S_IROTH + } + if p.Write { + mode |= syscall.S_IWOTH + } + if p.Execute { + mode |= syscall.S_IXOTH + } + return +} + +// SupersetOf returns true iff the permissions in p are a superset of the +// permissions in other. +func (p PermMask) SupersetOf(other PermMask) bool { + if !p.Read && other.Read { + return false + } + if !p.Write && other.Write { + return false + } + if !p.Execute && other.Execute { + return false + } + return true +} + +// FilePermissions represents the permissions of a file, with +// Read/Write/Execute bits for user, group, and other. +type FilePermissions struct { + User PermMask + Group PermMask + Other PermMask + + // Sticky, if set on directories, restricts renaming and deletion of + // files in those directories to the directory owner, file owner, or + // CAP_FOWNER. 
The sticky bit is ignored when set on other files. + Sticky bool + + // SetUID executables can call UID-setting syscalls without CAP_SETUID. + SetUID bool + + // SetGID executables can call GID-setting syscalls without CAP_SETGID. + SetGID bool +} + +// PermsFromMode takes the Other permissions (last 3 bits) of a FileMode and +// returns a set of PermMask. +func PermsFromMode(mode linux.FileMode) (perms PermMask) { + perms.Read = mode&linux.ModeOtherRead != 0 + perms.Write = mode&linux.ModeOtherWrite != 0 + perms.Execute = mode&linux.ModeOtherExec != 0 + return +} + +// FilePermsFromP9 converts a p9.FileMode to a FilePermissions struct. +func FilePermsFromP9(mode p9.FileMode) FilePermissions { + return FilePermsFromMode(linux.FileMode(mode)) +} + +// FilePermsFromMode converts a system file mode to a FilePermissions struct. +func FilePermsFromMode(mode linux.FileMode) (fp FilePermissions) { + perm := mode.Permissions() + fp.Other = PermsFromMode(perm) + fp.Group = PermsFromMode(perm >> 3) + fp.User = PermsFromMode(perm >> 6) + fp.Sticky = mode&linux.ModeSticky == linux.ModeSticky + fp.SetUID = mode&linux.ModeSetUID == linux.ModeSetUID + fp.SetGID = mode&linux.ModeSetGID == linux.ModeSetGID + return +} + +// LinuxMode returns the linux mode_t representation of these permissions. +func (f FilePermissions) LinuxMode() linux.FileMode { + m := linux.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode()) + if f.SetUID { + m |= linux.ModeSetUID + } + if f.SetGID { + m |= linux.ModeSetGID + } + if f.Sticky { + m |= linux.ModeSticky + } + return m +} + +// OSMode returns the Go runtime's OS independent os.FileMode representation of +// these permissions. +func (f FilePermissions) OSMode() os.FileMode { + m := os.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode()) + if f.SetUID { + m |= os.ModeSetuid + } + if f.SetGID { + m |= os.ModeSetgid + } + if f.Sticky { + m |= os.ModeSticky + } + return m +} + +// AnyExecute returns true if any of U/G/O have the execute bit set. +func (f FilePermissions) AnyExecute() bool { + return f.User.Execute || f.Group.Execute || f.Other.Execute +} + +// AnyWrite returns true if any of U/G/O have the write bit set. +func (f FilePermissions) AnyWrite() bool { + return f.User.Write || f.Group.Write || f.Other.Write +} + +// AnyRead returns true if any of U/G/O have the read bit set. +func (f FilePermissions) AnyRead() bool { + return f.User.Read || f.Group.Read || f.Other.Read +} + +// FileOwner represents ownership of a file. +type FileOwner struct { + UID auth.KUID + GID auth.KGID +} + +// RootOwner corresponds to KUID/KGID 0/0. 
+var RootOwner = FileOwner{ + UID: auth.RootKUID, + GID: auth.RootKGID, +} diff --git a/pkg/sentry/fs/binder/BUILD b/pkg/sentry/fs/binder/BUILD new file mode 100644 index 000000000..15f91699f --- /dev/null +++ b/pkg/sentry/fs/binder/BUILD @@ -0,0 +1,38 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "binder_state", + srcs = ["binder.go"], + out = "binder_state.go", + package = "binder", +) + +go_library( + name = "binder", + srcs = [ + "binder.go", + "binder_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + ], +) diff --git a/pkg/sentry/fs/binder/binder.go b/pkg/sentry/fs/binder/binder.go new file mode 100644 index 000000000..3f87b6b08 --- /dev/null +++ b/pkg/sentry/fs/binder/binder.go @@ -0,0 +1,358 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package binder implements Android Binder IPC module. +package binder + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +const ( + currentProtocolVersion = 8 + + // mmapSizeLimit is the upper limit for mapped memory size in Binder. + mmapSizeLimit = 4 * 1024 * 1024 // 4MB +) + +// Device implements fs.InodeOperations. +type Device struct { + fsutil.InodeNoExtendedAttributes + fsutil.InodeNotDirectory + fsutil.InodeNotRenameable + fsutil.InodeNotSocket + fsutil.InodeNotSymlink + fsutil.NoMappable + fsutil.NoopWriteOut + fsutil.DeprecatedFileOperations + + // mu protects unstable. + mu sync.Mutex `state:"nosave"` + unstable fs.UnstableAttr +} + +// NewDevice creates and intializes a Device structure. +func NewDevice(ctx context.Context, owner fs.FileOwner, fp fs.FilePermissions) *Device { + return &Device{ + unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: fp, + Links: 1, + }), + } +} + +// Release implements fs.InodeOperations.Release. 
+func (bd *Device) Release(context.Context) {} + +// GetFile implements fs.InodeOperations.GetFile. +// +// TODO: Add functionality to GetFile: Additional fields will be +// needed in the Device structure, initialize them here. Also, Device will need +// to keep track of the created Procs in order to implement BINDER_READ_WRITE +// ioctl. +func (bd *Device) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &Proc{ + bd: bd, + task: kernel.TaskFromContext(ctx), + platform: platform.FromContext(ctx), + }), nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (bd *Device) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + bd.mu.Lock() + defer bd.mu.Unlock() + return bd.unstable, nil +} + +// Check implements fs.InodeOperations.Check. +func (bd *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (bd *Device) SetPermissions(ctx context.Context, inode *fs.Inode, fp fs.FilePermissions) bool { + bd.mu.Lock() + defer bd.mu.Unlock() + bd.unstable.Perms = fp + bd.unstable.StatusChangeTime = time.NowFromContext(ctx) + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (bd *Device) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + bd.mu.Lock() + defer bd.mu.Unlock() + if owner.UID.Ok() { + bd.unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + bd.unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (bd *Device) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + bd.mu.Lock() + defer bd.mu.Unlock() + + now := time.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + bd.unstable.AccessTime = now + } else { + bd.unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + bd.unstable.ModificationTime = now + } else { + bd.unstable.ModificationTime = ts.MTime + } + } + bd.unstable.StatusChangeTime = now + return nil +} + +// Truncate implements fs.InodeOperations.WriteOut. +// +// Ignored for a character device, such as Binder. +func (bd *Device) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return nil +} + +// AddLink implements fs.InodeOperations.AddLink. +// +// Binder doesn't support links, no-op. +func (bd *Device) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// +// Binder doesn't support links, no-op. +func (bd *Device) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (bd *Device) NotifyStatusChange(ctx context.Context) { + bd.mu.Lock() + defer bd.mu.Unlock() + now := time.NowFromContext(ctx) + bd.unstable.ModificationTime = now + bd.unstable.StatusChangeTime = now +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +// +// Binder is virtual. +func (bd *Device) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +// +// Binder doesn't support querying for filesystem info. +func (bd *Device) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} + +// Proc implements fs.FileOperations and fs.IoctlGetter. 
+type Proc struct { + fsutil.NoFsync + fsutil.DeprecatedFileOperations + fsutil.NotDirReaddir + + bd *Device + task *kernel.Task + platform platform.Platform + + // mu protects fr. + mu sync.Mutex `state:"nosave"` + + // mapped is memory allocated from platform.Memory() by AddMapping. + mapped platform.FileRange +} + +// Release implements fs.FileOperations.Release. +func (bp *Proc) Release() { + bp.mu.Lock() + defer bp.mu.Unlock() + if bp.mapped.Length() != 0 { + bp.platform.Memory().DecRef(bp.mapped) + } +} + +// Seek implements fs.FileOperations.Seek. +// +// Binder doesn't support seek operation (unless in debug mode). +func (bp *Proc) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return offset, syserror.EOPNOTSUPP +} + +// Read implements fs.FileOperations.Read. +// +// Binder doesn't support read operation (unless in debug mode). +func (bp *Proc) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return 0, syserror.EOPNOTSUPP +} + +// Write implements fs.FileOperations.Write. +// +// Binder doesn't support write operation. +func (bp *Proc) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return 0, syserror.EOPNOTSUPP +} + +// Flush implements fs.FileOperations.Flush. +// +// TODO: Implement. +func (bp *Proc) Flush(ctx context.Context, file *fs.File) error { + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (bp *Proc) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + // Compare drivers/android/binder.c:binder_mmap(). + if caller := kernel.TaskFromContext(ctx); caller != bp.task { + return syserror.EINVAL + } + if opts.Length > mmapSizeLimit { + opts.Length = mmapSizeLimit + } + opts.MaxPerms.Write = false + + // TODO: Binder sets VM_DONTCOPY, preventing the created vma + // from being copied across fork(), but we don't support this yet. As + // a result, MMs containing a Binder mapping cannot be forked (MM.Fork will + // fail when AddMapping returns EBUSY). + + return fsutil.GenericConfigureMMap(file, bp, opts) +} + +// Ioctl implements fs.FileOperations.Ioctl. +// +// TODO: Implement. +func (bp *Proc) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + // Switch on ioctl request. + switch uint32(args[1].Int()) { + case linux.BinderVersionIoctl: + ver := &linux.BinderVersion{ + ProtocolVersion: currentProtocolVersion, + } + // Copy result to user-space. + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), ver, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.BinderWriteReadIoctl: + // TODO: Implement. + fallthrough + case linux.BinderSetIdleTimeoutIoctl: + // TODO: Implement. + fallthrough + case linux.BinderSetMaxThreadsIoctl: + // TODO: Implement. + fallthrough + case linux.BinderSetIdlePriorityIoctl: + // TODO: Implement. + fallthrough + case linux.BinderSetContextMgrIoctl: + // TODO: Implement. + fallthrough + case linux.BinderThreadExitIoctl: + // TODO: Implement. + return 0, syserror.ENOSYS + default: + // Ioctls irrelevant to Binder. + return 0, syserror.EINVAL + } +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (bp *Proc) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + bp.mu.Lock() + defer bp.mu.Unlock() + if bp.mapped.Length() != 0 { + // mmap has been called before, which binder_mmap() doesn't like. 
+		return syserror.EBUSY
+	}
+	// Binder only allocates and maps a single page up-front
+	// (drivers/android/binder.c:binder_mmap() => binder_update_page_range()).
+	fr, err := bp.platform.Memory().Allocate(usermem.PageSize, usage.Anonymous)
+	if err != nil {
+		return err
+	}
+	bp.mapped = fr
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (bp *Proc) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) {
+	// Nothing to do. Notably, we don't free bp.mapped to allow another mmap.
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (bp *Proc) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error {
+	// Nothing to do. Notably, this is one case where CopyMapping isn't
+	// equivalent to AddMapping, as AddMapping would return EBUSY.
+	return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (bp *Proc) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	// TODO: In addition to the page initially allocated and mapped
+	// in AddMapping (Linux: binder_mmap), Binder allocates and maps pages for
+	// each transaction (Linux: binder_ioctl => binder_ioctl_write_read =>
+	// binder_thread_write => binder_transaction => binder_alloc_buf =>
+	// binder_update_page_range). Since we don't actually implement
+	// BinderWriteReadIoctl (Linux: BINDER_WRITE_READ), we only ever have the
+	// first page.
+	var err error
+	if required.End > usermem.PageSize {
+		err = &memmap.BusError{syserror.EFAULT}
+	}
+	if required.Start == 0 {
+		return []memmap.Translation{
+			{
+				Source: memmap.MappableRange{0, usermem.PageSize},
+				File:   bp.platform.Memory(),
+				Offset: bp.mapped.Start,
+			},
+		}, err
+	}
+	return nil, err
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (bp *Proc) InvalidateUnsavable(ctx context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go
new file mode 100644
index 000000000..b521bce75
--- /dev/null
+++ b/pkg/sentry/fs/context.go
@@ -0,0 +1,97 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+)
+
+// contextID is the fs package's type for context.Context.Value keys.
+type contextID int
+
+const (
+	// CtxRoot is a Context.Value key for a Dirent.
+	CtxRoot contextID = iota
+)
+
+// ContextCanAccessFile determines whether the file represented by inode can
+// be accessed in the requested way (for reading, writing, or execution) using
+// the caller's credentials and user namespace, as does Linux's
+// fs/namei.c:generic_permission.
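+//
+// A typical InodeOperations.Check implementation simply delegates to this
+// helper; for example (an illustrative sketch, mirroring the binder device
+// above):
+//
+//	func (d *Device) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
+//		return fs.ContextCanAccessFile(ctx, inode, p)
+//	}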
+func ContextCanAccessFile(ctx context.Context, inode *Inode, reqPerms PermMask) bool { + creds := auth.CredentialsFromContext(ctx) + uattr, err := inode.UnstableAttr(ctx) + if err != nil { + return false + } + + p := uattr.Perms.Other + // Are we owner or in group? + if uattr.Owner.UID == creds.EffectiveKUID { + p = uattr.Perms.User + } else if creds.InGroup(uattr.Owner.GID) { + p = uattr.Perms.Group + } + + // Are permissions satisfied without capability checks? + if p.SupersetOf(reqPerms) { + return true + } + + if IsDir(inode.StableAttr) { + // CAP_DAC_OVERRIDE can override any perms on directories. + if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { + return true + } + + // CAP_DAC_READ_SEARCH can normally only override Read perms, + // but for directories it can also override execution. + if !reqPerms.Write && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { + return true + } + } + + // CAP_DAC_OVERRIDE can always override Read/Write. + // Can override executable only when at least one execute bit is set. + if !reqPerms.Execute || uattr.Perms.AnyExecute() { + if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { + return true + } + } + + // Read perms can be overridden by CAP_DAC_READ_SEARCH. + if reqPerms.OnlyRead() && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { + return true + } + return false +} + +// FileOwnerFromContext returns a FileOwner using the effective user and group +// IDs used by ctx. +func FileOwnerFromContext(ctx context.Context) FileOwner { + creds := auth.CredentialsFromContext(ctx) + return FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} +} + +// RootFromContext returns the root of the virtual filesystem observed by ctx, +// or nil if ctx is not associated with a virtual filesystem. If +// RootFromContext returns a non-nil fs.Dirent, a reference is taken on it. +func RootFromContext(ctx context.Context) *Dirent { + if v := ctx.Value(CtxRoot); v != nil { + return v.(*Dirent) + } + return nil +} diff --git a/pkg/sentry/fs/copy_up.go b/pkg/sentry/fs/copy_up.go new file mode 100644 index 000000000..ea74d0efd --- /dev/null +++ b/pkg/sentry/fs/copy_up.go @@ -0,0 +1,414 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// copyUp copies a file in an overlay from a lower filesystem to an +// upper filesytem so that the file can be modified in the upper +// filesystem. Copying a file involves several steps: +// +// - All parent directories of the file are created in the upper +// filesystem if they don't exist there. For instance: +// +// upper /dir0 +// lower /dir0/dir1/file +// +// copyUp of /dir0/dir1/file creates /dir0/dir1 in order to create +// /dir0/dir1/file. 
+// +// - The file content is copied from the lower file to the upper +// file. For symlinks this is the symlink target. For directories, +// upper directory entries are merged with lower directory entries +// so there is no need to copy any entries. +// +// - A subset of file attributes of the lower file are set on the +// upper file. These are the file owner, the file timestamps, +// and all non-overlay extended attributes. copyUp will fail if +// the upper filesystem does not support the setting of these +// attributes. +// +// The file's permissions are set when the file is created and its +// size will be brought up to date when its contents are copied. +// Notably no attempt is made to bring link count up to date because +// hard links are currently not preserved across overlay filesystems. +// +// - Memory mappings of the lower file are invalidated and memory +// references are transferred to the upper file. From this point on, +// memory mappings of the file will be backed by content in the upper +// filesystem. +// +// Synchronization: +// +// copyUp synchronizes with rename(2) using renameMu to ensure that +// parentage does not change while a file is being copied. In the context +// of rename(2), copyUpLockedForRename should be used to avoid deadlock on +// renameMu. +// +// The following operations synchronize with copyUp using copyMu: +// +// - InodeOperations, i.e. to ensure that looking up a directory takes +// into account new upper filesystem directories created by copy up, +// which subsequently can be modified. +// +// - FileOperations, i.e. to ensure that reading from a file does not +// continue using a stale, lower filesystem handle when the file is +// written to. +// +// Lock ordering: Dirent.mu -> Inode.overlay.copyMu -> Inode.mu. +// +// Caveats: +// +// If any step in copying up a file fails, copyUp cleans the upper +// filesystem of any partially up-to-date file. If this cleanup fails, +// the overlay may be in an unacceptable, inconsistent state, so copyUp +// panics. If copyUp fails because any step (above) fails, a generic +// error is returned. +// +// copyUp currently makes no attempt to optimize copying up file content. +// For large files, this means that copyUp blocks until the entire file +// is copied synchronously. +func copyUp(ctx context.Context, d *Dirent) error { + renameMu.RLock() + defer renameMu.RUnlock() + return copyUpLockedForRename(ctx, d) +} + +// copyUpLockedForRename is the same as copyUp except that it does not lock +// renameMu. +// +// It copies each component of d that does not yet exist in the upper +// filesystem. If d already exists in the upper filesystem, it is a no-op. +// +// Any error returned indicates a failure to copy all of d. This may +// leave the upper filesystem filled with any number of parent directories +// but the upper filesystem will never be in an inconsistent state. +// +// Preconditions: +// - d.Inode.overlay is non-nil. +func copyUpLockedForRename(ctx context.Context, d *Dirent) error { + for { + // Did we race with another copy up or does there + // already exist something in the upper filesystem + // for d? + d.Inode.overlay.copyMu.Lock() + if d.Inode.overlay.upper != nil { + d.Inode.overlay.copyMu.Unlock() + // Done, d is in the upper filesystem. + return nil + } + d.Inode.overlay.copyMu.Unlock() + + // Find the next component to copy up. We will work our way + // down to the last component of d and finally copy it. + next := findNextCopyUp(ctx, d) + + // Attempt to copy. 
+ if err := doCopyUp(ctx, next); err != nil { + return err + } + } +} + +// findNextCopyUp finds the next component of d from root that does not +// yet exist in the upper filesystem. The parent of this component is +// also returned, which is the root of the overlay in the worst case. +func findNextCopyUp(ctx context.Context, d *Dirent) *Dirent { + next := d + for parent := next.parent; ; /* checked in-loop */ /* updated in-loop */ { + // Does this parent have a non-nil upper Inode? + parent.Inode.overlay.copyMu.RLock() + if parent.Inode.overlay.upper != nil { + parent.Inode.overlay.copyMu.RUnlock() + // Note that since we found an upper, it is stable. + return next + } + parent.Inode.overlay.copyMu.RUnlock() + + // Continue searching for a parent with a non-nil + // upper Inode. + next = parent + parent = next.parent + } +} + +func doCopyUp(ctx context.Context, d *Dirent) error { + // Wait to get exclusive access to the upper Inode. + d.Inode.overlay.copyMu.Lock() + defer d.Inode.overlay.copyMu.Unlock() + if d.Inode.overlay.upper != nil { + // We raced with another doCopyUp, no problem. + return nil + } + + // Perform the copy. + return copyUpLocked(ctx, d.parent, d) +} + +// copyUpLocked creates a copy of next in the upper filesystem of parent. +// +// copyUpLocked must be called with d.Inode.overlay.copyMu locked. +// +// Returns a generic error on failure. +// +// Preconditions: +// - parent.Inode.overlay.upper must be non-nil. +// - next.Inode.overlay.copyMu must be locked writable. +// - next.Inode.overlay.lower must be non-nil. +// - upper filesystem must support setting file ownership and timestamps. +func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error { + // Extract the attributes of the file we wish to copy. + attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx) + if err != nil { + log.Warningf("copy up failed to get lower attributes: %v", err) + return syserror.EIO + } + + var childUpperInode *Inode + parentUpper := parent.Inode.overlay.upper + + // Create the file in the upper filesystem and get an Inode for it. 
+ switch next.Inode.StableAttr.Type { + case RegularFile: + childFile, err := parentUpper.Create(ctx, RootFromContext(ctx), next.name, FileFlags{Read: true, Write: true}, attrs.Perms) + if err != nil { + log.Warningf("copy up failed to create file: %v", err) + return syserror.EIO + } + defer childFile.DecRef() + childUpperInode = childFile.Dirent.Inode + + case Directory: + if err := parentUpper.CreateDirectory(ctx, RootFromContext(ctx), next.name, attrs.Perms); err != nil { + log.Warningf("copy up failed to create directory: %v", err) + return syserror.EIO + } + childUpper, err := parentUpper.Lookup(ctx, next.name) + if err != nil { + log.Warningf("copy up failed to lookup directory: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + defer childUpper.DecRef() + childUpperInode = childUpper.Inode + + case Symlink: + childLower := next.Inode.overlay.lower + link, err := childLower.Readlink(ctx) + if err != nil { + log.Warningf("copy up failed to read symlink value: %v", err) + return syserror.EIO + } + if err := parentUpper.CreateLink(ctx, RootFromContext(ctx), link, next.name); err != nil { + log.Warningf("copy up failed to create symlink: %v", err) + return syserror.EIO + } + childUpper, err := parentUpper.Lookup(ctx, next.name) + if err != nil { + log.Warningf("copy up failed to lookup symlink: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + defer childUpper.DecRef() + childUpperInode = childUpper.Inode + + default: + return syserror.EINVAL + } + + // Bring file attributes up to date. This does not include size, which will be + // brought up to date with copyContentsLocked. + if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil { + log.Warningf("copy up failed to copy up attributes: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + // Copy the entire file. + if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil { + log.Warningf("copy up failed to copy up contents: %v", err) + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + lowerMappable := next.Inode.overlay.lower.Mappable() + upperMappable := childUpperInode.Mappable() + if lowerMappable != nil && upperMappable == nil { + log.Warningf("copy up failed: cannot ensure memory mapping coherence") + cleanupUpper(ctx, parentUpper, next.name) + return syserror.EIO + } + + // Propagate memory mappings to the upper Inode. + next.Inode.overlay.mapsMu.Lock() + defer next.Inode.overlay.mapsMu.Unlock() + if upperMappable != nil { + // Remember which mappings we added so we can remove them on failure. + allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange) + for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + added := make(memmap.MappingsOfRange) + for m := range seg.Value() { + if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()); err != nil { + for m := range added { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + } + for mr, mappings := range allAdded { + for m := range mappings { + upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start) + } + } + return err + } + added[m] = struct{}{} + } + allAdded[seg.Range()] = added + } + } + + // Take a reference on the upper Inode (transferred to + // next.Inode.overlay.upper) and make new translations use it. 
+ next.Inode.overlay.dataMu.Lock() + childUpperInode.IncRef() + next.Inode.overlay.upper = childUpperInode + next.Inode.overlay.dataMu.Unlock() + + // Invalidate existing translations through the lower Inode. + next.Inode.overlay.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Remove existing memory mappings from the lower Inode. + if lowerMappable != nil { + for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + for m := range seg.Value() { + lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start()) + } + } + } + + return nil +} + +// cleanupUpper removes name from parent, and panics if it is unsuccessful. +func cleanupUpper(ctx context.Context, parent *Inode, name string) { + if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil { + // Unfortunately we don't have much choice. We shouldn't + // willingly give the caller access to a nonsense filesystem. + panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: failed to remove %q from upper filesystem: %v", name, err)) + } +} + +// copyUpBuffers is a buffer pool for copying file content. The buffer +// size is the same used by io.Copy. +var copyUpBuffers = sync.Pool{New: func() interface{} { return make([]byte, 8*usermem.PageSize) }} + +// copyContentsLocked copies the contents of lower to upper. It panics if +// less than size bytes can be copied. +func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size int64) error { + // We don't support copying up for anything other than regular files. + if lower.StableAttr.Type != RegularFile { + return nil + } + + // Get a handle to the upper filesystem, which we will write to. + upperFile, err := overlayFile(ctx, upper, FileFlags{Write: true}) + if err != nil { + return err + } + defer upperFile.DecRef() + + // Get a handle to the lower filesystem, which we will read from. + lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true}) + if err != nil { + return err + } + defer lowerFile.DecRef() + + // Use a buffer pool to minimize allocations. + buf := copyUpBuffers.Get().([]byte) + defer copyUpBuffers.Put(buf) + + // Transfer the contents. + // + // One might be able to optimize this by doing parallel reads, parallel writes and reads, larger + // buffers, etc. But we really don't know anything about the underlying implementation, so these + // optimizations could be self-defeating. So we leave this as simple as possible. + var offset int64 + for { + nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(buf), offset) + if err != nil && err != io.EOF { + return err + } + if nr == 0 { + if offset != size { + // Same as in cleanupUpper, we cannot live + // with ourselves if we do anything less. + panic(fmt.Sprintf("filesystem is in an inconsistent state: wrote only %d bytes of %d sized file", offset, size)) + } + return nil + } + nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence(buf[:nr]), offset) + if err != nil { + return err + } + offset += nw + } +} + +// copyAttributesLocked copies a subset of lower's attributes to upper, +// specifically owner, timestamps (except of status change time), and +// extended attributes. Notably no attempt is made to copy link count. +// Size and permissions are set on upper when the file content is copied +// and when the file is created respectively. +func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error { + // Extract attributes fro the lower filesystem. 
+ lowerAttr, err := lower.UnstableAttr(ctx) + if err != nil { + return err + } + lowerXattr, err := lower.Listxattr() + if err != nil && err != syserror.EOPNOTSUPP { + return err + } + + // Set the attributes on the upper filesystem. + if err := upper.InodeOperations.SetOwner(ctx, upper, lowerAttr.Owner); err != nil { + return err + } + if err := upper.InodeOperations.SetTimestamps(ctx, upper, TimeSpec{ + ATime: lowerAttr.AccessTime, + MTime: lowerAttr.ModificationTime, + }); err != nil { + return err + } + for name := range lowerXattr { + value, err := lower.Getxattr(name) + if err != nil { + return err + } + if err := upper.InodeOperations.Setxattr(upper, name, value); err != nil { + return err + } + } + return nil +} diff --git a/pkg/sentry/fs/copy_up_test.go b/pkg/sentry/fs/copy_up_test.go new file mode 100644 index 000000000..c3c9d963d --- /dev/null +++ b/pkg/sentry/fs/copy_up_test.go @@ -0,0 +1,182 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "bytes" + "crypto/rand" + "fmt" + "io" + "sync" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +const ( + // origFileSize is the original file size. This many bytes should be + // copied up before the test file is modified. + origFileSize = 4096 + + // truncatedFileSize is the size to truncate all test files. + truncateFileSize = 10 +) + +// TestConcurrentCopyUp is a copy up stress test for an overlay. +// +// It creates a 64-level deep directory tree in the lower filesystem and +// populates the last subdirectory with 64 files containing random content: +// +// /lower +// /sudir0/.../subdir63/ +// /file0 +// ... +// /file63 +// +// The files are truncated concurrently by 4 goroutines per file. +// These goroutines contend with copying up all parent 64 subdirectories +// as well as the final file content. +// +// At the end of the test, we assert that the files respect the new truncated +// size and contain the content we expect. 
+func TestConcurrentCopyUp(t *testing.T) { + ctx := contexttest.Context(t) + files := makeOverlayTestFiles(t) + + var wg sync.WaitGroup + for _, file := range files { + for i := 0; i < 4; i++ { + wg.Add(1) + go func(o *overlayTestFile) { + if err := o.File.Dirent.Inode.Truncate(ctx, o.File.Dirent, truncateFileSize); err != nil { + t.Fatalf("failed to copy up: %v", err) + } + wg.Done() + }(file) + } + } + wg.Wait() + + for _, file := range files { + got := make([]byte, origFileSize) + n, err := file.File.Readv(ctx, usermem.BytesIOSequence(got)) + if int(n) != truncateFileSize { + t.Fatalf("read %d bytes from file, want %d", n, truncateFileSize) + } + if err != nil && err != io.EOF { + t.Fatalf("read got error %v, want nil", err) + } + if !bytes.Equal(got[:n], file.content[:truncateFileSize]) { + t.Fatalf("file content is %v, want %v", got[:n], file.content[:truncateFileSize]) + } + } +} + +type overlayTestFile struct { + File *fs.File + name string + content []byte +} + +func makeOverlayTestFiles(t *testing.T) []*overlayTestFile { + ctx := contexttest.Context(t) + + // Create a lower tmpfs mount. + fsys, _ := fs.FindFilesystem("tmpfs") + lower, err := fsys.Mount(contexttest.Context(t), "", fs.MountSourceFlags{}, "") + if err != nil { + t.Fatalf("failed to mount tmpfs: %v", err) + } + lowerRoot := fs.NewDirent(lower, "") + + // Make a deep set of subdirectories that everyone shares. + next := lowerRoot + for i := 0; i < 64; i++ { + name := fmt.Sprintf("subdir%d", i) + err := next.CreateDirectory(ctx, lowerRoot, name, fs.FilePermsFromMode(0777)) + if err != nil { + t.Fatalf("failed to create dir %q: %v", name, err) + } + next, err = next.Walk(ctx, lowerRoot, name) + if err != nil { + t.Fatalf("failed to walk to %q: %v", name, err) + } + } + + // Make a bunch of files in the last directory. + var files []*overlayTestFile + for i := 0; i < 64; i++ { + name := fmt.Sprintf("file%d", i) + f, err := next.Create(ctx, next, name, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + t.Fatalf("failed to create file %q: %v", name, err) + } + defer f.DecRef() + + relname, _ := f.Dirent.FullName(lowerRoot) + + o := &overlayTestFile{ + name: relname, + content: make([]byte, origFileSize), + } + + if _, err := rand.Read(o.content); err != nil { + t.Fatalf("failed to read from /dev/urandom: %v", err) + } + + if _, err := f.Writev(ctx, usermem.BytesIOSequence(o.content)); err != nil { + t.Fatalf("failed to write content to file %q: %v", name, err) + } + + files = append(files, o) + } + + // Create an empty upper tmpfs mount which we will copy up into. + upper, err := fsys.Mount(ctx, "", fs.MountSourceFlags{}, "") + if err != nil { + t.Fatalf("failed to mount tmpfs: %v", err) + } + + // Construct an overlay root. + overlay, err := fs.NewOverlayRoot(ctx, upper, lower, fs.MountSourceFlags{}) + if err != nil { + t.Fatalf("failed to construct overlay root: %v", err) + } + + // Create a MountNamespace to traverse the file system. + mns, err := fs.NewMountNamespace(ctx, overlay) + if err != nil { + t.Fatalf("failed to construct mount manager: %v", err) + } + + // Walk to all of the files in the overlay, open them readable. 
+ for _, f := range files { + d, err := mns.FindInode(ctx, mns.Root(), mns.Root(), f.name, 0) + if err != nil { + t.Fatalf("failed to find %q: %v", f.name, err) + } + defer d.DecRef() + + f.File, err = d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("failed to open file %q readable: %v", f.name, err) + } + } + + return files +} diff --git a/pkg/sentry/fs/dentry.go b/pkg/sentry/fs/dentry.go new file mode 100644 index 000000000..d42e8da81 --- /dev/null +++ b/pkg/sentry/fs/dentry.go @@ -0,0 +1,232 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sort" + + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// DentAttr is the metadata of a directory entry. It is a subset of StableAttr. +type DentAttr struct { + // Type is the InodeType of an Inode. + Type InodeType + + // InodeID uniquely identifies an Inode on a device. + InodeID uint64 +} + +// GenericDentAttr returns a generic DentAttr where: +// +// Type == nt +// InodeID == the inode id of a new inode on device. +func GenericDentAttr(nt InodeType, device *device.Device) DentAttr { + return DentAttr{ + Type: nt, + InodeID: device.NextIno(), + } +} + +// DentrySerializer serializes a directory entry. +type DentrySerializer interface { + // CopyOut serializes a directory entry based on its name and attributes. + CopyOut(name string, attributes DentAttr) error + + // Written returns the number of bytes written. + Written() int +} + +// CollectEntriesSerializer copies DentAttrs to Entries. The order in +// which entries are encountered is preserved in Order. +type CollectEntriesSerializer struct { + Entries map[string]DentAttr + Order []string +} + +// CopyOut implements DentrySerializer.CopyOut. +func (c *CollectEntriesSerializer) CopyOut(name string, attr DentAttr) error { + if c.Entries == nil { + c.Entries = make(map[string]DentAttr) + } + c.Entries[name] = attr + c.Order = append(c.Order, name) + return nil +} + +// Written implements DentrySerializer.Written. +func (c *CollectEntriesSerializer) Written() int { + return len(c.Entries) +} + +// DirCtx is used by node.Readdir to emit directory entries. It is not +// thread-safe. +type DirCtx struct { + // Serializer is used to serialize the node attributes. + Serializer DentrySerializer + + // attrs are DentAttrs + attrs map[string]DentAttr + + // DirCursor is the directory cursor. + // TODO: Once Handles are removed this can just live in the + // respective FileOperations implementations and not need to get + // plumbed everywhere. + DirCursor *string +} + +// DirEmit is called for each directory entry. +func (c *DirCtx) DirEmit(name string, attr DentAttr) error { + if c.Serializer != nil { + if err := c.Serializer.CopyOut(name, attr); err != nil { + return err + } + } + if c.attrs == nil { + c.attrs = make(map[string]DentAttr) + } + c.attrs[name] = attr + return nil +} + +// DentAttrs returns a map of DentAttrs corresponding to the emitted directory +// entries. 
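+//
+// For example (illustrative only), a caller that has emitted entries through
+// DirEmit can retrieve them afterwards:
+//
+//	dirCtx := &DirCtx{}
+//	_ = dirCtx.DirEmit("file0", DentAttr{Type: RegularFile, InodeID: 1})
+//	attrs := dirCtx.DentAttrs() // attrs["file0"].Type == RegularFile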
+func (c *DirCtx) DentAttrs() map[string]DentAttr { + if c.attrs == nil { + c.attrs = make(map[string]DentAttr) + } + return c.attrs +} + +// GenericReaddir serializes DentAttrs based on a SortedDentryMap that must +// contain _all_ up-to-date DentAttrs under a directory. If ctx.DirCursor is +// not nil, it is updated to the name of the last DentAttr that was +// successfully serialized. +// +// Returns the number of entries serialized. +func GenericReaddir(ctx *DirCtx, s *SortedDentryMap) (int, error) { + // Retrieve the next directory entries. + var names []string + var entries map[string]DentAttr + if ctx.DirCursor != nil { + names, entries = s.GetNext(*ctx.DirCursor) + } else { + names, entries = s.GetAll() + } + + // Try to serialize each entry. + var serialized int + for _, name := range names { + // Skip "" per POSIX. Skip "." and ".." which will be added by Dirent.Readdir. + if name == "" || name == "." || name == ".." { + continue + } + + // Emit the directory entry. + if err := ctx.DirEmit(name, entries[name]); err != nil { + // Return potentially a partial serialized count. + return serialized, err + } + + // We successfully serialized this entry. + serialized++ + + // Update the cursor with the name of the entry last serialized. + if ctx.DirCursor != nil { + *ctx.DirCursor = name + } + } + + // Everything was serialized. + return serialized, nil +} + +// SortedDentryMap is a sorted map of names and fs.DentAttr entries. +type SortedDentryMap struct { + // names is always kept in sorted-order. + names []string + + // entries maps names to fs.DentAttrs. + entries map[string]DentAttr +} + +// NewSortedDentryMap maintains entries in name sorted order. +func NewSortedDentryMap(entries map[string]DentAttr) *SortedDentryMap { + s := &SortedDentryMap{ + names: make([]string, 0, len(entries)), + entries: entries, + } + // Don't allow s.entries to be nil, because nil maps arn't Saveable. + if s.entries == nil { + s.entries = make(map[string]DentAttr) + } + + // Collect names from entries and sort them. + for name := range s.entries { + s.names = append(s.names, name) + } + sort.Strings(s.names) + return s +} + +// GetAll returns all names and entries in s. +func (s *SortedDentryMap) GetAll() ([]string, map[string]DentAttr) { + return s.names, s.entries +} + +// GetNext returns names after cursor in s and all entries. +func (s *SortedDentryMap) GetNext(cursor string) ([]string, map[string]DentAttr) { + i := sort.SearchStrings(s.names, cursor) + if i == len(s.names) { + return nil, s.entries + } + + // Return everything strictly after the cursor. + if s.names[i] == cursor { + i++ + } + return s.names[i:], s.entries +} + +// Add adds an entry with the given name to the map, preserving sort order. If +// name already exists in the map, its entry will be overwritten. +func (s *SortedDentryMap) Add(name string, entry DentAttr) { + if _, ok := s.entries[name]; !ok { + // Map does not yet contain an entry with this name. We must + // insert it in s.names at the appropriate spot. + i := sort.SearchStrings(s.names, name) + s.names = append(s.names, "") + copy(s.names[i+1:], s.names[i:]) + s.names[i] = name + } + s.entries[name] = entry +} + +// Remove removes an entry with the given name from the map, preserving sort order. 
+func (s *SortedDentryMap) Remove(name string) { + if _, ok := s.entries[name]; !ok { + return + } + i := sort.SearchStrings(s.names, name) + copy(s.names[i:], s.names[i+1:]) + s.names = s.names[:len(s.names)-1] + delete(s.entries, name) +} + +// Contains reports whether the map contains an entry with the given name. +func (s *SortedDentryMap) Contains(name string) bool { + _, ok := s.entries[name] + return ok +} diff --git a/pkg/sentry/fs/dev/BUILD b/pkg/sentry/fs/dev/BUILD new file mode 100644 index 000000000..42049ecb5 --- /dev/null +++ b/pkg/sentry/fs/dev/BUILD @@ -0,0 +1,53 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "dev_state", + srcs = [ + "dev.go", + "fs.go", + "full.go", + "null.go", + "random.go", + ], + out = "dev_state.go", + package = "dev", +) + +go_library( + name = "dev", + srcs = [ + "dev.go", + "dev_state.go", + "device.go", + "fs.go", + "full.go", + "null.go", + "random.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ashmem", + "//pkg/sentry/fs/binder", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/fs/tmpfs", + "//pkg/sentry/memmap", + "//pkg/sentry/mm", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + ], +) diff --git a/pkg/sentry/fs/dev/dev.go b/pkg/sentry/fs/dev/dev.go new file mode 100644 index 000000000..36c61bfc2 --- /dev/null +++ b/pkg/sentry/fs/dev/dev.go @@ -0,0 +1,122 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package dev provides a filesystem with simple devices. +package dev + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ashmem" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/binder" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Dev is the root node. 
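+//
+// New (below) populates it with the standard /dev entries: the fd, stdin,
+// stdout and stderr symlinks into /proc/self/fd, the null, zero, full,
+// random and urandom character devices, a tmpfs-backed shm directory, an
+// empty pts directory for a devpts mount, a ptmx symlink, and, optionally,
+// the Android binder and ashmem devices.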
+type Dev struct { + ramfs.Dir +} + +func newCharacterDevice(iops fs.InodeOperations, msrc *fs.MountSource) *fs.Inode { + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.CharacterDevice, + }) +} + +func newDirectory(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + iops := &ramfs.Dir{} + iops.InitDir(ctx, map[string]*fs.Inode{}, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} + +func newSymlink(ctx context.Context, target string, msrc *fs.MountSource) *fs.Inode { + iops := &ramfs.Symlink{} + iops.InitSymlink(ctx, fs.RootOwner, target) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Symlink, + }) +} + +// New returns the root node of a device filesystem. +func New(ctx context.Context, msrc *fs.MountSource, binderEnabled bool, ashmemEnabled bool) *fs.Inode { + d := &Dev{} + + contents := map[string]*fs.Inode{ + "fd": newSymlink(ctx, "/proc/self/fd", msrc), + "stdin": newSymlink(ctx, "/proc/self/fd/0", msrc), + "stdout": newSymlink(ctx, "/proc/self/fd/1", msrc), + "stderr": newSymlink(ctx, "/proc/self/fd/2", msrc), + + "null": newCharacterDevice(newNullDevice(ctx, fs.RootOwner, 0666), msrc), + "zero": newCharacterDevice(newZeroDevice(ctx, fs.RootOwner, 0666), msrc), + "full": newCharacterDevice(newFullDevice(ctx, fs.RootOwner, 0666), msrc), + + // This is not as good as /dev/random in linux because go + // runtime uses sys_random and /dev/urandom internally. + // According to 'man 4 random', this will be sufficient unless + // application uses this to generate long-lived GPG/SSL/SSH + // keys. + "random": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), + "urandom": newCharacterDevice(newRandomDevice(ctx, fs.RootOwner, 0444), msrc), + + "shm": tmpfs.NewDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0777), msrc, platform.FromContext(ctx)), + + // A devpts is typically mounted at /dev/pts to provide + // pseudoterminal support. Place an empty directory there for + // the devpts to be mounted over. + "pts": newDirectory(ctx, msrc), + // Similarly, applications expect a ptmx device at /dev/ptmx + // connected to the terminals provided by /dev/pts/. Rather + // than creating a device directly (which requires a hairy + // lookup on open to determine if a devpts exists), just create + // a symlink to the ptmx provided by devpts. (The Linux devpts + // documentation recommends this). + // + // If no devpts is mounted, this will simply be a dangling + // symlink, which is fine. 
+ "ptmx": newSymlink(ctx, "pts/ptmx", msrc), + } + + if binderEnabled { + binder := binder.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) + contents["binder"] = newCharacterDevice(binder, msrc) + } + + if ashmemEnabled { + ashmem := ashmem.NewDevice(ctx, fs.RootOwner, fs.FilePermsFromMode(0666)) + contents["ashmem"] = newCharacterDevice(ashmem, msrc) + } + + d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(d, msrc, fs.StableAttr{ + DeviceID: devDevice.DeviceID(), + InodeID: devDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} diff --git a/pkg/sentry/fs/dev/device.go b/pkg/sentry/fs/dev/device.go new file mode 100644 index 000000000..9d935e008 --- /dev/null +++ b/pkg/sentry/fs/dev/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// devDevice is the pseudo-filesystem device. +var devDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/dev/fs.go b/pkg/sentry/fs/dev/fs.go new file mode 100644 index 000000000..4945ac962 --- /dev/null +++ b/pkg/sentry/fs/dev/fs.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// Optional key containing boolean flag which specifies if Android Binder IPC should be enabled. +const binderEnabledKey = "binder_enabled" + +// Optional key containing boolean flag which specifies if Android ashmem should be enabled. +const ashmemEnabledKey = "ashmem_enabled" + +// filesystem is a devtmpfs. +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name underwhich the filesystem is registered. +// Name matches drivers/base/devtmpfs.c:dev_fs_type.name. +const FilesystemName = "devtmpfs" + +// Name is the name of the file system. +func (*filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount allows users to mount(2) this file system. +func (*filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, devtmpfs does the same thing. 
+func (*filesystem) Flags() fs.FilesystemFlags {
+	return 0
+}
+
+// Mount returns a devtmpfs root that can be positioned in the VFS.
+func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) {
+	// device is always ignored.
+	// devtmpfs backed by ramfs ignores bad options. See fs/ramfs/inode.c:ramfs_parse_options.
+	// -> we should consider parsing the mode and backing devtmpfs by this.
+
+	// Parse generic comma-separated key=value options.
+	options := fs.GenericMountSourceOptions(data)
+
+	// binderEnabledKey is optional and binder is disabled by default.
+	binderEnabled := false
+	if beStr, exists := options[binderEnabledKey]; exists {
+		var err error
+		binderEnabled, err = strconv.ParseBool(beStr)
+		if err != nil {
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// ashmemEnabledKey is optional and ashmem is disabled by default.
+	ashmemEnabled := false
+	if aeStr, exists := options[ashmemEnabledKey]; exists {
+		var err error
+		ashmemEnabled, err = strconv.ParseBool(aeStr)
+		if err != nil {
+			return nil, syserror.EINVAL
+		}
+	}
+
+	// Construct the devtmpfs root.
+	return New(ctx, fs.NewNonCachingMountSource(f, flags), binderEnabled, ashmemEnabled), nil
+}
diff --git a/pkg/sentry/fs/dev/full.go b/pkg/sentry/fs/dev/full.go
new file mode 100644
index 000000000..e13eb6c03
--- /dev/null
+++ b/pkg/sentry/fs/dev/full.go
@@ -0,0 +1,53 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package dev
+
+import (
+	"math"
+
+	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// fullDevice is used to implement /dev/full.
+type fullDevice struct {
+	ramfs.Entry
+}
+
+func newFullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *fullDevice {
+	f := &fullDevice{}
+	f.InitEntry(ctx, owner, fs.FilePermsFromMode(mode))
+	return f
+}
+
+// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev by
+// returning ENOSPC.
+func (f *fullDevice) DeprecatedPwritev(_ context.Context, _ usermem.IOSequence, _ int64) (int64, error) {
+	return 0, syserror.ENOSPC
+}
+
+// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv.
+func (f *fullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, _ int64) (int64, error) {
+	return dst.ZeroOut(ctx, math.MaxInt64)
+}
+
+// Truncate should be simply ignored for character devices on Linux.
+func (f *fullDevice) Truncate(context.Context, *fs.Inode, int64) error {
+	return nil
+}
diff --git a/pkg/sentry/fs/dev/null.go b/pkg/sentry/fs/dev/null.go
new file mode 100644
index 000000000..66b8ba967
--- /dev/null
+++ b/pkg/sentry/fs/dev/null.go
@@ -0,0 +1,96 @@
+// Copyright 2018 Google Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type nullDevice struct { + ramfs.Entry +} + +func newNullDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *nullDevice { + n := &nullDevice{} + n.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return n +} + +// DeprecatedPreadv reads data from the device. +func (n *nullDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return 0, io.EOF +} + +// DeprecatedPwritev discards writes. +func (n *nullDevice) DeprecatedPwritev(_ context.Context, src usermem.IOSequence, offset int64) (int64, error) { + return src.NumBytes(), nil +} + +// Truncate should be simply ignored for character devices on linux. +func (n *nullDevice) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +type zeroDevice struct { + nullDevice +} + +func newZeroDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *zeroDevice { + zd := &zeroDevice{} + zd.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return zd +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (zd *zeroDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.ZeroOut(ctx, math.MaxInt64) +} + +// GetFile overrides ramfs.Entry.GetFile and returns a zeroFile instead. +func (zd *zeroDevice) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // Allow pread(2) and pwrite(2) on this file. + flags.Pread = true + flags.Pwrite = true + + return fs.NewFile(ctx, dirent, flags, &zeroFileOperations{ + FileOperations: &fsutil.Handle{HandleOperations: dirent.Inode.HandleOps()}, + }), nil +} + +type zeroFileOperations struct { + fs.FileOperations +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (*zeroFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + m, err := mm.NewSharedAnonMappable(opts.Length, platform.FromContext(ctx)) + if err != nil { + return err + } + opts.MappingIdentity = m + opts.Mappable = m + return nil +} diff --git a/pkg/sentry/fs/dev/random.go b/pkg/sentry/fs/dev/random.go new file mode 100644 index 000000000..0402f9355 --- /dev/null +++ b/pkg/sentry/fs/dev/random.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dev + +import ( + "crypto/rand" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type randomDevice struct { + ramfs.Entry +} + +func newRandomDevice(ctx context.Context, owner fs.FileOwner, mode linux.FileMode) *randomDevice { + r := &randomDevice{} + r.InitEntry(ctx, owner, fs.FilePermsFromMode(mode)) + return r +} + +// DeprecatedPreadv reads random data. +func (*randomDevice) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader}) +} + +// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. +func (*randomDevice) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // On Linux, "Writing to /dev/random or /dev/urandom will update the + // entropy pool with the data written, but this will not result in a higher + // entropy count" - random(4). We don't need to support this, but we do + // need to support the write, so just make it a no-op a la /dev/null. + return src.NumBytes(), nil +} + +// Truncate should be simply ignored for character devices on linux. +func (r *randomDevice) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} diff --git a/pkg/sentry/fs/dirent.go b/pkg/sentry/fs/dirent.go new file mode 100644 index 000000000..a75c7ea7e --- /dev/null +++ b/pkg/sentry/fs/dirent.go @@ -0,0 +1,1605 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "fmt" + "path" + "sort" + "sync" + "sync/atomic" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type globalDirentMap struct { + mu sync.Mutex + dirents map[*Dirent]struct{} +} + +func (g *globalDirentMap) add(d *Dirent) { + g.mu.Lock() + g.dirents[d] = struct{}{} + g.mu.Unlock() +} + +func (g *globalDirentMap) remove(d *Dirent) { + g.mu.Lock() + delete(g.dirents, d) + g.mu.Unlock() +} + +// allDirents keeps track of all Dirents that need to be considered in +// Save/Restore for inode mappings. +// +// Because inodes do not hold paths, but inodes for external file systems map +// to an external path, every user-visible Dirent is stored in this map and +// iterated through upon save to keep inode ID -> restore path mappings. +var allDirents = globalDirentMap{ + dirents: map[*Dirent]struct{}{}, +} + +// renameMu protects the parent of *all* Dirents. (See explanation in +// lockForRename.) +// +// See fs.go for lock ordering. +var renameMu sync.RWMutex + +// Dirent holds an Inode in memory. +// +// A Dirent may be negative or positive: +// +// A negative Dirent contains a nil Inode and indicates that a path does not exist. This +// is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains +// cached until a create operation replaces it with a positive Dirent. A negative Dirent +// always has one reference owned by its parent and takes _no_ reference on its parent. This +// ensures that its parent can be unhashed regardless of negative children. +// +// A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain +// references to it. A positive Dirent always takes a reference on its parent. +// +// A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent). +// +// Dirents currently do not attempt to free entries that lack application references under +// memory pressure. +type Dirent struct { + // AtomicRefCount is our reference count. + refs.AtomicRefCount + + // userVisible indicates whether the Dirent is visible to the user or + // not. Only user-visible Dirents should save inode mappings in + // save/restore, as only they hold the real path to the underlying + // inode. + // + // See newDirent and Dirent.afterLoad. + userVisible bool + + // Inode is the underlying file object. + // + // Inode is exported currently to assist in implementing overlay Inodes (where a + // Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with + // another Inode). This is normally done before the Dirent is parented (there are + // no external references to it). + // + // Other objects in the VFS may take a reference to this Inode but only while holding + // a reference to this Dirent. + Inode *Inode + + // name is the name (i.e. basename) of this entry. + // + // N.B. name is protected by parent.mu, not this node's mu! + name string + + // parent is the parent directory. + // + // We hold a hard reference to the parent. + // + // parent is protected by renameMu. + parent *Dirent + + // deleted may be set atomically when removed. + deleted int32 `state:"nosave"` + + // frozen indicates this entry can't walk to unknown nodes. 
+ frozen bool + + // mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED. + mounted bool + + // direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches + // and their contents are not saved. + direntEntry `state:"nosave"` + + // dirMu is a read-write mutex that protects caching decisions made by directory operations. + // Lock ordering: dirMu must be taken before mu (see below). Details: + // + // dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename. + // + // Creation and Removal operations must be synchronized with Walk to prevent stale negative + // caching. Note that this requirement is not specific to a _Dirent_ doing negative caching. + // The following race exists at any level of the VFS: + // + // For an object D that represents a directory, containing a cache of non-existent paths, + // protected by D.cacheMu: + // + // T1: T2: + // D.lookup(name) + // --> ENOENT + // D.create(name) + // --> success + // D.cacheMu.Lock + // delete(D.cache, name) + // D.cacheMu.Unlock + // D.cacheMu.Lock + // D.cache[name] = true + // D.cacheMu.Unlock + // + // D.lookup(name) + // D.cacheMu.Lock + // if D.cache[name] { + // --> ENOENT (wrong) + // } + // D.cacheMu.Lock + // + // Correct: + // + // T1: T2: + // D.cacheMu.Lock + // D.lookup(name) + // --> ENOENT + // D.cache[name] = true + // D.cacheMu.Unlock + // D.cacheMu.Lock + // D.create(name) + // --> success + // delete(D.cache, name) + // D.cacheMu.Unlock + // + // D.cacheMu.Lock + // D.lookup(name) + // --> EXISTS (right) + // D.cacheMu.Unlock + // + // Note that the above "correct" solution causes too much lock contention: all lookups are + // synchronized with each other. This is a problem because lookups are involved in any VFS + // path operation. + // + // A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect + // concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map + // in general. + // + // This allows for concurrent Walks to be executed in order to pipeline lookups. For instance + // for a hot directory /a/b, threads T1, T2, T3 will only block on each other update the + // children map of /a/b when their individual lookups complete. + // + // T1: T2: T3: + // stat(/a/b/c) stat(/a/b/d) stat(/a/b/e) + dirMu sync.RWMutex `state:"nosave"` + + // mu protects the below fields. Lock ordering: mu must be taken after dirMu. + mu sync.Mutex `state:"nosave"` + + // children are cached via weak references. + children map[string]*refs.WeakRef +} + +// NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller +// holds the only reference to the Dirent. Parents may call hashChild to parent this Dirent. +func NewDirent(inode *Inode, name string) *Dirent { + d := newDirent(inode, name) + allDirents.add(d) + d.userVisible = true + return d +} + +// NewTransientDirent creates a transient Dirent that shouldn't actually be +// visible to users. +func NewTransientDirent(inode *Inode) *Dirent { + return newDirent(inode, "transient") +} + +func newDirent(inode *Inode, name string) *Dirent { + // The Dirent needs to maintain one reference to MountSource. + if inode != nil { + inode.MountSource.IncDirentRefs() + } + return &Dirent{ + Inode: inode, + name: name, + children: make(map[string]*refs.WeakRef), + } +} + +// NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent. 
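+//
+// As described above, a negative Dirent holds no Inode and takes no
+// reference on its parent; it only caches the fact that a path does not
+// exist.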
+func NewNegativeDirent(name string) *Dirent { + return newDirent(nil, name) +} + +// IsRoot returns true if d is a root Dirent. +func (d *Dirent) IsRoot() bool { + return d.parent == nil +} + +// IsNegative returns true if d represents a path that does not exist. +func (d *Dirent) IsNegative() bool { + return d.Inode == nil +} + +// hashChild will hash child into the children list of its new parent d, carrying over +// any "frozen" state from d. +// +// Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must +// validate the returned unhashed weak reference. Common cases: +// +// * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented). +// * Create: hashing a positive Dirent unhashes a negative Dirent. +// * Lookup: hashing any Dirent should not unhash any other Dirent. +// +// Preconditions: +// * d.mu must be held. +// * child must be a root Dirent. +func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) { + if !child.IsRoot() { + panic("hashChild must be a root Dirent") + } + + // Assign parentage. + child.parent = d + + // Avoid letting negative Dirents take a reference on their parent; these Dirents + // don't have a role outside of the Dirent cache and should not keep their parent + // indefinitely pinned. + if !child.IsNegative() { + // Positive dirents must take a reference on their parent. + d.IncRef() + } + + // Carry over parent's frozen state. + child.frozen = d.frozen + + return d.hashChildParentSet(child) +} + +// hashChildParentSet will rehash child into the children list of its parent d. +// +// Assumes that child.parent = d already. +func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) { + if child.parent != d { + panic("hashChildParentSet assumes the child already belongs to the parent") + } + + // Save any replaced child so our caller can validate it. + old, ok := d.children[child.name] + + // Hash the child. + d.children[child.name] = refs.NewWeakRef(child, nil) + + // Return any replaced child. + return old, ok +} + +// SyncAll iterates through mount points under d and writes back their buffered +// modifications to filesystems. +func (d *Dirent) SyncAll(ctx context.Context) { + d.mu.Lock() + defer d.mu.Unlock() + + // For negative Dirents there is nothing to sync. By definition these are + // leaves (there is nothing left to traverse). + if d.IsNegative() { + return + } + + // There is nothing to sync for a read-only filesystem. + if !d.Inode.MountSource.Flags.ReadOnly { + // FIXME: This should be a mount traversal, not a + // Dirent traversal, because some Inodes that need to be synced + // may no longer be reachable by name (after sys_unlink). + // + // Write out metadata, dirty page cached pages, and sync disk/remote + // caches. + d.Inode.WriteOut(ctx) + } + + // Continue iterating through other mounted filesystems. + for _, w := range d.children { + if child := w.Get(); child != nil { + child.(*Dirent).SyncAll(ctx) + child.DecRef() + } + } +} + +// FullName returns the fully-qualified name and a boolean value representing +// whether this Dirent was a descendant of root. +// If the root argument is nil it is assumed to be the root of the Dirent tree. +func (d *Dirent) FullName(root *Dirent) (string, bool) { + renameMu.RLock() + defer renameMu.RUnlock() + return d.fullName(root) +} + +// fullName returns the fully-qualified name and a boolean value representing +// if the root node was reachable from this Dirent. 
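+//
+// Precondition: renameMu must be held for reading, since fullName follows
+// d.parent links (see FullName above).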
+func (d *Dirent) fullName(root *Dirent) (string, bool) { + if d == root { + return "/", true + } + + if d.IsRoot() { + if root != nil { + // We reached the top of the Dirent tree but did not encounter + // the given root. Return false for reachable so the caller + // can handle this situation accordingly. + return d.name, false + } + return d.name, true + } + + // Traverse up to parent. + d.parent.mu.Lock() + name := d.name + d.parent.mu.Unlock() + parentName, reachable := d.parent.fullName(root) + s := path.Join(parentName, name) + if atomic.LoadInt32(&d.deleted) != 0 { + return s + " (deleted)", reachable + } + return s, reachable +} + +func (d *Dirent) freeze() { + if d.frozen { + // Already frozen. + return + } + d.frozen = true + + // Take a reference when freezing. + for _, w := range d.children { + if child := w.Get(); child != nil { + // NOTE: We would normally drop the reference here. But + // instead we're hanging on to it. + ch := child.(*Dirent) + ch.Freeze() + } + } + + // Drop all expired weak references. + d.flush() +} + +// Freeze prevents this dirent from walking to more nodes. Freeze is applied +// recursively to all children. +// +// If this particular Dirent represents a Virtual node, then Walks and Creates +// may proceed as before. +// +// Freeze can only be called before the application starts running, otherwise +// the root it might be out of sync with the application root if modified by +// sys_chroot. +func (d *Dirent) Freeze() { + d.mu.Lock() + defer d.mu.Unlock() + d.freeze() +} + +// descendantOf returns true if the receiver dirent is equal to, or a +// descendant of, the argument dirent. +// +// d.mu must be held. +func (d *Dirent) descendantOf(p *Dirent) bool { + if d == p { + return true + } + if d.IsRoot() { + return false + } + return d.parent.descendantOf(p) +} + +// walk walks to path name starting at the dirent, and will not traverse above +// root Dirent. +// +// If walkMayUnlock is true then walk can unlock d.mu to execute a slow +// Inode.Lookup, otherwise walk will keep d.mu locked. +// +// Preconditions: +// - d.mu must be held. +// - name must must not contain "/"s. +func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) { + if !IsDir(d.Inode.StableAttr) { + return nil, syscall.ENOTDIR + } + if name == "" || name == "." { + d.IncRef() + return d, nil + } else if name == ".." { + renameMu.RLock() + // Respect the chroot. Note that in Linux there is no check to enforce + // that d is a descendant of root. + if d == root { + d.IncRef() + renameMu.RUnlock() + return d, nil + } + // Are we already at the root? Then ".." is ".". + if d.IsRoot() { + d.IncRef() + renameMu.RUnlock() + return d, nil + } + d.parent.IncRef() + renameMu.RUnlock() + return d.parent, nil + } + + if w, ok := d.children[name]; ok { + // Try to resolve the weak reference to a hard reference. + if child := w.Get(); child != nil { + cd := child.(*Dirent) + + // Is this a negative Dirent? + if cd.IsNegative() { + // Don't leak a reference; this doesn't matter as much for negative Dirents, + // which don't hold a hard reference on their parent (their parent holds a + // hard reference on them, and they contain virtually no state). But this is + // good house-keeping. + child.DecRef() + return nil, syscall.ENOENT + } + + // Do we need to revalidate this child? + // + // We never allow the file system to revalidate mounts, that could cause them + // to unexpectedly drop out before umount. 
+ if cd.mounted || !cd.Inode.MountSource.Revalidate(cd) { + // Good to go. This is the fast-path. + return cd, nil + } + + // If we're revalidating a child, we must ensure all inotify watches release + // their pins on the child. Inotify doesn't properly support filesystems that + // revalidate dirents (since watches are lost on revalidation), but if we fail + // to unpin the watches child will never be GCed. + cd.Inode.Watches.Unpin(cd) + + // This child needs to be revalidated, fallthrough to unhash it. Make sure + // to not leak a reference from Get(). + // + // Note that previous lookups may still have a reference to this stale child; + // this can't be helped, but we can ensure that *new* lookups are up-to-date. + child.DecRef() + } + + // Either our weak reference expired or we need to revalidate it. Unhash child first, we're + // about to replace it. + delete(d.children, name) + w.Drop() + } + + // Are we allowed to do the lookup? + if d.frozen && !d.Inode.IsVirtual() { + return nil, syscall.ENOENT + } + + // Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be expensive, + // if possible release the lock and re-acquire it. + if walkMayUnlock { + d.mu.Unlock() + } + c, err := d.Inode.Lookup(ctx, name) + if walkMayUnlock { + d.mu.Lock() + } + // No dice. + if err != nil { + return nil, err + } + + // Sanity check c, its name must be consistent. + if c.name != name { + panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name)) + } + + // Now that we have the lock again, check if we raced. + if w, ok := d.children[name]; ok { + // Someone else looked up or created a child at name before us. + if child := w.Get(); child != nil { + cd := child.(*Dirent) + + // There are active references to the existing child, prefer it to the one we + // retrieved from Lookup. Likely the Lookup happened very close to the insertion + // of child, so considering one stale over the other is fairly arbitrary. + c.DecRef() + + // The child that was installed could be negative. + if cd.IsNegative() { + // If so, don't leak a reference and short circuit. + child.DecRef() + return nil, syscall.ENOENT + } + + // We make the judgement call that if c raced with cd they are close enough to have + // the same staleness, so we don't attempt to revalidate cd. In Linux revalidations + // can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this. + return cd, nil + } + + // Weak reference expired. We went through a full cycle of create/destroy in the time + // we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child + // we looked up. + delete(d.children, name) + w.Drop() + } + + // Give the looked up child a parent. We cannot kick out entries, since we just checked above + // that there is nothing at name in d's children list. + if _, kicked := d.hashChild(c); kicked { + // Yell loudly. + panic(fmt.Sprintf("hashed child %q over existing child", c.name)) + } + + // Is this a negative Dirent? + if c.IsNegative() { + // Don't drop a reference on the negative Dirent, it was just installed and this is the + // only reference we'll ever get. d owns the reference. + return nil, syscall.ENOENT + } + + // Return the positive Dirent. + return c, nil +} + +// Walk walks to a new dirent, and will not walk higher than the given root +// Dirent, which must not be nil. 
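+//
+// A minimal usage sketch, assuming the caller already holds references on
+// dir and root (for example obtained from a MountNamespace):
+//
+//	child, err := dir.Walk(ctx, root, "etc")
+//	if err != nil {
+//		return err
+//	}
+//	// Walk returns with a reference held on child; drop it when done.
+//	defer child.DecRef()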
+func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) { + if root == nil { + panic("Dirent.Walk: root must not be nil") + } + + d.dirMu.RLock() + d.mu.Lock() + child, err := d.walk(ctx, root, name, true /* may unlock */) + d.mu.Unlock() + d.dirMu.RUnlock() + + return child, err +} + +// exists returns true if name exists in relation to d. +// +// Preconditions: d.mu must be held. +func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool { + child, err := d.walk(ctx, root, name, true /* may unlock */) + if err != nil { + // Child may not exist. + return false + } + // Child exists. + child.DecRef() + return true +} + +// lockDirectory should be called for any operation that changes this `d`s +// children (creating or removing them). +func (d *Dirent) lockDirectory() func() { + if d.Inode.overlay != nil { + // overlay copyUp may need to look at Dirent parents, and hence + // may need renameMu. + renameMu.RLock() + d.dirMu.Lock() + d.mu.Lock() + return func() { + d.mu.Unlock() + d.dirMu.Unlock() + renameMu.RUnlock() + } + } + + d.dirMu.Lock() + d.mu.Lock() + return func() { + d.mu.Unlock() + d.dirMu.Unlock() + } +} + +// Create creates a new regular file in this directory. +func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { + unlock := d.lockDirectory() + defer unlock() + + // Does something already exist? + if d.exists(ctx, root, name) { + return nil, syscall.EEXIST + } + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return nil, syscall.ENOENT + } + + // Try the create. We need to trust the file system to return EEXIST (or something + // that will translate to EEXIST) if name already exists. + file, err := d.Inode.Create(ctx, d, name, flags, perms) + if err != nil { + return nil, err + } + child := file.Dirent + + // Sanity check c, its name must be consistent. + if child.name != name { + panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) + } + + // File systems cannot return a negative Dirent on Create, that makes no sense. + if child.IsNegative() { + panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name)) + } + + // Hash the child into its parent. We can only kick out a Dirent if it is negative + // (we are replacing something that does not exist with something that now does). + if w, kicked := d.hashChild(child); kicked { + if old := w.Get(); old != nil { + if !old.(*Dirent).IsNegative() { + panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) + } + // Don't leak a reference. + old.DecRef() + + // Drop d's reference. + old.DecRef() + } + + // Finally drop the useless weak reference on the floor. + w.Drop() + } + + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + + // Allow the file system to take extra references on c. + child.maybeExtendReference() + + // Return the reference and the new file. When the last reference to + // the file is dropped, file.Dirent may no longer be cached. + return file, nil +} + +// genericCreate executes create if name does not exist. Removes a negative Dirent at name if +// create succeeds. +// +// Preconditions: d.mu must be held. +func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { + // Does something already exist? + if d.exists(ctx, root, name) { + return syscall.EEXIST + } + + // Are we frozen? 
+ if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Execute the create operation. + if err := create(); err != nil { + return err + } + + // Remove any negative Dirent. We've already asserted above with d.exists + // that the only thing remaining here can be a negative Dirent. + if w, ok := d.children[name]; ok { + // Same as Create. + if old := w.Get(); old != nil { + if !old.(*Dirent).IsNegative() { + panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) + } + // Don't leak a reference. + old.DecRef() + + // Drop d's reference. + old.DecRef() + } + + // Unhash the negative Dirent, name needs to exist now. + delete(d.children, name) + + // Finally drop the useless weak reference on the floor. + w.Drop() + } + + return nil +} + +// CreateLink creates a new link in this directory. +func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, newname, func() error { + if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil { + return err + } + d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0) + return nil + }) +} + +// CreateHardLink creates a new hard link in this directory. +func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error { + unlock := d.lockDirectory() + defer unlock() + + // Make sure that target does not span filesystems. + if d.Inode.MountSource != target.Inode.MountSource { + return syscall.EXDEV + } + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil { + return err + } + target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change. + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) +} + +// CreateDirectory creates a new directory under this dirent. +func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0) + return nil + }) +} + +// Bind satisfies the InodeOperations interface; otherwise same as GetFile. +func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, socket unix.BoundEndpoint, perms FilePermissions) error { + d.dirMu.Lock() + defer d.dirMu.Unlock() + d.mu.Lock() + defer d.mu.Unlock() + + err := d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.Bind(ctx, name, socket, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) + if err == syscall.EEXIST { + return syscall.EADDRINUSE + } + return err +} + +// CreateFifo creates a new named pipe under this dirent. +func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { + unlock := d.lockDirectory() + defer unlock() + + return d.genericCreate(ctx, root, name, func() error { + if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil { + return err + } + d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) + return nil + }) +} + +// getDotAttrs returns the DentAttrs corresponding to "." and ".." directories. +func (d *Dirent) getDotAttrs(root *Dirent) (DentAttr, DentAttr) { + // Get '.'. 
+ sattr := d.Inode.StableAttr + dot := DentAttr{ + Type: sattr.Type, + InodeID: sattr.InodeID, + } + + // Get '..'. + if !d.IsRoot() && d.descendantOf(root) { + // Dirent is a descendant of the root. Get its parent's attrs. + psattr := d.parent.Inode.StableAttr + dotdot := DentAttr{ + Type: psattr.Type, + InodeID: psattr.InodeID, + } + return dot, dotdot + } + // Dirent is either root or not a descendant of the root. ".." is the + // same as ".". + return dot, dot +} + +// readdirFrozen returns readdir results based solely on the frozen children. +func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) { + // Collect attrs for "." and "..". + attrs := make(map[string]DentAttr) + names := []string{".", ".."} + attrs["."], attrs[".."] = d.getDotAttrs(root) + + // Get info from all children. + d.mu.Lock() + defer d.mu.Unlock() + for name, w := range d.children { + if child := w.Get(); child != nil { + defer child.DecRef() + + // Skip negative children. + if child.(*Dirent).IsNegative() { + continue + } + + sattr := child.(*Dirent).Inode.StableAttr + attrs[name] = DentAttr{ + Type: sattr.Type, + InodeID: sattr.InodeID, + } + names = append(names, name) + } + } + + sort.Strings(names) + + if int(offset) >= len(names) { + return offset, nil + } + names = names[int(offset):] + for _, name := range names { + if err := dirCtx.DirEmit(name, attrs[name]); err != nil { + return offset, err + } + offset++ + } + return offset, nil +} + +// DirIterator is an open directory containing directory entries that can be read. +type DirIterator interface { + // IterateDir emits directory entries by calling dirCtx.EmitDir, beginning + // with the entry at offset and returning the next directory offset. + // + // Entries for "." and ".." must *not* be included. + // + // If the offset returned is the same as the argument offset, then + // nothing has been serialized. This is equivalent to reaching EOF. + // In this case serializer.Written() should return 0. + // + // The order of entries to emit must be consistent between Readdir + // calls, and must start with the given offset. + // + // The caller must ensure that this operation is permitted. + IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) +} + +// DirentReaddir serializes the directory entries of d including "." and "..". +// +// Arguments: +// +// * d: the Dirent of the directory being read; required to provide "." and "..". +// * it: the directory iterator; which represents an open directory handle. +// * root: fs root; if d is equal to the root, then '..' will refer to d. +// * ctx: context provided to file systems in order to select and serialize entries. +// * offset: the current directory offset. +// +// Returns the offset of the *next* element which was not serialized. +func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { + offset, err := direntReaddir(ctx, d, it, root, dirCtx, offset) + // Serializing any directory entries at all means success. 
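+	// This roughly mirrors getdents(2): if some entries were already
+	// serialized before the error occurred, report a successful short read.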
+ if dirCtx.Serializer.Written() > 0 { + return offset, nil + } + return offset, err +} + +func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { + if root == nil { + panic("Dirent.Readdir: root must not be nil") + } + if dirCtx.Serializer == nil { + panic("Dirent.Readdir: serializer must not be nil") + } + if d.frozen { + return d.readdirFrozen(root, offset, dirCtx) + } + + // Check that this is actually a directory before emitting anything. + // Once we have written entries for "." and "..", future errors from + // IterateDir will be hidden. + if !IsDir(d.Inode.StableAttr) { + return 0, syserror.ENOTDIR + } + + // Collect attrs for "." and "..". + dot, dotdot := d.getDotAttrs(root) + + // Emit "." and ".." if the offset is low enough. + if offset == 0 { + // Serialize ".". + if err := dirCtx.DirEmit(".", dot); err != nil { + return offset, err + } + offset++ + } + if offset == 1 { + // Serialize "..". + if err := dirCtx.DirEmit("..", dotdot); err != nil { + return offset, err + } + offset++ + } + + // it.IterateDir should be passed an offset that does not include the + // initial dot elements. We will add them back later. + offset -= 2 + newOffset, err := it.IterateDir(ctx, dirCtx, int(offset)) + if int64(newOffset) < offset { + panic(fmt.Sprintf("node.Readdir returned offset %v less that input offset %v", offset, newOffset)) + } + // Add the initial nodes back to the offset count. + newOffset += 2 + return int64(newOffset), err +} + +// flush flushes all weak references recursively, and removes any cached +// references to children. +// +// Preconditions: d.mu must be held. +func (d *Dirent) flush() { + expired := make(map[string]*refs.WeakRef) + for n, w := range d.children { + // Call flush recursively on each child before removing our + // reference on it, and removing the cache's reference. + if child := w.Get(); child != nil { + cd := child.(*Dirent) + + if !cd.IsNegative() { + // Flush the child. + cd.mu.Lock() + cd.flush() + cd.mu.Unlock() + + // Allow the file system to drop extra references on child. + cd.dropExtendedReference() + } + + // Don't leak a reference. + child.DecRef() + } + // Check if the child dirent is closed, and mark it as expired if it is. + // We must call w.Get() again here, since the child could have been closed + // by the calls to flush() and cache.Remove() in the above if-block. + if child := w.Get(); child != nil { + child.DecRef() + } else { + expired[n] = w + } + } + + // Remove expired entries. + for n, w := range expired { + delete(d.children, n) + w.Drop() + } +} + +// Busy indicates whether this Dirent is a mount point or root dirent, or has +// active positive children. +// +// This is expensive, since it flushes the children cache. +// +// TODO: Fix this busy-ness check. +func (d *Dirent) Busy() bool { + d.mu.Lock() + defer d.mu.Unlock() + + if d.mounted || d.parent == nil { + return true + } + + // Flush any cached references to children that are doomed. + d.flush() + + // Count positive children. + var nonNegative int + for _, w := range d.children { + if child := w.Get(); child != nil { + if !child.(*Dirent).IsNegative() { + nonNegative++ + } + child.DecRef() + } + } + return nonNegative > 0 +} + +// mount mounts a new dirent with the given inode over d. +// +// Precondition: must be called with mm.withMountLocked held on `d`. +func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) { + // Did we race with deletion? 
+ if atomic.LoadInt32(&d.deleted) != 0 { + return nil, syserror.ENOENT + } + + // Refuse to mount a symlink. + // + // See Linux equivalent in fs/namespace.c:do_add_mount. + if IsSymlink(inode.StableAttr) { + return nil, syserror.EINVAL + } + + // Are we frozen? + if d.parent.frozen && !d.parent.Inode.IsVirtual() { + return nil, syserror.ENOENT + } + + // Dirent that'll replace d. + // + // Note that NewDirent returns with one reference taken; the reference + // is donated to the caller as the mount reference. + replacement := NewDirent(inode, d.name) + replacement.mounted = true + + weakRef, ok := d.parent.hashChild(replacement) + if !ok { + panic("mount must mount over an existing dirent") + } + weakRef.Drop() + + // Note that even though `d` is now hidden, it still holds a reference + // to its parent. + return replacement, nil +} + +// unmount unmounts `d` and replaces it with the last Dirent that was in its +// place, supplied by the MountNamespace as `replacement`. +// +// Precondition: must be called with mm.withMountLocked held on `d`. +func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { + // Did we race with deletion? + if atomic.LoadInt32(&d.deleted) != 0 { + return syserror.ENOENT + } + + // Are we frozen? + if d.parent.frozen && !d.parent.Inode.IsVirtual() { + return syserror.ENOENT + } + + // Remount our former child in its place. + // + // As replacement used to be our child, it must already have the right + // parent. + weakRef, ok := d.parent.hashChildParentSet(replacement) + if !ok { + panic("mount must mount over an existing dirent") + } + weakRef.Drop() + + // d is not reachable anymore, and hence not mounted anymore. + d.mounted = false + + // Drop mount reference. + d.DecRef() + return nil +} + +// Remove removes the given file or symlink. The root dirent is used to +// resolve name, and must not be nil. +func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string) error { + // Check the root. + if root == nil { + panic("Dirent.Remove: root must not be nil") + } + + unlock := d.lockDirectory() + defer unlock() + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Try to walk to the node. + child, err := d.walk(ctx, root, name, false /* may unlock */) + if err != nil { + // Child does not exist. + return err + } + defer child.DecRef() + + // Remove cannot remove directories. + if IsDir(child.Inode.StableAttr) { + return syscall.EISDIR + } + + // Remove cannot remove a mount point. + if child.Busy() { + return syscall.EBUSY + } + + // Try to remove name on the file system. + if err := d.Inode.Remove(ctx, d, child); err != nil { + return err + } + + // Link count changed, this only applies to non-directory nodes. + child.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) + + // Mark name as deleted and remove from children. + atomic.StoreInt32(&child.deleted, 1) + if w, ok := d.children[name]; ok { + delete(d.children, name) + w.Drop() + } + + // Allow the file system to drop extra references on child. + child.dropExtendedReference() + + // Finally, let inotify know the child is being unlinked. Drop any extra + // refs from inotify to this child dirent. This doesn't necessarily mean the + // watches on the underlying inode will be destroyed, since the underlying + // inode may have other links. If this was the last link, the events for the + // watch removal will be queued by the inode destructor. 
+ child.Inode.Watches.MarkUnlinked() + child.Inode.Watches.Unpin(child) + d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) + + return nil +} + +// RemoveDirectory removes the given directory. The root dirent is used to +// resolve name, and must not be nil. +func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error { + // Check the root. + if root == nil { + panic("Dirent.Remove: root must not be nil") + } + + unlock := d.lockDirectory() + defer unlock() + + // Are we frozen? + if d.frozen && !d.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Check for dots. + if name == "." { + // Rejected as the last component by rmdir(2). + return syscall.EINVAL + } + if name == ".." { + // If d was found, then its parent is not empty. + return syscall.ENOTEMPTY + } + + // Try to walk to the node. + child, err := d.walk(ctx, root, name, false /* may unlock */) + if err != nil { + // Child does not exist. + return err + } + defer child.DecRef() + + // RemoveDirectory can only remove directories. + if !IsDir(child.Inode.StableAttr) { + return syscall.ENOTDIR + } + + // Remove cannot remove a mount point. + if child.Busy() { + return syscall.EBUSY + } + + // Try to remove name on the file system. + if err := d.Inode.Remove(ctx, d, child); err != nil { + return err + } + + // Mark name as deleted and remove from children. + atomic.StoreInt32(&child.deleted, 1) + if w, ok := d.children[name]; ok { + delete(d.children, name) + w.Drop() + } + + // Allow the file system to drop extra references on child. + child.dropExtendedReference() + + // Finally, let inotify know the child is being unlinked. Drop any extra + // refs from inotify to this child dirent. + child.Inode.Watches.MarkUnlinked() + child.Inode.Watches.Unpin(child) + d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) + + return nil +} + +// destroy closes this node and all children. +func (d *Dirent) destroy() { + if d.IsNegative() { + // Nothing to tear-down and no parent references to drop, since a negative + // Dirent does not take a references on its parent, has no Inode and no children. + return + } + + var wg sync.WaitGroup + defer wg.Wait() + d.mu.Lock() + defer d.mu.Unlock() + + // Drop all weak references. + for _, w := range d.children { + w.Drop() + } + d.children = nil + + allDirents.remove(d) + + // Drop our reference to the Inode. + d.Inode.DecRef() + + // Allow the Dirent to be GC'ed after this point, since the Inode may still + // be referenced after the Dirent is destroyed (for instance by filesystem + // internal caches or hard links). + d.Inode = nil + + // Drop the reference we have on our parent if we took one. renameMu doesn't need to be + // held because d can't be reparented without any references to it left. + if d.parent != nil { + d.parent.DecRef() + } +} + +// IncRef increases the Dirent's refcount as well as its mount's refcount. +// +// IncRef implements RefCounter.IncRef. +func (d *Dirent) IncRef() { + if d.Inode != nil { + d.Inode.MountSource.IncDirentRefs() + } + d.AtomicRefCount.IncRef() +} + +// TryIncRef implements RefCounter.TryIncRef. +func (d *Dirent) TryIncRef() bool { + ok := d.AtomicRefCount.TryIncRef() + if ok && d.Inode != nil { + d.Inode.MountSource.IncDirentRefs() + } + return ok +} + +// DecRef decreases the Dirent's refcount and drops its reference on its mount. +// +// DecRef implements RefCounter.DecRef with destructor d.destroy. +func (d *Dirent) DecRef() { + if d.Inode != nil { + // Keep mount around, since DecRef may destroy d.Inode. 
+ msrc := d.Inode.MountSource + d.DecRefWithDestructor(d.destroy) + msrc.DecDirentRefs() + } else { + d.DecRefWithDestructor(d.destroy) + } +} + +// InotifyEvent notifies all watches on the inode for this dirent and its parent +// of potential events. The events may not actually propagate up to the user, +// depending on the event masks. InotifyEvent automatically provides the name of +// the current dirent as the subject of the event as required, and adds the +// IN_ISDIR flag for dirents that refer to directories. +func (d *Dirent) InotifyEvent(events, cookie uint32) { + // N.B. We don't defer the unlocks because InotifyEvent is in the hot + // path of all IO operations, and the defers cost too much for small IO + // operations. + renameMu.RLock() + + if IsDir(d.Inode.StableAttr) { + events |= linux.IN_ISDIR + } + + // The ordering below is important, Linux always notifies the parent first. + if d.parent != nil { + d.parent.Inode.Watches.Notify(d.name, events, cookie) + } + d.Inode.Watches.Notify("", events, cookie) + + renameMu.RUnlock() +} + +// maybeExtendReference caches a reference on this Dirent if +// MountSourceOperations.Keep returns true. +func (d *Dirent) maybeExtendReference() { + if msrc := d.Inode.MountSource; msrc.Keep(d) { + msrc.fscache.Add(d) + } +} + +// dropExtendedReference drops any cached reference held by the +// MountSource on the dirent. +func (d *Dirent) dropExtendedReference() { + d.Inode.MountSource.fscache.Remove(d) +} + +// lockForRename takes locks on oldParent and newParent as required by Rename +// and returns a function that will unlock the locks taken. The returned +// function must be called even if a non-nil error is returned. +func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) { + if oldParent == newParent { + oldParent.mu.Lock() + return oldParent.mu.Unlock, nil + } + + // Renaming between directories is a bit subtle: + // + // - A concurrent cross-directory Rename may try to lock in the opposite + // order; take renameMu to prevent this from happening. + // + // - If either directory is an ancestor of the other, then a concurrent + // Remove may lock the descendant (in DecRef -> closeAll) while holding a + // lock on the ancestor; to avoid this, ensure we take locks in the same + // ancestor-to-descendant order. (Holding renameMu prevents this + // relationship from changing.) + renameMu.Lock() + + // First check if newParent is a descendant of oldParent. + child := newParent + for p := newParent.parent; p != nil; p = p.parent { + if p == oldParent { + oldParent.mu.Lock() + newParent.mu.Lock() + var err error + if child.name == oldName { + // newParent is not just a descendant of oldParent, but + // more specifically of oldParent/oldName. That is, we're + // trying to rename something into a subdirectory of + // itself. 
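+				// Compare rename(2) EINVAL: "an attempt was made to
+				// make a directory a subdirectory of itself".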
+ err = syscall.EINVAL + } + return func() { + newParent.mu.Unlock() + oldParent.mu.Unlock() + renameMu.Unlock() + }, err + } + child = p + } + + // Otherwise, either oldParent is a descendant of newParent or the two + // have no relationship; in either case we can do this: + newParent.mu.Lock() + oldParent.mu.Lock() + return func() { + oldParent.mu.Unlock() + newParent.mu.Unlock() + renameMu.Unlock() + }, nil +} + +func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error { + uattr, err := dir.Inode.UnstableAttr(ctx) + if err != nil { + return syserror.EPERM + } + if !uattr.Perms.Sticky { + return nil + } + + creds := auth.CredentialsFromContext(ctx) + if uattr.Owner.UID == creds.EffectiveKUID { + return nil + } + + vuattr, err := victim.Inode.UnstableAttr(ctx) + if err != nil { + return syserror.EPERM + } + if vuattr.Owner.UID == creds.EffectiveKUID { + return nil + } + if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) { + return nil + } + return syserror.EPERM +} + +// MayDelete determines whether `name`, a child of `dir`, can be deleted or +// renamed by `ctx`. +// +// Compare Linux kernel fs/namei.c:may_delete. +func MayDelete(ctx context.Context, root, dir *Dirent, name string) error { + victim, err := dir.Walk(ctx, root, name) + if err != nil { + return err + } + defer victim.DecRef() + + return mayDelete(ctx, dir, victim) +} + +func mayDelete(ctx context.Context, dir *Dirent, victim *Dirent) error { + if err := dir.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + + return checkSticky(ctx, dir, victim) +} + +// Rename atomically converts the child of oldParent named oldName to a +// child of newParent named newName. +func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { + if root == nil { + panic("Rename: root must not be nil") + } + if oldParent == newParent && oldName == newName { + return nil + } + + // Acquire global renameMu lock, and mu locks on oldParent/newParent. + unlock, err := lockForRename(oldParent, oldName, newParent, newName) + defer unlock() + if err != nil { + return err + } + + // Are we frozen? + // TODO: Is this the right errno? + if oldParent.frozen && !oldParent.Inode.IsVirtual() { + return syscall.ENOENT + } + if newParent.frozen && !newParent.Inode.IsVirtual() { + return syscall.ENOENT + } + + // Check constraints on the object being renamed. + renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */) + if err != nil { + return err + } + defer renamed.DecRef() + + // Make sure we have write permissions on old and new parent. + if err := mayDelete(ctx, oldParent, renamed); err != nil { + return err + } + if newParent != oldParent { + if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { + return err + } + } + + // Source should not be an ancestor of the target. + if renamed == newParent { + return syscall.EINVAL + } + + // Is the thing we're trying to rename busy? + if renamed.Busy() { + return syscall.EBUSY + } + + // Per rename(2): "... EACCES: ... or oldpath is a directory and does not + // allow write permission (needed to update the .. entry)." + if IsDir(renamed.Inode.StableAttr) { + if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil { + return err + } + } + + // Check constraints on the object being replaced, if any. 
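+	// If the walk fails there is nothing reachable to replace, and the checks
+	// below are simply skipped.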
+ replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */) + if err == nil { + defer replaced.DecRef() + + // Target should not be an ancestor of source. + if replaced == oldParent { + // Why is this not EINVAL? See fs/namei.c. + return syscall.ENOTEMPTY + } + + // Is the thing we're trying to replace busy? + if replaced.Busy() { + return syscall.EBUSY + } + + // Require that a directory is replaced by a directory. + oldIsDir := IsDir(renamed.Inode.StableAttr) + newIsDir := IsDir(replaced.Inode.StableAttr) + if !newIsDir && oldIsDir { + return syscall.ENOTDIR + } + if !oldIsDir && newIsDir { + return syscall.EISDIR + } + + // Allow the file system to drop extra references on replaced. + replaced.dropExtendedReference() + + // NOTE: Keeping a dirent + // open across renames is currently broken for multiple + // reasons, so we flush all references on the replaced node and + // its children. + replaced.Inode.Watches.Unpin(replaced) + replaced.flush() + } + + if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName); err != nil { + return err + } + + renamed.name = newName + renamed.parent = newParent + if oldParent != newParent { + // Reparent the reference held by renamed.parent. oldParent.DecRef + // can't destroy oldParent (and try to retake its lock) because + // Rename's caller must be holding a reference. + newParent.IncRef() + oldParent.DecRef() + } + if w, ok := newParent.children[newName]; ok { + w.Drop() + delete(newParent.children, newName) + } + if w, ok := oldParent.children[oldName]; ok { + w.Drop() + delete(oldParent.children, oldName) + } + + // Add a weak reference from the new parent. This ensures that the child + // can still be found from the new parent if a prior hard reference is + // held on renamed. + // + // This is required for file lock correctness because file locks are per-Dirent + // and without maintaining the a cached child (via a weak reference) for renamed, + // multiple Dirents can correspond to the same resource (by virtue of the renamed + // Dirent being unreachable by its parent and it being looked up). + newParent.children[newName] = refs.NewWeakRef(renamed, nil) + + // Queue inotify events for the rename. + var ev uint32 + if IsDir(renamed.Inode.StableAttr) { + ev |= linux.IN_ISDIR + } + + cookie := uniqueid.InotifyCookie(ctx) + oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie) + newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie) + // Somewhat surprisingly, self move events do not have a cookie. + renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0) + + // Allow the file system to drop extra references on renamed. + renamed.dropExtendedReference() + + // Same as replaced.flush above. + renamed.flush() + + return nil +} diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go new file mode 100644 index 000000000..e786e4f65 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache.go @@ -0,0 +1,142 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// DirentCache is an LRU cache of Dirents. The Dirent's refCount is +// incremented when it is added to the cache, and decremented when it is +// removed. +// +// A nil DirentCache corresponds to a cache with size 0. All methods can be +// called, but nothing is actually cached. +type DirentCache struct { + // Maximum size of the cache. This must be saved manually, to handle the case + // when cache is nil. + maxSize uint64 + + // mu protects currentSize and direntList. + mu sync.Mutex `state:"nosave"` + + // currentSize is the number of elements in the cache. It must be zero (i.e. + // the cache must be empty) on Save. + currentSize uint64 `state:"zerovalue"` + + // list is a direntList, an ilist of Dirents. New Dirents are added + // to the front of the list. Old Dirents are removed from the back of + // the list. It must be zerovalue (i.e. the cache must be empty) on Save. + list direntList `state:"zerovalue"` +} + +// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize +// is 0, nil is returned. +func NewDirentCache(maxSize uint64) *DirentCache { + return &DirentCache{ + maxSize: maxSize, + } +} + +// Add adds the element to the cache and increments the refCount. If the +// argument is already in the cache, it is moved to the front. An element is +// removed from the back if the cache is over capacity. +func (c *DirentCache) Add(d *Dirent) { + if c == nil || c.maxSize == 0 { + return + } + + c.mu.Lock() + if c.contains(d) { + // d is already in cache. Bump it to the front. + // currentSize and refCount are unaffected. + c.list.Remove(d) + c.list.PushFront(d) + c.mu.Unlock() + return + } + + // d is not in cache. Add it and take a reference. + c.list.PushFront(d) + d.IncRef() + c.currentSize++ + + // Remove the oldest until we are under the size limit. + for c.maxSize > 0 && c.currentSize > c.maxSize { + c.remove(c.list.Back()) + } + c.mu.Unlock() +} + +func (c *DirentCache) remove(d *Dirent) { + if !c.contains(d) { + panic(fmt.Sprintf("trying to remove %v, which is not in the dirent cache", d)) + } + c.list.Remove(d) + d.SetPrev(nil) + d.SetNext(nil) + d.DecRef() + c.currentSize-- +} + +// Remove removes the element from the cache and decrements its refCount. It +// also sets the previous and next elements to nil, which allows us to +// determine if a given element is in the cache. +func (c *DirentCache) Remove(d *Dirent) { + if c == nil || c.maxSize == 0 { + return + } + c.mu.Lock() + if !c.contains(d) { + c.mu.Unlock() + return + } + c.remove(d) + c.mu.Unlock() +} + +// Size returns the number of elements in the cache. +func (c *DirentCache) Size() uint64 { + if c == nil { + return 0 + } + c.mu.Lock() + size := c.currentSize + c.mu.Unlock() + return size +} + +func (c *DirentCache) contains(d *Dirent) bool { + // If d has a Prev or Next element, then it is in the cache. + if d.Prev() != nil || d.Next() != nil { + return true + } + // Otherwise, d is in the cache if it is the only element (and thus the + // first element). + return c.list.Front() == d +} + +// Invalidate removes all Dirents from the cache, caling DecRef on each. 
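+//
+// As with the other methods, calling Invalidate on a nil *DirentCache is a
+// no-op.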
+func (c *DirentCache) Invalidate() { + if c == nil { + return + } + c.mu.Lock() + for c.list.Front() != nil { + c.remove(c.list.Front()) + } + c.mu.Unlock() +} diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go new file mode 100644 index 000000000..82b7f6bd5 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -0,0 +1,157 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "testing" +) + +func TestDirentCache(t *testing.T) { + const maxSize = 5 + + c := NewDirentCache(maxSize) + + // Size starts at 0. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Create a Dirent d. + d := NewNegativeDirent("") + + // c does not contain d. + if got, want := c.contains(d), false; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add d to the cache. + c.Add(d) + + // Size is now 1. + if got, want := c.Size(), uint64(1); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add maxSize-1 more elements. d should be oldest element. + for i := 0; i < maxSize-1; i++ { + c.Add(NewNegativeDirent("")) + } + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // "Bump" d to the front by re-adding it. + c.Add(d) + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add maxSize-1 more elements. d should again be oldest element. + for i := 0; i < maxSize-1; i++ { + c.Add(NewNegativeDirent("")) + } + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c contains d. + if got, want := c.contains(d), true; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Add one more element, which will bump d from the cache. + c.Add(NewNegativeDirent("")) + + // Size is maxSize. + if got, want := c.Size(), uint64(maxSize); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // c does not contain d. + if got, want := c.contains(d), false; got != want { + t.Errorf("c.contains(d) got %v want %v", got, want) + } + + // Invalidating causes size to be 0 and list to be empty. 
+ c.Invalidate() + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + if got, want := c.list.Empty(), true; got != want { + t.Errorf("c.list.Empty() got %v, want %v", got, want) + } + + // Fill cache with maxSize dirents. + for i := 0; i < maxSize; i++ { + c.Add(NewNegativeDirent("")) + } +} + +// TestNilDirentCache tests that a nil cache supports all cache operations, but +// treats them as noop. +func TestNilDirentCache(t *testing.T) { + // Create a nil cache. + var c *DirentCache + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Add. + c.Add(NewNegativeDirent("")) + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Remove. + c.Remove(NewNegativeDirent("")) + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } + + // Call Invalidate. + c.Invalidate() + + // Size is zero. + if got, want := c.Size(), uint64(0); got != want { + t.Errorf("c.Size() got %v, want %v", got, want) + } +} diff --git a/pkg/sentry/fs/dirent_refs_test.go b/pkg/sentry/fs/dirent_refs_test.go new file mode 100644 index 000000000..8ce9ba02d --- /dev/null +++ b/pkg/sentry/fs/dirent_refs_test.go @@ -0,0 +1,417 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +func newMockDirInode(ctx context.Context, cache *DirentCache) *Inode { + return NewMockInode(ctx, NewMockMountSource(cache), StableAttr{Type: Directory}) +} + +func TestWalkPositive(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + root := NewDirent(newMockDirInode(ctx, nil), "root") + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + name := "d" + d, err := root.walk(ctx, root, name, false) + if err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + + if got := root.TestReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, 0) + } + + d.DecRef() + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + if got := d.TestReadRefs(); got != -1 { + t.Fatalf("child name = %q has a ref count of %d, want %d", d.name, got, -1) + } + + root.flush() + + if got := len(root.children); got != 0 { + t.Fatalf("root has %d children, want %d", got, 0) + } +} + +func TestWalkNegative(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. 
+ + ctx := contexttest.Context(t) + root := NewDirent(NewEmptyDir(ctx, nil), "root") + mn := root.Inode.InodeOperations.(*mockInodeOperationsLookupNegative) + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + name := "d" + for i := 0; i < 100; i++ { + _, err := root.walk(ctx, root, name, false) + if err != syscall.ENOENT { + t.Fatalf("root.walk(root, %q) got %v, want %v", name, err, syscall.ENOENT) + } + } + + if got := root.TestReadRefs(); got != 0 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := len(root.children); got != 1 { + t.Fatalf("root has %d children, want %d", got, 1) + } + + w, ok := root.children[name] + if !ok { + t.Fatalf("root wants child at %q", name) + } + + child := w.Get() + if child == nil { + t.Fatalf("root wants to resolve weak reference") + } + + if !child.(*Dirent).IsNegative() { + t.Fatalf("root found positive child at %q, want negative", name) + } + + if got := child.(*Dirent).TestReadRefs(); got != 1 { + t.Fatalf("child has a ref count of %d, want %d", got, 1) + } + + child.DecRef() + + if got := child.(*Dirent).TestReadRefs(); got != 0 { + t.Fatalf("child has a ref count of %d, want %d", got, 0) + } + + if got := len(root.children); got != 1 { + t.Fatalf("root has %d children, want %d", got, 1) + } + + root.DecRef() + + if got := root.TestReadRefs(); got != -1 { + t.Fatalf("root has a ref count of %d, want %d", got, 0) + } + + AsyncBarrier() + + if got := mn.releaseCalled; got != true { + t.Fatalf("root.Close was called %v, want true", got) + } +} + +type mockInodeOperationsLookupNegative struct { + *MockInodeOperations + releaseCalled bool +} + +func NewEmptyDir(ctx context.Context, cache *DirentCache) *Inode { + m := NewMockMountSource(cache) + return NewInode(&mockInodeOperationsLookupNegative{ + MockInodeOperations: NewMockInodeOperations(ctx), + }, m, StableAttr{Type: Directory}) +} + +func (m *mockInodeOperationsLookupNegative) Lookup(ctx context.Context, dir *Inode, p string) (*Dirent, error) { + return NewNegativeDirent(p), nil +} + +func (m *mockInodeOperationsLookupNegative) Release(context.Context) { + m.releaseCalled = true +} + +func TestHashNegativeToPositive(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. 
+ + ctx := contexttest.Context(t) + root := NewDirent(NewEmptyDir(ctx, nil), "root") + + name := "d" + _, err := root.walk(ctx, root, name, false) + if err != syscall.ENOENT { + t.Fatalf("root.walk(root, %q) got %v, want %v", name, err, syscall.ENOENT) + } + + if got := root.exists(ctx, root, name); got != false { + t.Fatalf("got %q exists, want does not exist", name) + } + + f, err := root.Create(ctx, root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(%q, _), got error %v, want nil", name, err) + } + d := f.Dirent + + if d.IsNegative() { + t.Fatalf("got negative Dirent, want positive") + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("child %q has a ref count of %d, want %d", name, got, 0) + } + + if got := root.TestReadRefs(); got != 1 { + t.Fatalf("root has a ref count of %d, want %d", got, 1) + } + + if got := len(root.children); got != 1 { + t.Fatalf("got %d children, want %d", got, 1) + } + + w, ok := root.children[name] + if !ok { + t.Fatalf("failed to find weak reference to %q", name) + } + + child := w.Get() + if child == nil { + t.Fatalf("want to resolve weak reference") + } + + if child.(*Dirent) != d { + t.Fatalf("got foreign child") + } +} + +func TestRevalidate(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // Whether to make negative Dirents. + makeNegative bool + }{ + { + desc: "Revalidate negative Dirent", + makeNegative: true, + }, + { + desc: "Revalidate positive Dirent", + makeNegative: false, + }, + } { + t.Run(test.desc, func(t *testing.T) { + root := NewDirent(NewMockInodeRevalidate(ctx, test.makeNegative), "root") + + name := "d" + d1, err := root.walk(ctx, root, name, false) + if !test.makeNegative && err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + d2, err := root.walk(ctx, root, name, false) + if !test.makeNegative && err != nil { + t.Fatalf("root.walk(root, %q) got %v, want nil", name, err) + } + if !test.makeNegative && d1 == d2 { + t.Fatalf("revalidating walk got same *Dirent, want different") + } + if got := len(root.children); got != 1 { + t.Errorf("revalidating walk got %d children, want %d", got, 1) + } + }) + } +} + +type MockInodeOperationsRevalidate struct { + *MockInodeOperations + makeNegative bool +} + +func NewMockInodeRevalidate(ctx context.Context, makeNegative bool) *Inode { + mn := NewMockInodeOperations(ctx) + m := NewMockMountSource(nil) + m.MountSourceOperations.(*MockMountSourceOps).revalidate = true + return NewInode(&MockInodeOperationsRevalidate{MockInodeOperations: mn, makeNegative: makeNegative}, m, StableAttr{Type: Directory}) +} + +func (m *MockInodeOperationsRevalidate) Lookup(ctx context.Context, dir *Inode, p string) (*Dirent, error) { + if !m.makeNegative { + return m.MockInodeOperations.Lookup(ctx, dir, p) + } + return NewNegativeDirent(p), nil +} + +func TestCreateExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // root is the Dirent to create from. + root *Dirent + + // expected references on walked Dirent. 
+ refs int64 + }{ + { + desc: "Create caching", + root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), + refs: 1, + }, + { + desc: "Create not caching", + root: NewDirent(NewEmptyDir(ctx, nil), "root"), + refs: 0, + }, + } { + t.Run(test.desc, func(t *testing.T) { + name := "d" + f, err := test.root.Create(ctx, test.root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(root, %q) failed: %v", name, err) + } + d := f.Dirent + + if got := d.TestReadRefs(); got != test.refs { + t.Errorf("dirent has a ref count of %d, want %d", got, test.refs) + } + }) + } +} + +func TestRemoveExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // root is the Dirent to make and remove from. + root *Dirent + }{ + { + desc: "Remove caching", + root: NewDirent(NewEmptyDir(ctx, NewDirentCache(1)), "root"), + }, + { + desc: "Remove not caching", + root: NewDirent(NewEmptyDir(ctx, nil), "root"), + }, + } { + t.Run(test.desc, func(t *testing.T) { + name := "d" + f, err := test.root.Create(ctx, test.root, name, FileFlags{}, FilePermissions{}) + if err != nil { + t.Fatalf("root.Create(%q, _) failed: %v", name, err) + } + d := f.Dirent + + if err := test.root.Remove(contexttest.Context(t), test.root, name); err != nil { + t.Fatalf("root.Remove(root, %q) failed: %v", name, err) + } + + if got := d.TestReadRefs(); got != 0 { + t.Fatalf("dirent has a ref count of %d, want %d", got, 0) + } + + d.DecRef() + + test.root.flush() + + if got := len(test.root.children); got != 0 { + t.Errorf("root has %d children, want %d", got, 0) + } + }) + } +} + +func TestRenameExtraRefs(t *testing.T) { + // refs == 0 -> one reference. + // refs == -1 -> has been destroyed. + + ctx := contexttest.Context(t) + for _, test := range []struct { + // desc is the test's description. + desc string + + // cache of extra Dirent references, may be nil. + cache *DirentCache + }{ + { + desc: "Rename no caching", + cache: nil, + }, + { + desc: "Rename caching", + cache: NewDirentCache(5), + }, + } { + t.Run(test.desc, func(t *testing.T) { + dirAttr := StableAttr{Type: Directory} + + oldParent := NewDirent(NewMockInode(ctx, NewMockMountSource(test.cache), dirAttr), "old_parent") + newParent := NewDirent(NewMockInode(ctx, NewMockMountSource(test.cache), dirAttr), "new_parent") + + renamed, err := oldParent.Walk(ctx, oldParent, "old_child") + if err != nil { + t.Fatalf("Walk(oldParent, %q) got error %v, want nil", "old_child", err) + } + replaced, err := newParent.Walk(ctx, oldParent, "new_child") + if err != nil { + t.Fatalf("Walk(newParent, %q) got error %v, want nil", "new_child", err) + } + + if err := Rename(contexttest.RootContext(t), oldParent /*root */, oldParent, "old_child", newParent, "new_child"); err != nil { + t.Fatalf("Rename got error %v, want nil", err) + } + + oldParent.flush() + newParent.flush() + + // Expect to have only active references. + if got := renamed.TestReadRefs(); got != 0 { + t.Errorf("renamed has ref count %d, want only active references %d", got, 0) + } + if got := replaced.TestReadRefs(); got != 0 { + t.Errorf("replaced has ref count %d, want only active references %d", got, 0) + } + }) + } +} diff --git a/pkg/sentry/fs/dirent_state.go b/pkg/sentry/fs/dirent_state.go new file mode 100644 index 000000000..c6a1b5e38 --- /dev/null +++ b/pkg/sentry/fs/dirent_state.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync/atomic" +) + +// beforeSave is invoked by stateify. +func (d *Dirent) beforeSave() { + // Refuse to save if the file has already been deleted (but still has + // open fds, which is why the Dirent is still accessible). We know the + // the restore opening of the file will always fail. This condition will + // last until all the open fds and this Dirent are closed and released. + // + // Note that this is rejection rather than failure---it would be + // perfectly OK to save---we are simply disallowing it here to prevent + // generating non-restorable state dumps. As the program continues its + // execution, it may become allowed to save again. + if atomic.LoadInt32(&d.deleted) != 0 { + n, _ := d.FullName(nil /* root */) + panic(ErrSaveRejection{fmt.Errorf("deleted file %q still has open fds", n)}) + } +} + +// afterLoad is invoked by stateify. +func (d *Dirent) afterLoad() { + if d.userVisible { + allDirents.add(d) + } +} diff --git a/pkg/sentry/fs/fdpipe/BUILD b/pkg/sentry/fs/fdpipe/BUILD new file mode 100644 index 000000000..9e1f65d3e --- /dev/null +++ b/pkg/sentry/fs/fdpipe/BUILD @@ -0,0 +1,76 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "pipe_state", + srcs = [ + "pipe.go", + "pipe_state.go", + ], + out = "pipe_autogen_state.go", + imports = ["gvisor.googlesource.com/gvisor/pkg/sentry/fs"], + package = "fdpipe", +) + +go_library( + name = "fdpipe", + srcs = [ + "pipe.go", + "pipe_autogen_state.go", + "pipe_opener.go", + "pipe_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/metric", + "//pkg/p9", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) + +go_test( + name = "fdpipe_test", + size = "small", + srcs = [ + "pipe_opener_test.go", + "pipe_test.go", + ], + embed = [":fdpipe"], + deps = [ + "//pkg/fd", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/usermem", + "//pkg/syserror", + "//pkg/waiter/fdnotifier", + "@com_github_google_uuid//:go_default_library", + ], +) diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go new file mode 100644 index 000000000..f7bbd4aff --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe.go @@ -0,0 +1,167 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fdpipe implements common namedpipe opening and accessing logic. +package fdpipe + +import ( + "os" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// pipeOperations are the fs.FileOperations of a host pipe. +type pipeOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + waiter.Queue `state:"nosave"` + + // flags are the flags used to open the pipe. + flags fs.FileFlags `state:".(fs.FileFlags)"` + + // opener is how the pipe was opened. + opener NonBlockingOpener `state:"wait"` + + // file represents the host pipe. + file *fd.FD `state:"nosave"` + + // mu protects readAheadBuffer access below. + mu sync.Mutex `state:"nosave"` + + // readAheadBuffer contains read bytes that have not yet been read + // by the application but need to be buffered for save-restore for correct + // opening semantics. The readAheadBuffer will only be non-empty when the + // is first opened and will be drained by subsequent reads on the pipe. + readAheadBuffer []byte +} + +// newPipeOperations returns an implementation of fs.FileOperations for a pipe. +func newPipeOperations(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags, file *fd.FD, readAheadBuffer []byte) (*pipeOperations, error) { + pipeOps := &pipeOperations{ + flags: flags, + opener: opener, + file: file, + readAheadBuffer: readAheadBuffer, + } + if err := pipeOps.init(); err != nil { + return nil, err + } + return pipeOps, nil +} + +// init initializes p.file. +func (p *pipeOperations) init() error { + var s syscall.Stat_t + if err := syscall.Fstat(p.file.FD(), &s); err != nil { + log.Warningf("pipe: cannot stat fd %d: %v", p.file.FD(), err) + return syscall.EINVAL + } + if s.Mode&syscall.S_IFIFO != syscall.S_IFIFO { + log.Warningf("pipe: cannot load fd %d as pipe, file type: %o", p.file.FD(), s.Mode) + return syscall.EINVAL + } + if err := syscall.SetNonblock(p.file.FD(), true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(p.file.FD()), &p.Queue); err != nil { + return err + } + return nil +} + +// EventRegister implements waiter.Waitable.EventRegister. 
+func (p *pipeOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + p.Queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(p.file.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (p *pipeOperations) EventUnregister(e *waiter.Entry) { + p.Queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(p.file.FD())) +} + +// Readiness returns a mask of ready events for stream. +func (p *pipeOperations) Readiness(mask waiter.EventMask) (eventMask waiter.EventMask) { + return fdnotifier.NonBlockingPoll(int32(p.file.FD()), mask) +} + +// Release implements fs.FileOperations.Release. +func (p *pipeOperations) Release() { + fdnotifier.RemoveFD(int32(p.file.FD())) + p.file.Close() + p.file = nil +} + +// Read implements fs.FileOperations.Read. +func (p *pipeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Drain the read ahead buffer, if it contains anything first. + var bufN int + var bufErr error + p.mu.Lock() + if len(p.readAheadBuffer) > 0 { + bufN, bufErr = dst.CopyOut(ctx, p.readAheadBuffer) + p.readAheadBuffer = p.readAheadBuffer[bufN:] + dst = dst.DropFirst(bufN) + } + p.mu.Unlock() + if dst.NumBytes() == 0 || bufErr != nil { + return int64(bufN), bufErr + } + + // Pipes expect full reads. + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{secio.FullReader{p.file}}) + total := int64(bufN) + n + if err != nil && isBlockError(err) { + return total, syserror.ErrWouldBlock + } + return total, err +} + +// Write implements fs.FileOperations.Write. +func (p *pipeOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{p.file}) + if err != nil && isBlockError(err) { + return n, syserror.ErrWouldBlock + } + return n, err +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} diff --git a/pkg/sentry/fs/fdpipe/pipe_opener.go b/pkg/sentry/fs/fdpipe/pipe_opener.go new file mode 100644 index 000000000..a0d59575f --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_opener.go @@ -0,0 +1,193 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "io" + "os" + "syscall" + "time" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// NonBlockingOpener is a generic host file opener used to retry opening host +// pipes if necessary. +type NonBlockingOpener interface { + // NonBlockingOpen tries to open a host pipe in a non-blocking way, + // and otherwise returns an error. 
Implementations should be idempotent. + NonBlockingOpen(context.Context, fs.PermMask) (*fd.FD, error) +} + +// Open blocks until a host pipe can be opened or the action was cancelled. +// On success, returns fs.FileOperations wrapping the opened host pipe. +func Open(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (fs.FileOperations, error) { + p := &pipeOpenState{} + canceled := false + for { + if file, err := p.TryOpen(ctx, opener, flags); err != syserror.ErrWouldBlock { + return file, err + } + + // Honor the cancellation request if open still blocks. + if canceled { + // If we were canceled but we have a handle to a host + // file, we need to close it. + if p.hostFile != nil { + p.hostFile.Close() + } + return nil, syserror.ErrInterrupted + } + + cancel := ctx.SleepStart() + select { + case <-cancel: + // The cancellation request received here really says + // "cancel from now on (or ASAP)". Any environmental + // changes happened before receiving it, that might have + // caused open to not block anymore, should still be + // respected. So we cannot just return here. We have to + // give open another try below first. + canceled = true + ctx.SleepFinish(false) + case <-time.After(100 * time.Millisecond): + // If we would block, then delay retrying for a bit, since there + // is no way to know when the pipe would be ready to be + // re-opened. This is identical to sending an event notification + // to stop blocking in Task.Block, given that this routine will + // stop retrying if a cancelation is received. + ctx.SleepFinish(true) + } + } +} + +// pipeOpenState holds state needed to open a blocking named pipe read only, for instance the +// file that has been opened but doesn't yet have a corresponding writer. +type pipeOpenState struct { + // hostFile is the read only named pipe which lacks a corresponding writer. + hostFile *fd.FD +} + +// unwrapError is needed to match against ENXIO primarily. +func unwrapError(err error) error { + if pe, ok := err.(*os.PathError); ok { + return pe.Err + } + return err +} + +// TryOpen uses a NonBlockingOpener to try to open a host pipe, respecting the fs.FileFlags. +func (p *pipeOpenState) TryOpen(ctx context.Context, opener NonBlockingOpener, flags fs.FileFlags) (*pipeOperations, error) { + switch { + // Reject invalid configurations so they don't accidently succeed below. + case !flags.Read && !flags.Write: + return nil, syscall.EINVAL + + // Handle opening RDWR or with O_NONBLOCK: will never block, so try only once. + case (flags.Read && flags.Write) || flags.NonBlocking: + f, err := opener.NonBlockingOpen(ctx, fs.PermMask{Read: flags.Read, Write: flags.Write}) + if err != nil { + return nil, err + } + return newPipeOperations(ctx, opener, flags, f, nil) + + // Handle opening O_WRONLY blocking: convert ENXIO to syserror.ErrWouldBlock. + // See TryOpenWriteOnly for more details. + case flags.Write: + return p.TryOpenWriteOnly(ctx, opener) + + default: + // Handle opening O_RDONLY blocking: convert EOF from read to syserror.ErrWouldBlock. + // See TryOpenReadOnly for more details. + return p.TryOpenReadOnly(ctx, opener) + } +} + +// TryOpenReadOnly tries to open a host pipe read only but only returns a fs.File when +// there is a coordinating writer. Call TryOpenReadOnly repeatedly on the same pipeOpenState +// until syserror.ErrWouldBlock is no longer returned. +// +// How it works: +// +// Opening a pipe read only will return no error, but each non zero Read will return EOF +// until a writer becomes available, then EWOULDBLOCK. 
This is the only state change +// available to us. We keep a read ahead buffer in case we read bytes instead of getting +// EWOULDBLOCK, to be read from on the first read request to this fs.File. +func (p *pipeOpenState) TryOpenReadOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { + // Waiting for a blocking read only open involves reading from the host pipe until + // bytes or other writers are available, so instead of retrying opening the pipe, + // it's necessary to retry reading from the pipe. To do this we need to keep around + // the read only pipe we opened, until success or an irrecoverable read error (at + // which point it must be closed). + if p.hostFile == nil { + var err error + p.hostFile, err = opener.NonBlockingOpen(ctx, fs.PermMask{Read: true}) + if err != nil { + return nil, err + } + } + + // Try to read from the pipe to see if writers are around. + tryReadBuffer := make([]byte, 1) + n, rerr := p.hostFile.Read(tryReadBuffer) + + // No bytes were read. + if n == 0 { + // EOF means that we're not ready yet. + if rerr == nil || rerr == io.EOF { + return nil, syserror.ErrWouldBlock + } + // Any error that is not EWOULDBLOCK also means we're not + // ready yet, and probably never will be ready. In this + // case we need to close the host pipe we opened. + if unwrapError(rerr) != syscall.EWOULDBLOCK { + p.hostFile.Close() + return nil, rerr + } + } + + // If any bytes were read, no matter the corresponding error, we need + // to keep them around so they can be read by the application. + var readAheadBuffer []byte + if n > 0 { + readAheadBuffer = tryReadBuffer + } + + // Successfully opened read only blocking pipe with either bytes available + // to read and/or a writer available. + return newPipeOperations(ctx, opener, fs.FileFlags{Read: true}, p.hostFile, readAheadBuffer) +} + +// TryOpenWriteOnly tries to open a host pipe write only but only returns a fs.File when +// there is a coordinating reader. Call TryOpenWriteOnly repeatedly on the same pipeOpenState +// until syserror.ErrWouldBlock is no longer returned. +// +// How it works: +// +// Opening a pipe write only will return ENXIO until readers are available. Converts the ENXIO +// to an syserror.ErrWouldBlock, to tell callers to retry. +func (*pipeOpenState) TryOpenWriteOnly(ctx context.Context, opener NonBlockingOpener) (*pipeOperations, error) { + hostFile, err := opener.NonBlockingOpen(ctx, fs.PermMask{Write: true}) + if unwrapError(err) == syscall.ENXIO { + return nil, syserror.ErrWouldBlock + } + if err != nil { + return nil, err + } + return newPipeOperations(ctx, opener, fs.FileFlags{Write: true}, hostFile, nil) +} diff --git a/pkg/sentry/fs/fdpipe/pipe_opener_test.go b/pkg/sentry/fs/fdpipe/pipe_opener_test.go new file mode 100644 index 000000000..83f6c1986 --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_opener_test.go @@ -0,0 +1,522 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
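For illustration, a minimal standalone sketch of the errno unwrapping that unwrapError and isBlockError rely on: host I/O errors surface as *os.PathError, so the raw errno has to be extracted before it can be compared against ENXIO (no reader yet) or EWOULDBLOCK. The sketch uses only the standard library; the helper name unwrapErrno and the FIFO path are illustrative and not part of this package.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"syscall"
)

// unwrapErrno extracts the underlying errno from an *os.PathError, mirroring
// the unwrapError helper in pipe_opener.go.
func unwrapErrno(err error) error {
	if pe, ok := err.(*os.PathError); ok {
		return pe.Err
	}
	return err
}

func main() {
	name := filepath.Join(os.TempDir(), "example-fifo")
	if err := syscall.Mknod(name, syscall.S_IFIFO|0666, 0); err != nil {
		fmt.Printf("mknod: %v\n", err)
		return
	}
	defer os.Remove(name)

	// With no reader attached, a non-blocking write-only open of a FIFO
	// fails with ENXIO; a blocking open path translates that into "retry
	// later" (syserror.ErrWouldBlock above).
	f, err := os.OpenFile(name, os.O_WRONLY|syscall.O_NONBLOCK, 0)
	if unwrapErrno(err) == syscall.ENXIO {
		fmt.Println("no reader yet; retry the open later")
		return
	}
	if err != nil {
		fmt.Printf("unexpected error: %v\n", err)
		return
	}
	f.Close()
	fmt.Println("open succeeded, so a reader must already exist")
}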
+ +package fdpipe + +import ( + "bytes" + "fmt" + "io" + "os" + "path" + "syscall" + "testing" + "time" + + "github.com/google/uuid" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +type hostOpener struct { + name string +} + +func (h *hostOpener) NonBlockingOpen(_ context.Context, p fs.PermMask) (*fd.FD, error) { + var flags int + switch { + case p.Read && p.Write: + flags = syscall.O_RDWR + case p.Write: + flags = syscall.O_WRONLY + case p.Read: + flags = syscall.O_RDONLY + default: + return nil, syscall.EINVAL + } + f, err := syscall.Open(h.name, flags|syscall.O_NONBLOCK, 0666) + if err != nil { + return nil, err + } + return fd.New(f), nil +} + +func pipename() string { + return fmt.Sprintf(path.Join(os.TempDir(), "test-named-pipe-%s"), uuid.New()) +} + +func mkpipe(name string) error { + return syscall.Mknod(name, syscall.S_IFIFO|0666, 0) +} + +func TestTryOpen(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // makePipe is true if the test case should create the pipe. + makePipe bool + + // flags are the fs.FileFlags used to open the pipe. + flags fs.FileFlags + + // expectFile is true if a fs.File is expected. + expectFile bool + + // err is the expected error + err error + }{ + { + desc: "FileFlags lacking Read and Write are invalid", + makePipe: false, + flags: fs.FileFlags{}, /* bogus */ + expectFile: false, + err: syscall.EINVAL, + }, + { + desc: "NonBlocking Read only error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "NonBlocking Read only success returns immediately", + makePipe: true, + flags: fs.FileFlags{Read: true, NonBlocking: true}, + expectFile: true, + err: nil, + }, + { + desc: "NonBlocking Write only error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Write: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "NonBlocking Write only no reader error returns immediately", + makePipe: true, + flags: fs.FileFlags{Write: true, NonBlocking: true}, + expectFile: false, + err: syscall.ENXIO, + }, + { + desc: "ReadWrite error returns immediately", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true, Write: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "ReadWrite returns immediately", + makePipe: true, + flags: fs.FileFlags{Read: true, Write: true}, + expectFile: true, + err: nil, + }, + { + desc: "Blocking Write only returns open error", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Write: true}, + expectFile: false, + err: syscall.ENOENT, /* from bogus perms */ + }, + { + desc: "Blocking Read only returns open error", + makePipe: false, /* causes the error */ + flags: fs.FileFlags{Read: true}, + expectFile: false, + err: syscall.ENOENT, + }, + { + desc: "Blocking Write only returns with syserror.ErrWouldBlock", + makePipe: true, + flags: fs.FileFlags{Write: true}, + expectFile: false, + err: syserror.ErrWouldBlock, + }, + { + desc: "Blocking Read only returns with syserror.ErrWouldBlock", + makePipe: true, + flags: fs.FileFlags{Read: true}, + expectFile: false, + err: 
syserror.ErrWouldBlock, + }, + } { + name := pipename() + if test.makePipe { + // Create the pipe. We do this per-test case to keep tests independent. + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + } + + // Use a host opener to keep things simple. + opener := &hostOpener{name: name} + + pipeOpenState := &pipeOpenState{} + ctx := contexttest.Context(t) + pipeOps, err := pipeOpenState.TryOpen(ctx, opener, test.flags) + if unwrapError(err) != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + if pipeOps != nil { + // Cleanup the state of the pipe, and remove the fd from the + // fdnotifier. Sadly this needed to maintain the correctness + // of other tests because the fdnotifier is global. + pipeOps.Release() + } + continue + } + if (pipeOps != nil) != test.expectFile { + t.Errorf("%s: got non-nil file %v, want %v", test.desc, pipeOps != nil, test.expectFile) + } + if pipeOps != nil { + // Same as above. + pipeOps.Release() + } + } +} + +func TestPipeOpenUnblocksEventually(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // partnerIsReader is true if the goroutine opening the same pipe as the test case + // should open the pipe read only. Otherwise write only. This also means that the + // test case will open the pipe in the opposite way. + partnerIsReader bool + + // partnerIsBlocking is true if the goroutine opening the same pipe as the test case + // should do so without the O_NONBLOCK flag, otherwise opens the pipe with O_NONBLOCK + // until ENXIO is not returned. + partnerIsBlocking bool + }{ + { + desc: "Blocking Read with blocking writer partner opens eventually", + partnerIsReader: false, + partnerIsBlocking: true, + }, + { + desc: "Blocking Write with blocking reader partner opens eventually", + partnerIsReader: true, + partnerIsBlocking: true, + }, + { + desc: "Blocking Read with non-blocking writer partner opens eventually", + partnerIsReader: false, + partnerIsBlocking: false, + }, + { + desc: "Blocking Write with non-blocking reader partner opens eventually", + partnerIsReader: true, + partnerIsBlocking: false, + }, + } { + // Create the pipe. We do this per-test case to keep tests independent. + name := pipename() + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + + // Spawn the partner. + type fderr struct { + fd int + err error + } + errch := make(chan fderr, 1) + go func() { + var flags int + if test.partnerIsReader { + flags = syscall.O_RDONLY + } else { + flags = syscall.O_WRONLY + } + if test.partnerIsBlocking { + fd, err := syscall.Open(name, flags, 0666) + errch <- fderr{fd: fd, err: err} + } else { + var fd int + err := error(syscall.ENXIO) + for err == syscall.ENXIO { + fd, err = syscall.Open(name, flags|syscall.O_NONBLOCK, 0666) + time.Sleep(1 * time.Second) + } + errch <- fderr{fd: fd, err: err} + } + }() + + // Setup file flags for either a read only or write only open. + flags := fs.FileFlags{ + Read: !test.partnerIsReader, + Write: test.partnerIsReader, + } + + // Open the pipe in a blocking way, which should succeed eventually. + opener := &hostOpener{name: name} + ctx := contexttest.Context(t) + pipeOps, err := Open(ctx, opener, flags) + if pipeOps != nil { + // Same as TestTryOpen. + pipeOps.Release() + } + + // Check that the partner opened the file successfully. 
+ e := <-errch + if e.err != nil { + t.Errorf("%s: partner got error %v, wanted nil", test.desc, e.err) + continue + } + // If so, then close the partner fd to avoid leaking an fd. + syscall.Close(e.fd) + + // Check that our blocking open was successful. + if err != nil { + t.Errorf("%s: blocking open got error %v, wanted nil", test.desc, err) + continue + } + if pipeOps == nil { + t.Errorf("%s: blocking open got nil file, wanted non-nil", test.desc) + continue + } + } +} + +func TestCopiedReadAheadBuffer(t *testing.T) { + // Create the pipe. + name := pipename() + if err := mkpipe(name); err != nil { + t.Fatalf("failed to make host pipe: %v", err) + } + defer syscall.Unlink(name) + + // We're taking advantage of the fact that pipes opened read only always return + // success, but internally they are not deemed "opened" until we're sure that + // another writer comes along. This means we can open the same pipe write only + // with no problems + write to it, given that opener.Open already tried to open + // the pipe RDONLY and succeeded, which we know happened if TryOpen returns + // syserror.ErrwouldBlock. + // + // This simulates the open(RDONLY) <-> open(WRONLY)+write race we care about, but + // does not cause our test to be racy (which would be terrible). + opener := &hostOpener{name: name} + pipeOpenState := &pipeOpenState{} + ctx := contexttest.Context(t) + pipeOps, err := pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) + if pipeOps != nil { + pipeOps.Release() + t.Fatalf("open(%s, %o) got file, want nil", name, syscall.O_RDONLY) + } + if err != syserror.ErrWouldBlock { + t.Fatalf("open(%s, %o) got error %v, want %v", name, syscall.O_RDONLY, err, syserror.ErrWouldBlock) + } + + // Then open the same pipe write only and write some bytes to it. The next + // time we try to open the pipe read only again via the pipeOpenState, we should + // succeed and buffer some of the bytes written. + fd, err := syscall.Open(name, syscall.O_WRONLY, 0666) + if err != nil { + t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_WRONLY, err) + } + defer syscall.Close(fd) + + data := []byte("hello") + if n, err := syscall.Write(fd, data); n != len(data) || err != nil { + t.Fatalf("write(%v) got (%d, %v), want (%d, nil)", data, n, err, len(data)) + } + + // Try the read again, knowing that it should succeed this time. + pipeOps, err = pipeOpenState.TryOpen(ctx, opener, fs.FileFlags{Read: true}) + if pipeOps == nil { + t.Fatalf("open(%s, %o) got nil file, want not nil", name, syscall.O_RDONLY) + } + defer pipeOps.Release() + + if err != nil { + t.Fatalf("open(%s, %o) got error %v, want nil", name, syscall.O_RDONLY, err) + } + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, pipeOps) + + // Check that the file we opened points to a pipe with a non-empty read ahead buffer. + bufsize := len(pipeOps.readAheadBuffer) + if bufsize != 1 { + t.Fatalf("read ahead buffer got %d bytes, want %d", bufsize, 1) + } + + // Now for the final test, try to read everything in, expecting to get back all of + // the bytes that were written at once. Note that in the wild there is no atomic + // read size so expecting to get all bytes from a single writer when there are + // multiple readers is a bad expectation. 
+ buf := make([]byte, len(data)) + ioseq := usermem.BytesIOSequence(buf) + n, err := pipeOps.Read(ctx, file, ioseq, 0) + if err != nil { + t.Fatalf("read request got error %v, want nil", err) + } + if n != int64(len(data)) { + t.Fatalf("read request got %d bytes, want %d", n, len(data)) + } + if !bytes.Equal(buf, data) { + t.Errorf("read request got bytes [%v], want [%v]", buf, data) + } +} + +func TestPipeHangup(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // flags control how we open our end of the pipe and must be read + // only or write only. They also dicate how a coordinating partner + // fd is opened, which is their inverse (read only -> write only, etc). + flags fs.FileFlags + + // hangupSelf if true causes the test case to close our end of the pipe + // and causes hangup errors to be asserted on our coordinating partner's + // fd. If hangupSelf is false, then our partner's fd is closed and the + // hangup errors are expected on our end of the pipe. + hangupSelf bool + }{ + { + desc: "Read only gets hangup error", + flags: fs.FileFlags{Read: true}, + }, + { + desc: "Write only gets hangup error", + flags: fs.FileFlags{Write: true}, + }, + { + desc: "Read only generates hangup error", + flags: fs.FileFlags{Read: true}, + hangupSelf: true, + }, + { + desc: "Write only generates hangup error", + flags: fs.FileFlags{Write: true}, + hangupSelf: true, + }, + } { + if test.flags.Read == test.flags.Write { + t.Errorf("%s: test requires a single reader or writer", test.desc) + continue + } + + // Create the pipe. We do this per-test case to keep tests independent. + name := pipename() + if err := mkpipe(name); err != nil { + t.Errorf("%s: failed to make host pipe: %v", test.desc, err) + continue + } + defer syscall.Unlink(name) + + // Fire off a partner routine which tries to open the same pipe blocking, + // which will synchronize with us. The channel allows us to get back the + // fd once we expect this partner routine to succeed, so we can manifest + // hangup events more directly. + fdchan := make(chan int, 1) + go func() { + // Be explicit about the flags to protect the test from + // misconfiguration. + var flags int + if test.flags.Read { + flags = syscall.O_WRONLY + } else { + flags = syscall.O_RDONLY + } + fd, err := syscall.Open(name, flags, 0666) + if err != nil { + t.Logf("Open(%q, %o, 0666) partner failed: %v", name, flags, err) + } + fdchan <- fd + }() + + // Open our end in a blocking way to ensure that we coordinate. + opener := &hostOpener{name: name} + ctx := contexttest.Context(t) + pipeOps, err := Open(ctx, opener, test.flags) + if err != nil { + t.Errorf("%s: Open got error %v, want nil", test.desc, err) + continue + } + // Don't defer file.DecRef here because that causes the hangup we're + // trying to test for. + + // Expect the partner routine to have coordinated with us and get back + // its open fd. + f := <-fdchan + if f < 0 { + t.Errorf("%s: partner routine got fd %d, want > 0", test.desc, f) + pipeOps.Release() + continue + } + + if test.hangupSelf { + // Hangup self and assert that our partner got the expected hangup + // error. + pipeOps.Release() + + if test.flags.Read { + // Partner is writer. + assertWriterHungup(t, test.desc, fd.NewReadWriter(f)) + } else { + // Partner is reader. + assertReaderHungup(t, test.desc, fd.NewReadWriter(f)) + } + } else { + // Hangup our partner and expect us to get the hangup error. 
+ syscall.Close(f) + defer pipeOps.Release() + + if test.flags.Read { + assertReaderHungup(t, test.desc, pipeOps.(*pipeOperations).file) + } else { + assertWriterHungup(t, test.desc, pipeOps.(*pipeOperations).file) + } + } + } +} + +func assertReaderHungup(t *testing.T, desc string, reader io.Reader) bool { + // Drain the pipe completely, it might have crap in it, but expect EOF eventually. + var err error + for err == nil { + _, err = reader.Read(make([]byte, 10)) + } + if err != io.EOF { + t.Errorf("%s: read from self after hangup got error %v, want %v", desc, err, io.EOF) + return false + } + return true +} + +func assertWriterHungup(t *testing.T, desc string, writer io.Writer) bool { + if _, err := writer.Write([]byte("hello")); unwrapError(err) != syscall.EPIPE { + t.Errorf("%s: write to self after hangup got error %v, want %v", desc, err, syscall.EPIPE) + return false + } + return true +} diff --git a/pkg/sentry/fs/fdpipe/pipe_state.go b/pkg/sentry/fs/fdpipe/pipe_state.go new file mode 100644 index 000000000..8996a2178 --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_state.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "fmt" + "io/ioutil" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// beforeSave is invoked by stateify. +func (p *pipeOperations) beforeSave() { + if p.flags.Read { + data, err := ioutil.ReadAll(p.file) + if err != nil && !isBlockError(err) { + panic(fmt.Sprintf("failed to read from pipe: %v", err)) + } + p.readAheadBuffer = append(p.readAheadBuffer, data...) + } else if p.flags.Write { + file, err := p.opener.NonBlockingOpen(context.Background(), fs.PermMask{Write: true}) + if err != nil { + panic(fs.ErrSaveRejection{fmt.Errorf("write-only pipe end cannot be re-opened as %v: %v", p, err)}) + } + file.Close() + } +} + +// saveFlags is invoked by stateify. +func (p *pipeOperations) saveFlags() fs.FileFlags { + return p.flags +} + +// readPipeOperationsLoading is used to ensure that write-only pipe fds are +// opened after read/write and read-only pipe fds, to avoid ENXIO when +// multiple pipe fds refer to different ends of the same pipe. +var readPipeOperationsLoading sync.WaitGroup + +// loadFlags is invoked by stateify. +func (p *pipeOperations) loadFlags(flags fs.FileFlags) { + // This is a hack to ensure that readPipeOperationsLoading includes all + // readable pipe fds before any asynchronous calls to + // readPipeOperationsLoading.Wait(). + if flags.Read { + readPipeOperationsLoading.Add(1) + } + p.flags = flags +} + +// afterLoad is invoked by stateify. 
+func (p *pipeOperations) afterLoad() { + load := func() { + if !p.flags.Read { + readPipeOperationsLoading.Wait() + } else { + defer readPipeOperationsLoading.Done() + } + var err error + p.file, err = p.opener.NonBlockingOpen(context.Background(), fs.PermMask{ + Read: p.flags.Read, + Write: p.flags.Write, + }) + if err != nil { + panic(fmt.Sprintf("unable to open pipe %v: %v", p, err)) + } + if err := p.init(); err != nil { + panic(fmt.Sprintf("unable to initialize pipe %v: %v", p, err)) + } + } + + // Do background opening of pipe ends. Note for write-only pipe ends we + // have to do it asynchronously to avoid blocking the restore. + fs.Async(load) +} diff --git a/pkg/sentry/fs/fdpipe/pipe_test.go b/pkg/sentry/fs/fdpipe/pipe_test.go new file mode 100644 index 000000000..6cd314f5b --- /dev/null +++ b/pkg/sentry/fs/fdpipe/pipe_test.go @@ -0,0 +1,489 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fdpipe + +import ( + "bytes" + "io" + "os" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +func singlePipeFD() (int, error) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + return -1, err + } + syscall.Close(fds[1]) + return fds[0], nil +} + +func singleDirFD() (int, error) { + return syscall.Open(os.TempDir(), syscall.O_RDONLY, 0666) +} + +func mockPipeDirent(t *testing.T) *fs.Dirent { + ctx := contexttest.Context(t) + node := fs.NewMockInodeOperations(ctx) + node.UAttr = fs.UnstableAttr{ + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + } + inode := fs.NewInode(node, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + BlockSize: usermem.PageSize, + }) + return fs.NewDirent(inode, "") +} + +func TestNewPipe(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // getfd generates the fd to pass to newPipeOperations. + getfd func() (int, error) + + // flags are the fs.FileFlags passed to newPipeOperations. + flags fs.FileFlags + + // readAheadBuffer is the buffer passed to newPipeOperations. + readAheadBuffer []byte + + // err is the expected error. 
+ err error + }{ + { + desc: "Cannot make new pipe from bad fd", + getfd: func() (int, error) { return -1, nil }, + err: syscall.EINVAL, + }, + { + desc: "Cannot make new pipe from non-pipe fd", + getfd: singleDirFD, + err: syscall.EINVAL, + }, + { + desc: "Can make new pipe from pipe fd", + getfd: singlePipeFD, + flags: fs.FileFlags{Read: true}, + readAheadBuffer: []byte("hello"), + }, + } { + gfd, err := test.getfd() + if err != nil { + t.Errorf("%s: getfd got (%d, %v), want (fd, nil)", test.desc, gfd, err) + continue + } + f := fd.New(gfd) + + p, err := newPipeOperations(contexttest.Context(t), nil, test.flags, f, test.readAheadBuffer) + if p != nil { + // This is necessary to remove the fd from the global fd notifier. + defer p.Release() + } else { + // If there is no p to DecRef on, because newPipeOperations failed, then the + // file still needs to be closed. + defer f.Close() + } + + if err != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + continue + } + // Check the state of the pipe given that it was successfully opened. + if err == nil { + if p == nil { + t.Errorf("%s: got nil pipe and nil error, want (pipe, nil)", test.desc) + continue + } + if flags := p.flags; test.flags != flags { + t.Errorf("%s: got file flags %s, want %s", test.desc, flags, test.flags) + continue + } + if len(test.readAheadBuffer) != len(p.readAheadBuffer) { + t.Errorf("%s: got read ahead buffer length %d, want %d", test.desc, len(p.readAheadBuffer), len(test.readAheadBuffer)) + continue + } + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(p.file.FD()), syscall.F_GETFL, 0) + if errno != 0 { + t.Errorf("%s: failed to get file flags for fd %d, got %v, want 0", test.desc, p.file.FD(), errno) + continue + } + if fileFlags&syscall.O_NONBLOCK == 0 { + t.Errorf("%s: pipe is blocking, expected non-blocking", test.desc) + continue + } + if !fdnotifier.HasFD(int32(f.FD())) { + t.Errorf("%s: pipe fd %d is not registered for events", test.desc, f.FD) + } + } + } +} + +func TestPipeDestruction(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + f := fd.New(fds[0]) + + // We don't care about the other end, just use the read end. + syscall.Close(fds[1]) + + // Test the read end, but it doesn't really matter which. + p, err := newPipeOperations(contexttest.Context(t), nil, fs.FileFlags{Read: true}, f, nil) + if err != nil { + f.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Drop our only reference, which should trigger the destructor. + p.Release() + + if fdnotifier.HasFD(int32(fds[0])) { + t.Fatalf("after DecRef fdnotifier has fd %d, want no longer registered", fds[0]) + } + if p.file != nil { + t.Errorf("after DecRef got file, want nil") + } +} + +type Seek struct{} + +type ReadDir struct{} + +type Writev struct { + Src usermem.IOSequence +} + +type Readv struct { + Dst usermem.IOSequence +} + +type Fsync struct{} + +func TestPipeRequest(t *testing.T) { + for _, test := range []struct { + // desc is the test's description. + desc string + + // request to execute. + context interface{} + + // flags determines whether to use the read or write end + // of the pipe, for this test it can only be Read or Write. + flags fs.FileFlags + + // keepOpenPartner if false closes the other end of the pipe, + // otherwise this is delayed until the end of the test. 
+ keepOpenPartner bool + + // expected error + err error + }{ + { + desc: "ReadDir on pipe returns ENOTDIR", + context: &ReadDir{}, + err: syscall.ENOTDIR, + }, + { + desc: "Fsync on pipe returns EINVAL", + context: &Fsync{}, + err: syscall.EINVAL, + }, + { + desc: "Seek on pipe returns ESPIPE", + context: &Seek{}, + err: syscall.ESPIPE, + }, + { + desc: "Readv on pipe from empty buffer returns nil", + context: &Readv{Dst: usermem.BytesIOSequence(nil)}, + flags: fs.FileFlags{Read: true}, + }, + { + desc: "Readv on pipe from non-empty buffer and closed partner returns EOF", + context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, + flags: fs.FileFlags{Read: true}, + err: io.EOF, + }, + { + desc: "Readv on pipe from non-empty buffer and open partner returns EWOULDBLOCK", + context: &Readv{Dst: usermem.BytesIOSequence(make([]byte, 10))}, + flags: fs.FileFlags{Read: true}, + keepOpenPartner: true, + err: syserror.ErrWouldBlock, + }, + { + desc: "Writev on pipe from empty buffer returns nil", + context: &Writev{Src: usermem.BytesIOSequence(nil)}, + flags: fs.FileFlags{Write: true}, + }, + { + desc: "Writev on pipe from non-empty buffer and closed partner returns EPIPE", + context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, + flags: fs.FileFlags{Write: true}, + err: syscall.EPIPE, + }, + { + desc: "Writev on pipe from non-empty buffer and open partner succeeds", + context: &Writev{Src: usermem.BytesIOSequence([]byte("hello"))}, + flags: fs.FileFlags{Write: true}, + keepOpenPartner: true, + }, + } { + if test.flags.Read && test.flags.Write { + panic("both read and write not supported for this test") + } + + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Errorf("%s: failed to create pipes: got %v, want nil", test.desc, err) + continue + } + + // Configure the fd and partner fd based on the file flags. + testFd, partnerFd := fds[0], fds[1] + if test.flags.Write { + testFd, partnerFd = fds[1], fds[0] + } + + // Configure closing the fds. + if test.keepOpenPartner { + defer syscall.Close(partnerFd) + } else { + syscall.Close(partnerFd) + } + + // Create the pipe. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, test.flags, fd.New(testFd), nil) + if err != nil { + t.Fatalf("%s: newPipeOperations got error %v, want nil", test.desc, err) + } + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Pipe}) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Issue request via the appropriate function. + switch c := test.context.(type) { + case *Seek: + _, err = p.Seek(ctx, file, 0, 0) + case *ReadDir: + _, err = p.Readdir(ctx, file, nil) + case *Readv: + _, err = p.Read(ctx, file, c.Dst, 0) + case *Writev: + _, err = p.Write(ctx, file, c.Src, 0) + case *Fsync: + err = p.Fsync(ctx, file, 0, fs.FileMaxOffset, fs.SyncAll) + default: + t.Errorf("%s: unknown request type %T", test.desc, test.context) + } + + if unwrapError(err) != test.err { + t.Errorf("%s: got error %v, want %v", test.desc, err, test.err) + } + } +} + +func TestPipeReadAheadBuffer(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + rfile := fd.New(fds[0]) + + // Eventually close the write end, which is not wrapped in a pipe object. + defer syscall.Close(fds[1]) + + // Write some bytes to this end. 
+ data := []byte("world") + if n, err := syscall.Write(fds[1], data); n != len(data) || err != nil { + rfile.Close() + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(data)) + } + // Close the write end immediately, we don't care about it. + + buffered := []byte("hello ") + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, rfile, buffered) + if err != nil { + rfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // In total we expect to read data + buffered. + total := append(buffered, data...) + + buf := make([]byte, len(total)) + iov := usermem.BytesIOSequence(buf) + n, err := p.Read(contexttest.Context(t), file, iov, 0) + if err != nil { + t.Fatalf("read request got error %v, want nil", err) + } + if n != int64(len(total)) { + t.Fatalf("read request got %d bytes, want %d", n, len(total)) + } + if !bytes.Equal(buf, total) { + t.Errorf("read request got bytes [%v], want [%v]", buf, total) + } +} + +// This is very important for pipes in general because they can return EWOULDBLOCK and for +// those that block they must continue until they have read all of the data (and report it +// as such. +func TestPipeReadsAccumulate(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + rfile := fd.New(fds[0]) + + // Eventually close the write end, it doesn't depend on a pipe object. + defer syscall.Close(fds[1]) + + // Get a new read only pipe reference. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Read: true}, rfile, nil) + if err != nil { + rfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Don't forget to remove the fd from the fd notifier. Otherwise other tests will + // likely be borked, because it's global :( + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Write some some bytes to the pipe. + data := []byte("some message") + if n, err := syscall.Write(fds[1], data); n != len(data) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(data)) + } + + // Construct a segment vec that is a bit more than we have written so we trigger + // an EWOULDBLOCK. + wantBytes := len(data) + 1 + readBuffer := make([]byte, wantBytes) + iov := usermem.BytesIOSequence(readBuffer) + n, err := p.Read(ctx, file, iov, 0) + total := n + iov = iov.DropFirst64(n) + if err != syserror.ErrWouldBlock { + t.Fatalf("Readv got error %v, want %v", err, syserror.ErrWouldBlock) + } + + // Write a few more bytes to allow us to read more/accumulate. + extra := []byte("extra") + if n, err := syscall.Write(fds[1], extra); n != len(extra) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(extra)) + } + + // This time, using the same request, we should not block. + n, err = p.Read(ctx, file, iov, 0) + total += n + if err != nil { + t.Fatalf("Readv got error %v, want nil", err) + } + + // Assert that the result we got back is cumulative. 
+ if total != int64(wantBytes) { + t.Fatalf("Readv sequence got %d bytes, want %d", total, wantBytes) + } + + if want := append(data, extra[0]); !bytes.Equal(readBuffer, want) { + t.Errorf("Readv sequence got %v, want %v", readBuffer, want) + } +} + +// Same as TestReadsAccumulate. +func TestPipeWritesAccumulate(t *testing.T) { + fds := make([]int, 2) + if err := syscall.Pipe(fds); err != nil { + t.Fatalf("failed to create pipes: got %v, want nil", err) + } + wfile := fd.New(fds[1]) + + // Eventually close the read end, it doesn't depend on a pipe object. + defer syscall.Close(fds[0]) + + // Get a new write only pipe reference. + ctx := contexttest.Context(t) + p, err := newPipeOperations(ctx, nil, fs.FileFlags{Write: true}, wfile, nil) + if err != nil { + wfile.Close() + t.Fatalf("newPipeOperations got error %v, want nil", err) + } + // Don't forget to remove the fd from the fd notifier. Otherwise other tests will + // likely be borked, because it's global :( + defer p.Release() + + inode := fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{ + Type: fs.Pipe, + }) + file := fs.NewFile(ctx, fs.NewDirent(inode, "pipe"), fs.FileFlags{Read: true}, p) + + // Construct a segment vec that is larger than the pipe size to trigger an EWOULDBLOCK. + wantBytes := 65536 * 2 + writeBuffer := make([]byte, wantBytes) + for i := 0; i < wantBytes; i++ { + writeBuffer[i] = 'a' + } + iov := usermem.BytesIOSequence(writeBuffer) + n, err := p.Write(ctx, file, iov, 0) + total := n + iov = iov.DropFirst64(n) + if err != syserror.ErrWouldBlock { + t.Fatalf("Writev got error %v, want %v", err, syserror.ErrWouldBlock) + } + + // Read the entire pipe buf size to make space for the second half. + throwAway := make([]byte, 65536) + if n, err := syscall.Read(fds[0], throwAway); n != len(throwAway) || err != nil { + t.Fatalf("write to pipe got (%d, %v), want (%d, nil)", n, err, len(throwAway)) + } + + // This time we should not block. + n, err = p.Write(ctx, file, iov, 0) + total += n + if err != nil { + t.Fatalf("Writev got error %v, want nil", err) + } + + // Assert that the result we got back is cumulative. + if total != int64(wantBytes) { + t.Fatalf("Writev sequence got %d bytes, want %d", total, wantBytes) + } +} diff --git a/pkg/sentry/fs/file.go b/pkg/sentry/fs/file.go new file mode 100644 index 000000000..de2e80bf0 --- /dev/null +++ b/pkg/sentry/fs/file.go @@ -0,0 +1,404 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
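The two accumulate tests above boil down to a simple contract: a non-blocking pipe transfers whatever currently fits, reports EWOULDBLOCK for the rest, and the caller must keep the partial count and retry with the remaining buffer. Below is a standalone sketch of that pattern with raw syscalls; it is illustrative only, whereas the sentry code expresses the same idea with usermem.IOSequence and DropFirst64.

package main

import (
	"fmt"
	"syscall"
)

func main() {
	var fds [2]int
	if err := syscall.Pipe(fds[:]); err != nil {
		panic(err)
	}
	defer syscall.Close(fds[0])
	defer syscall.Close(fds[1])
	// Make the read end non-blocking so a short read reports EAGAIN
	// instead of blocking, like the sentry's host pipe fds.
	syscall.SetNonblock(fds[0], true)

	syscall.Write(fds[1], []byte("some message"))

	// Ask for one byte more than is currently buffered, then accumulate
	// partial reads until the request is satisfied.
	buf := make([]byte, len("some message")+1)
	total := 0
	for total < len(buf) {
		n, err := syscall.Read(fds[0], buf[total:])
		if n > 0 {
			total += n
		}
		if err == syscall.EAGAIN {
			// Nothing more right now; write another byte and retry
			// with the remainder of the buffer.
			syscall.Write(fds[1], []byte("!"))
			continue
		}
		if err != nil {
			panic(err)
		}
	}
	fmt.Printf("accumulated %d bytes: %q\n", total, buf)
}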
+ +package fs + +import ( + "math" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/amutex" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/limits" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// FileMaxOffset is the maximum possible file offset. +const FileMaxOffset = math.MaxInt64 + +// File is an open file handle. It is thread-safe. +// +// File provides stronger synchronization guarantees than Linux. Linux +// synchronizes lseek(2), read(2), and write(2) with respect to the file +// offset for regular files and only for those interfaces. See +// fs/read_write.c:fdget_pos, fs.read_write.c:fdput_pos and FMODE_ATOMIC_POS. +// +// In contrast, File synchronizes any operation that could take a long time +// under a single abortable mutex which also synchronizes lseek(2), read(2), +// and write(2). +// +// FIXME: Split synchronization from cancellation. +type File struct { + refs.AtomicRefCount + + // UniqueID is the globally unique identifier of the File. + UniqueID uint64 + + // Dirent is the Dirent backing this File. This encodes the name + // of the File via Dirent.FullName() as well as its identity via the + // Dirent's Inode. The Dirent is non-nil. + // + // A File holds a reference to this Dirent. Using the returned Dirent is + // only safe as long as a reference on the File is held. The association + // between a File and a Dirent is immutable. + // + // Files that are not parented in a filesystem return a root Dirent + // that holds a reference to their Inode. + // + // The name of the Dirent may reflect parentage if the Dirent is not a + // root Dirent or the identity of the File on a pseudo filesystem (pipefs, + // sockfs, etc). + // + // Multiple Files may hold a reference to the same Dirent. This is the + // common case for Files that are parented and maintain consistency with + // other files via the Dirent cache. + Dirent *Dirent + + // flags are the File's flags. Setting or getting flags is fully atomic + // and is not protected by mu (below). + flags atomic.Value `state:".(FileFlags)"` + + // mu is dual-purpose: first, to make read(2) and write(2) thread-safe + // in conformity with POSIX, and second, to cancel operations before they + // begin in response to interruptions (i.e. signals). + mu amutex.AbortableMutex `state:"nosave"` + + // FileOperations implements file system specific behavior for this File. + FileOperations FileOperations + + // offset is the File's offset. Updating offset is protected by mu but + // can be read atomically via File.Offset() outside of mu. + offset int64 +} + +// NewFile returns a File. It takes a reference on the Dirent and owns the +// lifetime of the FileOperations. Files that do not support reading and +// writing at an arbitrary offset should set flags.Pread and flags.Pwrite +// to false respectively. 
+func NewFile(ctx context.Context, dirent *Dirent, flags FileFlags, fops FileOperations) *File { + dirent.IncRef() + f := &File{ + UniqueID: uniqueid.GlobalFromContext(ctx), + Dirent: dirent, + FileOperations: fops, + } + f.flags.Store(flags) + f.mu.Init() + return f +} + +// DecRef destroys the File when it is no longer referenced. +func (f *File) DecRef() { + f.DecRefWithDestructor(func() { + // Drop BSD style locks. + lockRng := lock.LockRange{Start: 0, End: lock.LockEOF} + f.Dirent.Inode.LockCtx.BSD.UnlockRegion(lock.UniqueID(f.UniqueID), lockRng) + + // Release resources held by the FileOperations. + f.FileOperations.Release() + + // Release a reference on the Dirent. + f.Dirent.DecRef() + }) +} + +// Flags atomically loads the File's flags. +func (f *File) Flags() FileFlags { + return f.flags.Load().(FileFlags) +} + +// SetFlags atomically changes the File's flags to the values contained +// in newFlags. See SettableFileFlags for values that can be set. +func (f *File) SetFlags(newFlags SettableFileFlags) { + flags := f.flags.Load().(FileFlags) + flags.Direct = newFlags.Direct + flags.NonBlocking = newFlags.NonBlocking + flags.Append = newFlags.Append + f.flags.Store(flags) +} + +// Offset atomically loads the File's offset. +func (f *File) Offset() int64 { + return atomic.LoadInt64(&f.offset) +} + +// Readiness implements waiter.Waitable.Readiness. +func (f *File) Readiness(mask waiter.EventMask) waiter.EventMask { + return f.FileOperations.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *File) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.FileOperations.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *File) EventUnregister(e *waiter.Entry) { + f.FileOperations.EventUnregister(e) +} + +// Seek calls f.FileOperations.Seek with f as the File, updating the file +// offset to the value returned by f.FileOperations.Seek if the operation +// is successful. +// +// Returns syserror.ErrInterrupted if seeking was interrupted. +func (f *File) Seek(ctx context.Context, whence SeekWhence, offset int64) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + defer f.mu.Unlock() + + newOffset, err := f.FileOperations.Seek(ctx, f, whence, offset) + if err == nil { + atomic.StoreInt64(&f.offset, newOffset) + } + return newOffset, err +} + +// Readdir reads the directory entries of this File and writes them out +// to the DentrySerializer until entries can no longer be written. If even +// a single directory entry is written then Readdir returns a nil error +// and the directory offset is advanced. +// +// Readdir unconditionally updates the access time on the File's Inode, +// see fs/readdir.c:iterate_dir. +// +// Returns syserror.ErrInterrupted if reading was interrupted. +func (f *File) Readdir(ctx context.Context, serializer DentrySerializer) error { + if !f.mu.Lock(ctx) { + return syserror.ErrInterrupted + } + defer f.mu.Unlock() + + offset, err := f.FileOperations.Readdir(ctx, f, serializer) + atomic.StoreInt64(&f.offset, offset) + return err +} + +// Readv calls f.FileOperations.Read with f as the File, advancing the file +// offset if f.FileOperations.Read returns bytes read > 0. +// +// Returns syserror.ErrInterrupted if reading was interrupted. 
+func (f *File) Readv(ctx context.Context, dst usermem.IOSequence) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + + n, err := f.FileOperations.Read(ctx, f, dst, f.offset) + if n > 0 { + atomic.AddInt64(&f.offset, n) + } + f.mu.Unlock() + return n, err +} + +// Preadv calls f.FileOperations.Read with f as the File. It does not +// advance the file offset. If !f.Flags().Pread, Preadv should not be +// called. +// +// Otherwise same as Readv. +func (f *File) Preadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + + n, err := f.FileOperations.Read(ctx, f, dst, offset) + f.mu.Unlock() + return n, err +} + +// Writev calls f.FileOperations.Write with f as the File, advancing the +// file offset if f.FileOperations.Write returns bytes written > 0. +// +// Writev positions the write offset at EOF if f.Flags().Append. This is +// unavoidably racy for network file systems. Writev also truncates src +// to avoid overrunning the current file size limit if necessary. +// +// Returns syserror.ErrInterrupted if writing was interrupted. +func (f *File) Writev(ctx context.Context, src usermem.IOSequence) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + + offset, err := f.checkWriteLocked(ctx, &src, f.offset) + if err != nil { + f.mu.Unlock() + return 0, err + } + n, err := f.FileOperations.Write(ctx, f, src, offset) + if n >= 0 { + atomic.StoreInt64(&f.offset, offset+n) + } + f.mu.Unlock() + return n, err +} + +// Pwritev calls f.FileOperations.Write with f as the File. It does not +// advance the file offset. If !f.Flags().Pwritev, Pwritev should not be +// called. +// +// Otherwise same as Writev. +func (f *File) Pwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if !f.mu.Lock(ctx) { + return 0, syserror.ErrInterrupted + } + + offset, err := f.checkWriteLocked(ctx, &src, offset) + if err != nil { + f.mu.Unlock() + return 0, err + } + n, err := f.FileOperations.Write(ctx, f, src, offset) + f.mu.Unlock() + return n, err +} + +// checkWriteLocked returns the offset to write at or an error if the write +// would not succeed. May update src to fit a write operation into a file +// size limit. +func (f *File) checkWriteLocked(ctx context.Context, src *usermem.IOSequence, offset int64) (int64, error) { + // Handle append only files. Note that this is still racy for network + // filesystems. + if f.Flags().Append { + uattr, err := f.Dirent.Inode.UnstableAttr(ctx) + if err != nil { + // This is an odd error, most likely it is evidence + // that something is terribly wrong with the filesystem. + // Return a generic EIO error. + log.Warningf("Failed to check write of inode %#v: %v", f.Dirent.Inode.StableAttr, err) + return offset, syserror.EIO + } + offset = uattr.Size + } + + // Is this a regular file? + if IsRegular(f.Dirent.Inode.StableAttr) { + // Enforce size limits. + fileSizeLimit := limits.FromContext(ctx).Get(limits.FileSize).Cur + if fileSizeLimit <= math.MaxInt64 { + if offset >= int64(fileSizeLimit) { + return offset, syserror.ErrExceedsFileSizeLimit + } + *src = src.TakeFirst64(int64(fileSizeLimit) - offset) + } + } + + return offset, nil +} + +// Fsync calls f.FileOperations.Fsync with f as the File. +// +// Returns syserror.ErrInterrupted if syncing was interrupted. 
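+// A whole-file sync, for example what an fsync(2) implementation would
+// request, might look like the following sketch:
+//
+//	if err := file.Fsync(ctx, 0, FileMaxOffset, SyncAll); err != nil {
+//		return err
+//	}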
+func (f *File) Fsync(ctx context.Context, start int64, end int64, syncType SyncType) error {
+	if !f.mu.Lock(ctx) {
+		return syserror.ErrInterrupted
+	}
+	defer f.mu.Unlock()
+
+	return f.FileOperations.Fsync(ctx, f, start, end, syncType)
+}
+
+// Flush calls f.FileOperations.Flush with f as the File.
+//
+// Returns syserror.ErrInterrupted if flushing was interrupted.
+func (f *File) Flush(ctx context.Context) error {
+	if !f.mu.Lock(ctx) {
+		return syserror.ErrInterrupted
+	}
+	defer f.mu.Unlock()
+
+	return f.FileOperations.Flush(ctx, f)
+}
+
+// ConfigureMMap calls f.FileOperations.ConfigureMMap with f as the File.
+//
+// Returns syserror.ErrInterrupted if interrupted.
+func (f *File) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	if !f.mu.Lock(ctx) {
+		return syserror.ErrInterrupted
+	}
+	defer f.mu.Unlock()
+
+	return f.FileOperations.ConfigureMMap(ctx, f, opts)
+}
+
+// MappedName implements memmap.MappingIdentity.MappedName.
+func (f *File) MappedName(ctx context.Context) string {
+	name, _ := f.Dirent.FullName(RootFromContext(ctx))
+	return name
+}
+
+// DeviceID implements memmap.MappingIdentity.DeviceID.
+func (f *File) DeviceID() uint64 {
+	return f.Dirent.Inode.StableAttr.DeviceID
+}
+
+// InodeID implements memmap.MappingIdentity.InodeID.
+func (f *File) InodeID() uint64 {
+	return f.Dirent.Inode.StableAttr.InodeID
+}
+
+// Msync implements memmap.MappingIdentity.Msync.
+func (f *File) Msync(ctx context.Context, mr memmap.MappableRange) error {
+	return f.Fsync(ctx, int64(mr.Start), int64(mr.End-1), SyncData)
+}
+
+// FileReader implements io.Reader and io.ReaderAt.
+type FileReader struct {
+	// Ctx is the context for the file reader.
+	Ctx context.Context
+
+	// File is the file to read from.
+	File *File
+}
+
+// Read implements io.Reader.Read.
+func (r *FileReader) Read(buf []byte) (int, error) {
+	n, err := r.File.Readv(r.Ctx, usermem.BytesIOSequence(buf))
+	return int(n), err
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (r *FileReader) ReadAt(buf []byte, offset int64) (int, error) {
+	n, err := r.File.Preadv(r.Ctx, usermem.BytesIOSequence(buf), offset)
+	return int(n), err
+}
+
+// FileWriter implements io.Writer and io.WriterAt.
+type FileWriter struct {
+	// Ctx is the context for the file writer.
+	Ctx context.Context
+
+	// File is the file to write to.
+	File *File
+}
+
+// Write implements io.Writer.Write.
+func (w *FileWriter) Write(buf []byte) (int, error) {
+	n, err := w.File.Writev(w.Ctx, usermem.BytesIOSequence(buf))
+	return int(n), err
+}
+
+// WriteAt implements io.WriterAt.WriteAt.
+func (w *FileWriter) WriteAt(buf []byte, offset int64) (int, error) {
+	n, err := w.File.Pwritev(w.Ctx, usermem.BytesIOSequence(buf), offset)
+	return int(n), err
+}
diff --git a/pkg/sentry/fs/file_operations.go b/pkg/sentry/fs/file_operations.go
new file mode 100644
index 000000000..d223bb5c7
--- /dev/null
+++ b/pkg/sentry/fs/file_operations.go
@@ -0,0 +1,106 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
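// FileReader and FileWriter, defined at the end of file.go above, adapt a
// File to the standard io interfaces. A short usage sketch, assuming ctx is a
// context.Context and file is an open *fs.File with FileFlags.Read set:
//
//	r := &fs.FileReader{Ctx: ctx, File: file}
//	data, err := ioutil.ReadAll(r) // drives File.Readv and advances the offset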
+ +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// FileOperations are operations on a File that diverge per file system. +// +// Operations that take a *File may use only the following interfaces: +// +// - File.UniqueID: Operations may only read this value. +// - File.Dirent: Operations must not take or drop a reference. +// - File.Offset(): This value is guaranteed to not change for the +// duration of the operation. +// - File.Flags(): This value may change during the operation. +type FileOperations interface { + // Release release resources held by FileOperations. + Release() + + // Waitable defines how this File can be waited on for read and + // write readiness. + waiter.Waitable + + // Seek seeks to offset based on SeekWhence. Returns the new + // offset or no change in the offset and an error. + Seek(ctx context.Context, file *File, whence SeekWhence, offset int64) (int64, error) + + // Readdir reads the directory entries of file and serializes them + // using serializer. + // + // Returns the new directory offset or no change in the offset and + // an error. The offset returned must not be less than file.Offset(). + // + // Serialization of directory entries must not happen asynchronously. + Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) + + // Read reads from file into dst at offset and returns the number + // of bytes read which must be greater than or equal to 0. File + // systems that do not support reading at an offset, (i.e. pipefs, + // sockfs) may ignore the offset. These file systems are expected + // to construct Files with !FileFlags.Pread. + // + // Read may return a nil error and only partially fill dst (at or + // before EOF). If the file represents a symlink, Read reads the target + // value of the symlink. + // + // Read does not check permissions nor flags. + // + // Read must not be called if !FileFlags.Read. + Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) + + // Write writes src to file at offset and returns the number of bytes + // written which must be greater than or equal to 0. Like Read, file + // systems that do not support writing at an offset (i.e. pipefs, sockfs) + // may ignore the offset. These file systems are expected to construct + // Files with !FileFlags.Pwrite. + // + // If only part of src could be written, Write must return an error + // indicating why (e.g. syserror.ErrWouldBlock). + // + // Write does not check permissions nor flags. + // + // Write must not be called if !FileFlags.Write. + Write(ctx context.Context, file *File, src usermem.IOSequence, offset int64) (int64, error) + + // Fsync writes buffered modifications of file and/or flushes in-flight + // operations to backing storage based on syncType. The range to sync is + // [start, end]. The end is inclusive so that the last byte of a maximally + // sized file can be synced. + Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) error + + // Flush this file's buffers/state (on close(2)). + Flush(ctx context.Context, file *File) error + + // ConfigureMMap mutates opts to implement mmap(2) for the file. 
Most + // implementations can either embed fsutil.NoMMap (if they don't support + // memory mapping) or call fsutil.GenericConfigureMMap with the appropriate + // memmap.Mappable. + ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error + + // Ioctl implements the ioctl(2) linux syscall. + // + // io provides access to the virtual memory space to which pointers in args + // refer. + // + // Preconditions: The AddressSpace (if any) that io refers to is activated. + Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) +} diff --git a/pkg/sentry/fs/file_overlay.go b/pkg/sentry/fs/file_overlay.go new file mode 100644 index 000000000..0c6e622b9 --- /dev/null +++ b/pkg/sentry/fs/file_overlay.go @@ -0,0 +1,345 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// overlayFile gets a handle to a file from the upper or lower filesystem +// in an overlay. The caller is responsible for calling File.DecRef on +// the returned file. +func overlayFile(ctx context.Context, inode *Inode, flags FileFlags) (*File, error) { + // Do a song and dance to eventually get to: + // + // File -> single reference + // Dirent -> single reference + // Inode -> multiple references + // + // So that File.DecRef() -> File.destroy -> Dirent.DecRef -> Dirent.destroy, + // and both the transitory File and Dirent can be GC'ed but the Inode + // remains. + + // Take another reference on the Inode. + inode.IncRef() + + // Start with a single reference on the Dirent. It inherits the reference + // we just took on the Inode above. + dirent := NewTransientDirent(inode) + + // Get a File. This will take another reference on the Dirent. + f, err := inode.GetFile(ctx, dirent, flags) + + // Drop the extra reference on the Dirent. Now there's only one reference + // on the dirent, either owned by f (if non-nil), or the Dirent is about + // to be destroyed (if GetFile failed). + dirent.DecRef() + + return f, err +} + +// overlayFileOperations implements FileOperations for a file in an overlay. +type overlayFileOperations struct { + // upperMu protects upper below. In contrast lower is stable. + upperMu sync.Mutex `state:"nosave"` + + // We can't share Files in upper and lower filesystems between all Files + // in an overlay because some file systems expect to get distinct handles + // that are not consistent with each other on open(2). + // + // So we lazily acquire an upper File when the overlayEntry acquires an + // upper Inode (it might have one from the start). This synchronizes with + // copy up. 
+	//
+	// If upper is non-nil and this is not a directory, then lower is ignored.
+	//
+	// For directories, upper and lower are ignored because it is always
+	// necessary to acquire new directory handles so that the directory cursors
+	// of the upper and lower Files are not exhausted.
+	upper *File
+	lower *File
+
+	// dirCursor is a directory cursor for a directory in an overlay.
+	dirCursor string
+
+	// dirCache is a cache of DentAttrs from upper and lower Inodes.
+	dirCache *SortedDentryMap
+}
+
+// Release implements FileOperations.Release.
+func (f *overlayFileOperations) Release() {
+	if f.upper != nil {
+		f.upper.DecRef()
+	}
+	if f.lower != nil {
+		f.lower.DecRef()
+	}
+}
+
+// EventRegister implements FileOperations.EventRegister.
+func (f *overlayFileOperations) EventRegister(we *waiter.Entry, mask waiter.EventMask) {
+	f.upperMu.Lock()
+	defer f.upperMu.Unlock()
+	if f.upper != nil {
+		f.upper.EventRegister(we, mask)
+		return
+	}
+	f.lower.EventRegister(we, mask)
+}
+
+// EventUnregister implements FileOperations.EventUnregister.
+func (f *overlayFileOperations) EventUnregister(we *waiter.Entry) {
+	f.upperMu.Lock()
+	defer f.upperMu.Unlock()
+	if f.upper != nil {
+		f.upper.EventUnregister(we)
+		return
+	}
+	f.lower.EventUnregister(we)
+}
+
+// Readiness implements FileOperations.Readiness.
+func (f *overlayFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
+	f.upperMu.Lock()
+	defer f.upperMu.Unlock()
+	if f.upper != nil {
+		return f.upper.Readiness(mask)
+	}
+	return f.lower.Readiness(mask)
+}
+
+// Seek implements FileOperations.Seek.
+func (f *overlayFileOperations) Seek(ctx context.Context, file *File, whence SeekWhence, offset int64) (int64, error) {
+	f.upperMu.Lock()
+	defer f.upperMu.Unlock()
+
+	var seekDir bool
+	var n int64
+	if f.upper != nil {
+		var err error
+		if n, err = f.upper.FileOperations.Seek(ctx, file, whence, offset); err != nil {
+			return n, err
+		}
+		seekDir = IsDir(f.upper.Dirent.Inode.StableAttr)
+	} else {
+		var err error
+		if n, err = f.lower.FileOperations.Seek(ctx, file, whence, offset); err != nil {
+			return n, err
+		}
+		seekDir = IsDir(f.lower.Dirent.Inode.StableAttr)
+	}
+
+	// If this was a seek on a directory, we must update the cursor.
+	if seekDir && whence == SeekSet && offset == 0 {
+		// Currently only seeking to 0 on a directory is supported.
+		// FIXME: Lift directory seeking limitations.
+		f.dirCursor = ""
+	}
+	return n, nil
+}
+
+// Readdir implements FileOperations.Readdir.
+func (f *overlayFileOperations) Readdir(ctx context.Context, file *File, serializer DentrySerializer) (int64, error) {
+	o := file.Dirent.Inode.overlay
+
+	o.copyMu.RLock()
+	defer o.copyMu.RUnlock()
+
+	var err error
+	f.dirCache, err = readdirEntries(ctx, o)
+	if err != nil {
+		return file.Offset(), err
+	}
+
+	root := RootFromContext(ctx)
+	defer root.DecRef()
+
+	dirCtx := &DirCtx{
+		Serializer: serializer,
+		DirCursor:  &f.dirCursor,
+	}
+	return DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset())
+}
+
+// IterateDir implements DirIterator.IterateDir.
+func (f *overlayFileOperations) IterateDir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) {
+	n, err := GenericReaddir(dirCtx, f.dirCache)
+	return offset + n, err
+}
+
+// Read implements FileOperations.Read.
+func (f *overlayFileOperations) Read(ctx context.Context, file *File, dst usermem.IOSequence, offset int64) (int64, error) { + o := file.Dirent.Inode.overlay + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + // We may need to acquire an open file handle to read from if + // copy up has occurred. Otherwise we risk reading from the + // wrong source. + f.upperMu.Lock() + if f.upper == nil { + var err error + f.upper, err = overlayFile(ctx, o.upper, file.Flags()) + if err != nil { + f.upperMu.Unlock() + log.Warningf("failed to acquire handle with flags %v: %v", file.Flags(), err) + return 0, syserror.EIO + } + } + f.upperMu.Unlock() + return f.upper.FileOperations.Read(ctx, f.upper, dst, offset) + } + return f.lower.FileOperations.Read(ctx, f.lower, dst, offset) +} + +// Write implements FileOperations.Write. +func (f *overlayFileOperations) Write(ctx context.Context, file *File, src usermem.IOSequence, offset int64) (int64, error) { + // f.upper must be non-nil. See inode_overlay.go:overlayGetFile, where the + // file is copied up and opened in the upper filesystem if FileFlags.Write. + // Write cannot be called if !FileFlags.Write, see FileOperations.Write. + return f.upper.FileOperations.Write(ctx, f.upper, src, offset) +} + +// Fsync implements FileOperations.Fsync. +func (f *overlayFileOperations) Fsync(ctx context.Context, file *File, start, end int64, syncType SyncType) error { + var err error + f.upperMu.Lock() + if f.upper != nil { + err = f.upper.FileOperations.Fsync(ctx, f.upper, start, end, syncType) + } + f.upperMu.Unlock() + if f.lower != nil { + // N.B. Fsync on the lower filesystem can cause writes of file + // attributes (i.e. access time) despite the fact that we must + // treat the lower filesystem as read-only. + // + // This matches the semantics of fsync(2) in Linux overlayfs. + err = f.lower.FileOperations.Fsync(ctx, f.lower, start, end, syncType) + } + return err +} + +// Flush implements FileOperations.Flush. +func (f *overlayFileOperations) Flush(ctx context.Context, file *File) error { + // Flush whatever handles we have. + var err error + f.upperMu.Lock() + if f.upper != nil { + err = f.upper.FileOperations.Flush(ctx, f.upper) + } + f.upperMu.Unlock() + if f.lower != nil { + err = f.lower.FileOperations.Flush(ctx, f.lower) + } + return err +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (*overlayFileOperations) ConfigureMMap(ctx context.Context, file *File, opts *memmap.MMapOpts) error { + o := file.Dirent.Inode.overlay + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if !o.isMappableLocked() { + return syserror.ENODEV + } + // FIXME: This is a copy/paste of fsutil.GenericConfigureMMap, + // which we can't use because the overlay implementation is in package fs, + // so depending on fs/fsutil would create a circular dependency. Move + // overlay to fs/overlay. + opts.Mappable = o + opts.MappingIdentity = file + file.IncRef() + return nil +} + +// Ioctl implements fs.FileOperations.Ioctl and always returns ENOTTY. +func (*overlayFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// readdirEntries returns a sorted map of directory entries from the +// upper and/or lower filesystem. +func readdirEntries(ctx context.Context, o *overlayEntry) (*SortedDentryMap, error) { + // Assert that there is at least one upper or lower entry. 
+ if o.upper == nil && o.lower == nil { + panic("invalid overlayEntry, needs at least one Inode") + } + entries := make(map[string]DentAttr) + + // Try the upper filesystem first. + if o.upper != nil { + var err error + entries, err = readdirOne(ctx, NewTransientDirent(o.upper)) + if err != nil { + return nil, err + } + } + + // Try the lower filesystem next. + if o.lower != nil { + lowerEntries, err := readdirOne(ctx, NewTransientDirent(o.lower)) + if err != nil { + return nil, err + } + for name, entry := range lowerEntries { + // Skip this name if it is a negative entry in the + // upper or there exists a whiteout for it. + if o.upper != nil { + if overlayHasWhiteout(o.upper, name) { + continue + } + } + // Prefer the entries from the upper filesystem + // when names overlap. + if _, ok := entries[name]; !ok { + entries[name] = entry + } + } + } + + // Sort and return the entries. + return NewSortedDentryMap(entries), nil +} + +// readdirOne reads all of the directory entries from d. +func readdirOne(ctx context.Context, d *Dirent) (map[string]DentAttr, error) { + dir, err := d.Inode.GetFile(ctx, d, FileFlags{Read: true}) + if err != nil { + return nil, err + } + defer dir.DecRef() + + // Use a stub serializer to read the entries into memory. + stubSerializer := &CollectEntriesSerializer{} + if err := dir.Readdir(ctx, stubSerializer); err != nil { + return nil, err + } + // The "." and ".." entries are from the overlay Inode's Dirent, not the stub. + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries, nil +} diff --git a/pkg/sentry/fs/file_overlay_test.go b/pkg/sentry/fs/file_overlay_test.go new file mode 100644 index 000000000..407ba8562 --- /dev/null +++ b/pkg/sentry/fs/file_overlay_test.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func TestReaddir(t *testing.T) { + ctx := contexttest.Context(t) + ctx = &rootContext{ + Context: ctx, + root: fs.NewDirent(newTestRamfsDir(ctx, nil, nil), "root"), + } + for _, test := range []struct { + // Test description. + desc string + + // Lookup parameters. + dir *fs.Inode + + // Want from lookup. 
+ err error + names []string + }{ + { + desc: "no upper, lower has entries", + dir: fs.NewTestOverlayDir(ctx, + nil, /* upper */ + newTestRamfsDir(ctx, []dirContent{ + {name: "a"}, + {name: "b"}, + }, nil), /* lower */ + ), + names: []string{".", "..", "a", "b"}, + }, + { + desc: "upper has entries, no lower", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + {name: "a"}, + {name: "b"}, + }, nil), /* upper */ + nil, /* lower */ + ), + names: []string{".", "..", "a", "b"}, + }, + { + desc: "upper and lower, entries combine", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + {name: "a"}, + }, nil), /* lower */ + newTestRamfsDir(ctx, []dirContent{ + {name: "b"}, + }, nil), /* lower */ + ), + names: []string{".", "..", "a", "b"}, + }, + { + desc: "upper and lower, entries combine, none are masked", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + {name: "a"}, + }, []string{"b"}), /* lower */ + newTestRamfsDir(ctx, []dirContent{ + {name: "c"}, + }, nil), /* lower */ + ), + names: []string{".", "..", "a", "c"}, + }, + { + desc: "upper and lower, entries combine, upper masks some of lower", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + {name: "a"}, + }, []string{"b"}), /* lower */ + newTestRamfsDir(ctx, []dirContent{ + {name: "b"}, /* will be masked */ + {name: "c"}, + }, nil), /* lower */ + ), + names: []string{".", "..", "a", "c"}, + }, + } { + t.Run(test.desc, func(t *testing.T) { + openDir, err := test.dir.GetFile(ctx, fs.NewDirent(test.dir, "stub"), fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("GetFile got error %v, want nil", err) + } + stubSerializer := &fs.CollectEntriesSerializer{} + err = openDir.Readdir(ctx, stubSerializer) + if err != test.err { + t.Fatalf("Readdir got error %v, want nil", err) + } + if err != nil { + return + } + if !reflect.DeepEqual(stubSerializer.Order, test.names) { + t.Errorf("Readdir got names %v, want %v", stubSerializer.Order, test.names) + } + }) + } +} + +type rootContext struct { + context.Context + root *fs.Dirent +} + +// Value implements context.Context. +func (r *rootContext) Value(key interface{}) interface{} { + switch key { + case fs.CtxRoot: + r.root.IncRef() + return r.root + default: + return r.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/file_state.go b/pkg/sentry/fs/file_state.go new file mode 100644 index 000000000..341cbda0b --- /dev/null +++ b/pkg/sentry/fs/file_state.go @@ -0,0 +1,30 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// afterLoad is invoked by stateify. +func (f *File) afterLoad() { + f.mu.Init() +} + +// saveFlags is invoked by stateify. +func (f *File) saveFlags() FileFlags { + return f.flags.Load().(FileFlags) +} + +// loadFlags is invoked by stateify. 
+func (f *File) loadFlags(flags FileFlags) { + f.flags.Store(flags) +} diff --git a/pkg/sentry/fs/file_test.go b/pkg/sentry/fs/file_test.go new file mode 100644 index 000000000..18aee7101 --- /dev/null +++ b/pkg/sentry/fs/file_test.go @@ -0,0 +1,24 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import "io" + +var ( + _ = io.Reader(&FileReader{}) + _ = io.ReaderAt(&FileReader{}) + _ = io.Writer(&FileWriter{}) + _ = io.WriterAt(&FileWriter{}) +) diff --git a/pkg/sentry/fs/filesystems.go b/pkg/sentry/fs/filesystems.go new file mode 100644 index 000000000..7cd76dfe9 --- /dev/null +++ b/pkg/sentry/fs/filesystems.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sort" + "strings" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// FilesystemFlags matches include/linux/fs.h:file_system_type.fs_flags. +type FilesystemFlags int + +const ( + // FilesystemRequiresDev indicates that the file system requires a device name + // on mount. It is used to construct the output of /proc/filesystems. + FilesystemRequiresDev FilesystemFlags = 1 + + // Currently other flags are not used, but can be pulled in from + // include/linux/fs.h:file_system_type as needed. +) + +// Filesystem is a mountable file system. +type Filesystem interface { + // Name is the unique identifier of the file system. It corresponds to the + // filesystemtype argument of sys_mount and will appear in the output of + // /proc/filesystems. + Name() string + + // Flags indicate common properties of the file system. + Flags() FilesystemFlags + + // Mount generates a mountable Inode backed by device and configured + // using file system independent flags and file system dependent + // data options. + Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) + + // AllowUserMount determines whether mount(2) is allowed to mount a + // file system of this type. + AllowUserMount() bool +} + +// filesystems is the global set of registered file systems. It does not need +// to be saved. Packages registering and unregistering file systems must do so +// before calling save/restore methods. +var filesystems = struct { + // mu protects registered below. + mu sync.Mutex + + // registered is a set of registered Filesystems. 
+ registered map[string]Filesystem +}{ + registered: make(map[string]Filesystem), +} + +// RegisterFilesystem registers a new file system that is visible to mount and +// the /proc/filesystems list. Packages implementing Filesystem should call +// RegisterFilesystem in init(). +func RegisterFilesystem(f Filesystem) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + if _, ok := filesystems.registered[f.Name()]; ok { + panic(fmt.Sprintf("filesystem already registered at %q", f.Name())) + } + filesystems.registered[f.Name()] = f +} + +// UnregisterFilesystem removes a file system from the global set. To keep the +// file system set compatible with save/restore, UnregisterFilesystem must be +// called before save/restore methods. +// +// For instance, packages may unregister their file system after it is mounted. +// This makes sense for pseudo file systems that should not be visible or +// mountable. See whitelistfs in fs/host/fs.go for one example. +func UnregisterFilesystem(name string) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + delete(filesystems.registered, name) +} + +// FindFilesystem returns a Filesystem registered at name or (nil, false) if name +// is not a file system type that can be found in /proc/filesystems. +func FindFilesystem(name string) (Filesystem, bool) { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + f, ok := filesystems.registered[name] + return f, ok +} + +// GetFilesystems returns the set of registered filesystems in a consistent order. +func GetFilesystems() []Filesystem { + filesystems.mu.Lock() + defer filesystems.mu.Unlock() + + var ss []Filesystem + for _, s := range filesystems.registered { + ss = append(ss, s) + } + sort.Slice(ss, func(i, j int) bool { return ss[i].Name() < ss[j].Name() }) + return ss +} + +// MountSourceFlags represents all mount option flags as a struct. +type MountSourceFlags struct { + // ReadOnly corresponds to mount(2)'s "MS_RDONLY" and indicates that + // the filesystem should be mounted read-only. + ReadOnly bool + + // NoAtime corresponds to mount(2)'s "MS_NOATIME" and indicates that + // the filesystem should not update access time in-place. + NoAtime bool + + // ForcePageCache causes all filesystem I/O operations to use the page + // cache, even when the platform supports direct mapped I/O. This + // doesn't correspond to any Linux mount options. + ForcePageCache bool +} + +// GenericMountSourceOptions splits a string containing comma separated tokens of the +// format 'key=value' or 'key' into a map of keys and values. For example: +// +// data = "key0=value0,key1,key2=value2" -> map{'key0':'value0','key1':'','key2':'value2'} +// +// If data contains duplicate keys, then the last token wins. +func GenericMountSourceOptions(data string) map[string]string { + options := make(map[string]string) + if len(data) == 0 { + // Don't return a nil map, callers might not be expecting that. + return options + } + + // Parse options and skip empty ones. 
+ for _, opt := range strings.Split(data, ",") { + if len(opt) > 0 { + res := strings.SplitN(opt, "=", 2) + if len(res) == 2 { + options[res[0]] = res[1] + } else { + options[opt] = "" + } + } + } + return options +} diff --git a/pkg/sentry/fs/filetest/BUILD b/pkg/sentry/fs/filetest/BUILD new file mode 100644 index 000000000..51a390d77 --- /dev/null +++ b/pkg/sentry/fs/filetest/BUILD @@ -0,0 +1,35 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "filetest_state", + srcs = [ + "filetest.go", + ], + out = "filetest_state.go", + package = "filetest", +) + +go_library( + name = "filetest", + testonly = 1, + srcs = [ + "filetest.go", + "filetest_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/filetest", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fs/filetest/filetest.go b/pkg/sentry/fs/filetest/filetest.go new file mode 100644 index 000000000..1831aa82f --- /dev/null +++ b/pkg/sentry/fs/filetest/filetest.go @@ -0,0 +1,59 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package filetest provides a test implementation of an fs.File. +package filetest + +import ( + "fmt" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TestFileOperations is an implementation of the File interface. It provides all +// required methods. +type TestFileOperations struct { + fsutil.NoopRelease `state:"nosave"` + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` +} + +// NewTestFile creates and initializes a new test file. +func NewTestFile(tb testing.TB) *fs.File { + ctx := contexttest.Context(tb) + dirent := fs.NewDirent(anon.NewInode(ctx), "test") + return fs.NewFile(ctx, dirent, fs.FileFlags{}, &TestFileOperations{}) +} + +// Read just fails the request. +func (*TestFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, fmt.Errorf("Readv not implemented") +} + +// Write just fails the request. 
+func (*TestFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, fmt.Errorf("Writev not implemented") +} diff --git a/pkg/sentry/fs/flags.go b/pkg/sentry/fs/flags.go new file mode 100644 index 000000000..dfa6a3d62 --- /dev/null +++ b/pkg/sentry/fs/flags.go @@ -0,0 +1,67 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// FileFlags encodes file flags. +type FileFlags struct { + // Direct indicates that I/O should be done directly. + Direct bool + + // NonBlocking indicates that I/O should not block. + NonBlocking bool + + // Sync indicates that any writes should be synchronous. + Sync bool + + // Append indicates this file is append only. + Append bool + + // Read indicates this file is readable. + Read bool + + // Write indicates this file is writeable. + Write bool + + // Pread indicates this file is readable at an arbitrary offset. + Pread bool + + // Pwrite indicates this file is writable at an arbitrary offset. + Pwrite bool + + // Directory indicates that this file must be a directory. + Directory bool +} + +// SettableFileFlags is a subset of FileFlags above that can be changed +// via fcntl(2) using the F_SETFL command. +type SettableFileFlags struct { + // Direct indicates that I/O should be done directly. + Direct bool + + // NonBlocking indicates that I/O should not block. + NonBlocking bool + + // Append indicates this file is append only. + Append bool +} + +// Settable returns the subset of f that are settable. +func (f FileFlags) Settable() SettableFileFlags { + return SettableFileFlags{ + Direct: f.Direct, + NonBlocking: f.NonBlocking, + Append: f.Append, + } +} diff --git a/pkg/sentry/fs/fs.go b/pkg/sentry/fs/fs.go new file mode 100644 index 000000000..f54f767d3 --- /dev/null +++ b/pkg/sentry/fs/fs.go @@ -0,0 +1,88 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fs implements a virtual filesystem layer. +// +// Specific filesystem implementations must implement the InodeOperations +// interface (inode.go). +// +// The MountNamespace (mounts.go) is used to create a collection of mounts in +// a filesystem rooted at a given Inode. +// +// MountSources (mount.go) form a tree, with each mount holding pointers to its +// parent and children. +// +// Dirents (dirents.go) wrap Inodes in a caching layer. +// +// When multiple locks are to be held at the same time, they should be acquired +// in the following order. 
+// +// Either: +// File.mu +// Locks in FileOperations implementations +// goto Dirent-Locks +// +// Or: +// MountNamespace.mu +// goto Dirent-Locks +// +// Dirent-Locks: +// renameMu +// Dirent.dirMu +// Dirent.mu +// DirentCache.mu +// Locks in InodeOperations implementations or overlayEntry +// Inode.Watches.mu (see `Inotify` for other lock ordering) +// MountSource.mu +// +// If multiple Dirent or MountSource locks must be taken, locks in the parent must be +// taken before locks in their children. +// +// If locks must be taken on multiple unrelated Dirents, renameMu must be taken +// first. See lockForRename. +package fs + +import ( + "sync" +) + +// work is a sync.WaitGroup that can be used to queue asynchronous operations +// via Do. Callers can use Barrier to ensure no operations are outstanding. +var work sync.WaitGroup + +// AsyncBarrier waits for all outstanding asynchronous work to complete. +func AsyncBarrier() { + work.Wait() +} + +// Async executes a function asynchronously. +func Async(f func()) { + work.Add(1) + go func() { // S/R-SAFE: Barrier must be called. + defer work.Done() // Ensure Done in case of panic. + f() + }() +} + +// ErrSaveRejection indicates a failed save due to unsupported file system state +// such as dangling open fd, etc. +type ErrSaveRejection struct { + // Err is the wrapped error. + Err error +} + +// Error returns a sensible description of the save rejection error. +func (e ErrSaveRejection) Error() string { + return "save rejected due to unsupported file system state: " + e.Err.Error() +} diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD new file mode 100644 index 000000000..4fa6395f7 --- /dev/null +++ b/pkg/sentry/fs/fsutil/BUILD @@ -0,0 +1,149 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "fsutil_state", + srcs = [ + "dirty_set_impl.go", + "file.go", + "file_range_set_impl.go", + "frame_ref_set_impl.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "inode.go", + "inode_cached.go", + ], + out = "fsutil_state.go", + package = "fsutil", +) + +go_template_instance( + name = "dirty_set_impl", + out = "dirty_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "Dirty", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "DirtyInfo", + "Functions": "dirtySetFunctions", + }, +) + +go_template_instance( + name = "frame_ref_set_impl", + out = "frame_ref_set_impl.go", + imports = { + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "frameRef", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "platform.FileRange", + "Value": "uint64", + "Functions": "frameRefSetFunctions", + }, +) + +go_template_instance( + name = "file_range_set_impl", + out = "file_range_set_impl.go", + imports = { + "memmap": "gvisor.googlesource.com/gvisor/pkg/sentry/memmap", + "platform": "gvisor.googlesource.com/gvisor/pkg/sentry/platform", + }, + package = "fsutil", + prefix = "FileRange", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "memmap.MappableRange", + "Value": "uint64", + "Functions": 
"fileRangeSetFunctions", + }, +) + +go_library( + name = "fsutil", + srcs = [ + "dirty_set.go", + "dirty_set_impl.go", + "file.go", + "file_range_set.go", + "file_range_set_impl.go", + "frame_ref_set.go", + "frame_ref_set_impl.go", + "fsutil.go", + "fsutil_state.go", + "handle.go", + "host_file_mapper.go", + "host_file_mapper_state.go", + "host_file_mapper_unsafe.go", + "inode.go", + "inode_cached.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "fsutil_x_test", + size = "small", + srcs = ["handle_test.go"], + deps = [ + ":fsutil", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/usermem", + ], +) + +go_test( + name = "fsutil_test", + size = "small", + srcs = [ + "dirty_set_test.go", + "inode_cached_test.go", + ], + embed = [":fsutil"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/fsutil/README.md b/pkg/sentry/fs/fsutil/README.md new file mode 100644 index 000000000..d3780e9fa --- /dev/null +++ b/pkg/sentry/fs/fsutil/README.md @@ -0,0 +1,207 @@ +This package provides utilities for implementing virtual filesystem objects. + +[TOC] + +## Page cache + +`CachingInodeOperations` implements a page cache for files that cannot use the +host page cache. Normally these are files that store their data in a remote +filesystem. This also applies to files that are accessed on a platform that does +not support directly memory mapping host file descriptors (e.g. the ptrace +platform). + +An `CachingInodeOperations` buffers regions of a single file into memory. It is +owned by an `fs.Inode`, the in-memory representation of a file (all open file +descriptors are backed by an `fs.Inode`). The `fs.Inode` provides operations for +reading memory into an `CachingInodeOperations`, to represent the contents of +the file in-memory, and for writing memory out, to relieve memory pressure on +the kernel and to synchronize in-memory changes to filesystems. + +An `CachingInodeOperations` enables readable and/or writable memory access to +file content. Files can be mapped shared or private, see mmap(2). When a file is +mapped shared, changes to the file via write(2) and truncate(2) are reflected in +the shared memory region. Conversely, when the shared memory region is modified, +changes to the file are visible via read(2). Multiple shared mappings of the +same file are coherent with each other. This is consistent with Linux. + +When a file is mapped private, updates to the mapped memory are not visible to +other memory mappings. Updates to the mapped memory are also not reflected in +the file content as seen by read(2). If the file is changed after a private +mapping is created, for instance by write(2), the change to the file may or may +not be reflected in the private mapping. This is consistent with Linux. 
+ +An `CachingInodeOperations` keeps track of ranges of memory that were modified +(or "dirtied"). When the file is explicitly synced via fsync(2), only the dirty +ranges are written out to the filesystem. Any error returned indicates a failure +to write all dirty memory of an `CachingInodeOperations` to the filesystem. In +this case the filesystem may be in an inconsistent state. The same operation can +be performed on the shared memory itself using msync(2). If neither fsync(2) nor +msync(2) is performed, then the dirty memory is written out in accordance with +the `CachingInodeOperations` eviction strategy (see below) and there is no +guarantee that memory will be written out successfully in full. + +### Memory allocation and eviction + +An `CachingInodeOperations` implements the following allocation and eviction +strategy: + +- Memory is allocated and brought up to date with the contents of a file when + a region of mapped memory is accessed (or "faulted on"). + +- Dirty memory is written out to filesystems when an fsync(2) or msync(2) + operation is performed on a memory mapped file, for all memory mapped files + when saved, and/or when there are no longer any memory mappings of a range + of a file, see munmap(2). As the latter implies, in the absence of a panic + or SIGKILL, dirty memory is written out for all memory mapped files when an + application exits. + +- Memory is freed when there are no longer any memory mappings of a range of a + file (e.g. when an application exits). This behavior is consistent with + Linux for shared memory that has been locked via mlock(2). + +Notably, memory is not allocated for read(2) or write(2) operations. This means +that reads and writes to the file are only accelerated by an +`CachingInodeOperations` if the file being read or written has been memory +mapped *and* if the shared memory has been accessed at the region being read or +written. This diverges from Linux which buffers memory into a page cache on +read(2) proactively (i.e. readahead) and delays writing it out to filesystems on +write(2) (i.e. writeback). The absence of these optimizations is not visible to +applications beyond less than optimal performance when repeatedly reading and/or +writing to same region of a file. See [Future Work](#future-work) for plans to +implement these optimizations. + +Additionally, memory held by `CachingInodeOperationss` is currently unbounded in +size. An `CachingInodeOperations` does not write out dirty memory and free it +under system memory pressure. This can cause pathological memory usage. + +When memory is written back, an `CachingInodeOperations` may write regions of +shared memory that were never modified. This is due to the strategy of +minimizing page faults (see below) and handling only a subset of memory write +faults. In the absence of an application or sentry crash, it is guaranteed that +if a region of shared memory was written to, it is written back to a filesystem. + +### Life of a shared memory mapping + +A file is memory mapped via mmap(2). For example, if `A` is an address, an +application may execute: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +This creates a shared mapping of fd that reflects 4k of the contents of fd +starting at offset 0, accessible at address `A`. This in turn creates a virtual +memory area region ("vma") which indicates that [`A`, `A`+0x1000) is now a valid +address range for this application to access. 
+ +At this point, memory has not been allocated in the file's +`CachingInodeOperations`. It is also the case that the address range [`A`, +`A`+0x1000) has not been mapped on the host on behalf of the application. If the +application then tries to modify 8 bytes of the shared memory: + +``` +char buffer[] = "aaaaaaaa"; +memcpy(A, buffer, 8); +``` + +The host then sends a `SIGSEGV` to the sentry because the address range [`A`, +`A`+8) is not mapped on the host. The `SIGSEGV` indicates that the memory was +accessed writable. The sentry looks up the vma associated with [`A`, `A`+8), +finds the file that was mapped and its `CachingInodeOperations`. It then calls +`CachingInodeOperations.MapInto` which allocates memory to back [`A`, `A`+8). It +may choose to allocate more memory (i.e. do "readahead") to minimize subsequent +faults. + +Memory that is allocated comes from a host tmpfs file (see `filemem.FileMem`). +The host tmpfs file memory is brought up to date with the contents of the mapped +file on its filesystem. The region of the host tmpfs file that reflects the +mapped file is then mapped into the host address space of the application so +that subsequent memory accesses do not repeatedly generate a `SIGSEGV`. + +The range that was allocated, including any extra memory allocation to minimize +faults, is marked dirty due to the write fault. This overcounts dirty memory if +the extra memory allocated is never modified. + +To make the scenario more interesting, imagine that this application spawns +another process and maps the same file in the exact same way: + +``` +mmap(A, 0x1000, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); +``` + +Imagine that this process then tries to modify the file again but with only 4 +bytes: + +``` +char buffer[] = "bbbb"; +memcpy(A, buffer, 4); +``` + +Since the first process has already mapped and accessed the same region of the +file writable, `CachingInodeOperations.MapInto` is called but re-maps the memory +that has already been allocated (because the host mapping can be invalidated at +any time) rather than allocating new memory. The address range [`A`, `A`+0x1000) +reflects the same cached view of the file as the first process sees. For +example, reading 8 bytes from the file from either process via read(2) starting +at offset 0 returns a consistent "bbbbaaaa". + +When this process no longer needs the shared memory, it may do: + +``` +munmap(A, 0x1000); +``` + +At this point, the modified memory cached by the `CachingInodeOperations` is not +written back to the file because it is still in use by the first process that +mapped it. When the first process also does: + +``` +munmap(A, 0x1000); +``` + +Then the last memory mapping of the file at the range [0, 0x1000) is gone. The +file's `CachingInodeOperations` then starts writing back memory marked dirty to +the file on its filesystem. Once writing completes, regardless of whether it was +successful, the `CachingInodeOperations` frees the memory cached at the range +[0, 0x1000). + +Subsequent read(2) or write(2) operations on the file go directly to the +filesystem since there no longer exists memory for it in its +`CachingInodeOperations`. + +## Future Work + +### Page cache + +The sentry does not yet implement the readahead and writeback optimizations for +read(2) and write(2) respectively. To do so, on read(2) and/or write(2) the +sentry must ensure that memory is allocated in a page cache to read or write +into. However, the sentry cannot boundlessly allocate memory. 
If it did, the +host would eventually OOM-kill the sentry+application process. This means that +the sentry must implement a page cache memory allocation strategy that is +bounded by a global user or container imposed limit. When this limit is +approached, the sentry must decide from which page cache memory should be freed +so that it can allocate more memory. If it makes a poor decision, the sentry may +end up freeing and re-allocating memory to back regions of files that are +frequently used, nullifying the optimization (and in some cases causing worse +performance due to the overhead of memory allocation and general management). +This is a form of "cache thrashing". + +In Linux, much research has been done to select and implement a lightweight but +optimal page cache eviction algorithm. Linux makes use of hardware page bits to +keep track of whether memory has been accessed. The sentry does not have direct +access to hardware. Implementing a similarly lightweight and optimal page cache +eviction algorithm will need to either introduce a kernel interface to obtain +these page bits or find a suitable alternative proxy for access events. + +In Linux, readahead happens by default but is not always ideal. For instance, +for files that are not read sequentially, it would be more ideal to simply read +from only those regions of the file rather than to optimistically cache some +number of bytes ahead of the read (up to 2MB in Linux) if the bytes cached won't +be accessed. Linux implements the fadvise64(2) system call for applications to +specify that a range of a file will not be accessed sequentially. The advice bit +FADV_RANDOM turns off the readahead optimization for the given range in the +given file. However fadvise64 is rarely used by applications so Linux implements +a readahead backoff strategy if reads are not sequential. To ensure that +application performance is not degraded, the sentry must implement a similar +backoff strategy. diff --git a/pkg/sentry/fs/fsutil/dirty_set.go b/pkg/sentry/fs/fsutil/dirty_set.go new file mode 100644 index 000000000..9c6c98542 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set.go @@ -0,0 +1,213 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to +// implement Mappables that cache data from another source. +// +// type DirtySet <generated by go_generics> + +// DirtyInfo is the value type of DirtySet, and represents information about a +// Mappable offset that is dirty (the cached data for that offset is newer than +// its source). 
+type DirtyInfo struct { + // Keep is true if the represented offset is concurrently writable, such + // that writing the data for that offset back to the source does not + // guarantee that the offset is clean (since it may be concurrently + // rewritten after the writeback). + Keep bool +} + +// dirtySetFunctions implements segment.Functions for DirtySet. +type dirtySetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (dirtySetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (dirtySetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (dirtySetFunctions) ClearValue(val *DirtyInfo) { +} + +// Merge implements segment.Functions.Merge. +func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) { + if val1 != val2 { + return DirtyInfo{}, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) { + return val, val +} + +// MarkClean marks all offsets in mr as not dirty, except for those to which +// KeepDirty has been applied. +func (ds *DirtySet) MarkClean(mr memmap.MappableRange) { + seg := ds.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + if seg.Value().Keep { + seg = seg.NextSegment() + continue + } + seg = ds.Isolate(seg, mr) + seg = ds.Remove(seg).NextSegment() + } +} + +// KeepClean marks all offsets in mr as not dirty, even those that were +// previously kept dirty by KeepDirty. +func (ds *DirtySet) KeepClean(mr memmap.MappableRange) { + ds.RemoveRange(mr) +} + +// MarkDirty marks all offsets in mr as dirty. +func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) { + ds.setDirty(mr, false) +} + +// KeepDirty marks all offsets in mr as dirty and prevents them from being +// marked as clean by MarkClean. +func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) { + ds.setDirty(mr, true) +} + +func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { + var changedAny bool + defer func() { + if changedAny { + ds.MergeRange(mr) + } + }() + seg, gap := ds.Find(mr.Start) + for { + switch { + case seg.Ok() && seg.Start() < mr.End: + if keep && !seg.Value().Keep { + changedAny = true + seg = ds.Isolate(seg, mr) + seg.ValuePtr().Keep = true + } + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + changedAny = true + seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep}) + seg, gap = seg.NextNonEmpty() + + default: + return + } + } +} + +// SyncDirty passes pages in the range mr that are stored in cache and +// identified as dirty to writeAt, updating dirty to reflect successful writes. +// If writeAt returns a successful partial write, SyncDirty will call it +// repeatedly until all bytes have been written. max is the true size of the +// cached object; offsets beyond max will not be passed to writeAt, even if +// they are marked dirty. 
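+//
+// A hedged sketch of a typical call site (all names here are illustrative, not
+// part of this package): a caching inode implementation holding its data lock
+// might write back a range with
+//
+//	err := SyncDirty(ctx, memmap.MappableRange{0, size}, &cache, &dirty,
+//		size, mem, backingFile.WriteFromBlocksAt)
+//
+// where size is the current file size, cache is a FileRangeSet, dirty is a
+// DirtySet, mem is the platform.File holding the cached pages, and
+// WriteFromBlocksAt writes them back to the source.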
+func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + var changedDirty bool + defer func() { + if changedDirty { + dirty.MergeRange(mr) + } + }() + dseg := dirty.LowerBoundSegment(mr.Start) + for dseg.Ok() && dseg.Start() < mr.End { + var dr memmap.MappableRange + if dseg.Value().Keep { + dr = dseg.Range().Intersect(mr) + } else { + changedDirty = true + dseg = dirty.Isolate(dseg, mr) + dr = dseg.Range() + } + if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// SyncDirtyAll passes all pages stored in cache identified as dirty to +// writeAt, updating dirty to reflect successful writes. If writeAt returns a +// successful partial write, SyncDirtyAll will call it repeatedly until all +// bytes have been written. max is the true size of the cached object; offsets +// beyond max will not be passed to writeAt, even if they are marked dirty. +func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + dseg := dirty.FirstSegment() + for dseg.Ok() { + if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil { + return err + } + if dseg.Value().Keep { + dseg = dseg.NextSegment() + } else { + dseg = dirty.Remove(dseg).NextSegment() + } + } + return nil +} + +// Preconditions: mr must be page-aligned. +func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem platform.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { + for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() { + wbr := cseg.Range().Intersect(mr) + if max < wbr.Start { + break + } + ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), usermem.Read) + if err != nil { + return err + } + if max < wbr.End { + ims = ims.TakeFirst64(max - wbr.Start) + } + offset := wbr.Start + for !ims.IsEmpty() { + n, err := writeAt(ctx, ims, offset) + if err != nil { + return err + } + offset += n + ims = ims.DropFirst64(n) + } + } + return nil +} diff --git a/pkg/sentry/fs/fsutil/dirty_set_test.go b/pkg/sentry/fs/fsutil/dirty_set_test.go new file mode 100644 index 000000000..f7693cb19 --- /dev/null +++ b/pkg/sentry/fs/fsutil/dirty_set_test.go @@ -0,0 +1,38 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsutil + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestDirtySet(t *testing.T) { + var set DirtySet + set.MarkDirty(memmap.MappableRange{0, 2 * usermem.PageSize}) + set.KeepDirty(memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize}) + set.MarkClean(memmap.MappableRange{0, 2 * usermem.PageSize}) + want := &DirtySegmentDataSlices{ + Start: []uint64{usermem.PageSize}, + End: []uint64{2 * usermem.PageSize}, + Values: []DirtyInfo{{Keep: true}}, + } + if got := set.ExportSortedSlices(); !reflect.DeepEqual(got, want) { + t.Errorf("set:\n\tgot %v,\n\twant %v", got, want) + } +} diff --git a/pkg/sentry/fs/fsutil/file.go b/pkg/sentry/fs/fsutil/file.go new file mode 100644 index 000000000..a7329f1c9 --- /dev/null +++ b/pkg/sentry/fs/fsutil/file.go @@ -0,0 +1,267 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NoopRelease implements FileOperations.Release for files that have no +// resources to release. +type NoopRelease struct{} + +// Release is a no-op. +func (NoopRelease) Release() {} + +// SeekWithDirCursor is used to implement fs.FileOperations.Seek. If dirCursor +// is not nil and the seek was on a directory, the cursor will be updated. +// +// Currenly only seeking to 0 on a directory is supported. +// +// FIXME: Lift directory seeking limitations. +func SeekWithDirCursor(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64, dirCursor *string) (int64, error) { + inode := file.Dirent.Inode + current := file.Offset() + + // Does the Inode represents a non-seekable type? + if fs.IsPipe(inode.StableAttr) || fs.IsSocket(inode.StableAttr) { + return current, syserror.ESPIPE + } + + // Does the Inode represent a character device? + if fs.IsCharDevice(inode.StableAttr) { + // Ignore seek requests. + // + // FIXME: This preserves existing + // behavior but is not universally correct. + return 0, nil + } + + // Otherwise compute the new offset. + switch whence { + case fs.SeekSet: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if offset < 0 { + return current, syserror.EINVAL + } + return offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_SET to 0 moves the directory "cursor" to the beginning. 
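+			// Clearing the cursor makes the next Readdir start over from
+			// the first entry, matching Linux's behavior for
+			// lseek(fd, 0, SEEK_SET) on a directory (the effect of
+			// rewinddir(3)).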
+ if dirCursor != nil { + *dirCursor = "" + } + return 0, nil + default: + return current, syserror.EINVAL + } + case fs.SeekCurrent: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.SpecialFile, fs.BlockDevice: + if current+offset < 0 { + return current, syserror.EINVAL + } + return current + offset, nil + case fs.Directory, fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + return current, nil + default: + return current, syserror.EINVAL + } + case fs.SeekEnd: + switch inode.StableAttr.Type { + case fs.RegularFile, fs.BlockDevice: + // Allow the file to determine the end. + uattr, err := inode.UnstableAttr(ctx) + if err != nil { + return current, err + } + sz := uattr.Size + if sz+offset < 0 { + return current, syserror.EINVAL + } + return sz + offset, nil + // FIXME: This is not universally correct. + // Remove SpecialDirectory. + case fs.SpecialDirectory: + if offset != 0 { + return current, syserror.EINVAL + } + // SEEK_END to 0 moves the directory "cursor" to the end. + // + // FIXME: The ensures that after the seek, + // reading on the directory will get EOF. But it is not + // correct in general because the directory can grow in + // size; attempting to read those new entries will be + // futile (EOF will always be the result). + return fs.FileMaxOffset, nil + default: + return current, syserror.EINVAL + } + } + + // Not a valid seek request. + return current, syserror.EINVAL +} + +// GenericSeek implements FileOperations.Seek for files that use a generic +// seek implementation. +type GenericSeek struct{} + +// Seek implements fs.FileOperations.Seek. +func (GenericSeek) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return SeekWithDirCursor(ctx, file, whence, offset, nil) +} + +// ZeroSeek implements FileOperations.Seek for files that maintain a constant +// zero-value offset and require a no-op Seek. +type ZeroSeek struct{} + +// Seek implements FileOperations.Seek. +func (ZeroSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, nil +} + +// PipeSeek implements FileOperations.Seek and can be used for files that behave +// like pipes (seeking is not supported). +type PipeSeek struct{} + +// Seek implements FileOperations.Seek. +func (PipeSeek) Seek(context.Context, *fs.File, fs.SeekWhence, int64) (int64, error) { + return 0, syserror.ESPIPE +} + +// NotDirReaddir implements FileOperations.Readdir for non-directories. +type NotDirReaddir struct{} + +// Readdir implements FileOperations.NotDirReaddir. +func (NotDirReaddir) Readdir(context.Context, *fs.File, fs.DentrySerializer) (int64, error) { + return 0, syserror.ENOTDIR +} + +// NoFsync implements FileOperations.Fsync for files that don't support syncing. +type NoFsync struct{} + +// Fsync implements FileOperations.Fsync. +func (NoFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { + return syserror.EINVAL +} + +// NoopFsync implements FileOperations.Fsync for files that don't need to synced. +type NoopFsync struct{} + +// Fsync implements FileOperations.Fsync. +func (NoopFsync) Fsync(context.Context, *fs.File, int64, int64, fs.SyncType) error { + return nil +} + +// NoopFlush implements FileOperations.Flush as a no-op. +type NoopFlush struct{} + +// Flush implements FileOperations.Flush. +func (NoopFlush) Flush(context.Context, *fs.File) error { + return nil +} + +// NoMMap implements fs.FileOperations.Mappable for files that cannot +// be memory mapped. 
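+//
+// Like the other helpers in this file, NoMMap is intended to be embedded in a
+// concrete fs.FileOperations implementation, for example (a hypothetical
+// sketch; readOnlyFileOperations is not a real type in this package):
+//
+//	type readOnlyFileOperations struct {
+//		NoopRelease `state:"nosave"`
+//		GenericSeek `state:"nosave"`
+//		NoFsync     `state:"nosave"`
+//		NoopFlush   `state:"nosave"`
+//		NoMMap      `state:"nosave"`
+//		NoIoctl     `state:"nosave"`
+//		// ... plus Read, Write and Readdir for the file's real behavior.
+//	}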
+type NoMMap struct{} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (NoMMap) ConfigureMMap(context.Context, *fs.File, *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// GenericConfigureMMap implements fs.FileOperations.ConfigureMMap for most +// filesystems that support memory mapping. +func GenericConfigureMMap(file *fs.File, m memmap.Mappable, opts *memmap.MMapOpts) error { + opts.Mappable = m + opts.MappingIdentity = file + file.IncRef() + return nil +} + +// NoIoctl implements fs.FileOperations.Ioctl for files that don't implement +// the ioctl syscall. +type NoIoctl struct{} + +// Ioctl implements fs.FileOperations.Ioctl. +func (NoIoctl) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + return 0, syserror.ENOTTY +} + +// DirFileOperations implements FileOperations for directories. +type DirFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + NoopRelease `state:"nosave"` + GenericSeek `state:"nosave"` + NoFsync `state:"nosave"` + NoopFlush `state:"nosave"` + NoMMap `state:"nosave"` + NoIoctl `state:"nosave"` + + // dentryMap is a SortedDentryMap used to implement Readdir. + dentryMap *fs.SortedDentryMap + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string +} + +// NewDirFileOperations returns a new DirFileOperations that will iterate the +// given denty map. +func NewDirFileOperations(dentries *fs.SortedDentryMap) *DirFileOperations { + return &DirFileOperations{ + dentryMap: dentries, + } +} + +// IterateDir implements DirIterator.IterateDir. +func (dfo *DirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + n, err := fs.GenericReaddir(dirCtx, dfo.dentryMap) + return offset + n, err +} + +// Readdir implements FileOperations.Readdir. +func (dfo *DirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &dfo.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, dfo, root, dirCtx, file.Offset()) +} + +// Read implements FileOperations.Read +func (*DirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileOperations.Write. +func (*DirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go new file mode 100644 index 000000000..da6949ccb --- /dev/null +++ b/pkg/sentry/fs/fsutil/file_range_set.go @@ -0,0 +1,208 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsutil + +import ( + "fmt" + "io" + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// FileRangeSet maps offsets into a memmap.Mappable to offsets into a +// platform.File. It is used to implement Mappables that store data in +// sparsely-allocated memory. +// +// type FileRangeSet <generated by go_generics> + +// fileRangeSetFunctions implements segment.Functions for FileRangeSet. +type fileRangeSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (fileRangeSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (fileRangeSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (fileRangeSetFunctions) ClearValue(_ *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) { + if frstart1+mr1.Length() != frstart2 { + return 0, false + } + return frstart1, true +} + +// Split implements segment.Functions.Split. +func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) { + return frstart, frstart + (split - mr.Start) +} + +// FileRange returns the FileRange mapped by seg. +func (seg FileRangeIterator) FileRange() platform.FileRange { + return seg.FileRangeOf(seg.Range()) +} + +// FileRangeOf returns the FileRange mapped by mr. +// +// Preconditions: seg.Range().IsSupersetOf(mr). mr.Length() != 0. +func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) platform.FileRange { + frstart := seg.Value() + (mr.Start - seg.Start()) + return platform.FileRange{frstart, frstart + mr.Length()} +} + +// Fill attempts to ensure that all memmap.Mappable offsets in required are +// mapped to a platform.File offset, by allocating from mem with the given +// memory usage kind and invoking readAt to store data into memory. (If readAt +// returns a successful partial read, Fill will call it repeatedly until all +// bytes have been read.) EOF is handled consistently with the requirements of +// mmap(2): bytes after EOF on the same page are zeroed; pages after EOF are +// invalid. +// +// Fill may read offsets outside of required, but will never read offsets +// outside of optional. It returns a non-nil error if any error occurs, even +// if the error only affects offsets in optional, but not in required. +// +// Preconditions: required.Length() > 0. optional.IsSupersetOf(required). +// required and optional must be page-aligned. +func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, mem platform.Memory, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error { + gap := frs.LowerBoundGap(required.Start) + for gap.Ok() && gap.Start() < required.End { + if gap.Range().Length() == 0 { + gap = gap.NextGap() + continue + } + gr := gap.Range().Intersect(optional) + + // Read data into the gap. 
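+		// AllocateAndFill reserves gr.Length() bytes from mem and fills them
+		// by invoking the ReaderFunc below, which loops over readAt until the
+		// gap is full, EOF is reached, or an error occurs.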
+ fr, err := platform.AllocateAndFill(mem, gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) { + var done uint64 + for !dsts.IsEmpty() { + n, err := readAt(ctx, dsts, gr.Start+done) + done += n + dsts = dsts.DropFirst64(n) + if err != nil { + if err == io.EOF { + // platform.AllocateAndFill truncates down to a page + // boundary, but FileRangeSet.Fill is supposed to + // zero-fill to the end of the page in this case. + donepgaddr, ok := usermem.Addr(done).RoundUp() + if donepg := uint64(donepgaddr); ok && donepg != done { + dsts.DropFirst64(donepg - done) + done = donepg + if dsts.IsEmpty() { + return done, nil + } + } + } + return done, err + } + } + return done, nil + })) + + // Store anything we managed to read into the cache. + if done := fr.Length(); done != 0 { + gr.End = gr.Start + done + gap = frs.Insert(gap, gr, fr.Start).NextGap() + } + + if err != nil { + return err + } + } + return nil +} + +// Drop removes segments for memmap.Mappable offsets in mr, freeing the +// corresponding platform.FileRanges. +// +// Preconditions: mr must be page-aligned. +func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mem platform.Memory) { + seg := frs.LowerBoundSegment(mr.Start) + for seg.Ok() && seg.Start() < mr.End { + seg = frs.Isolate(seg, mr) + mem.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } +} + +// DropAll removes all segments in mr, freeing the corresponding +// platform.FileRanges. +func (frs *FileRangeSet) DropAll(mem platform.Memory) { + for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + mem.DecRef(seg.FileRange()) + } + frs.RemoveAll() +} + +// Truncate updates frs to reflect Mappable truncation to the given length: +// bytes after the new EOF on the same page are zeroed, and pages after the new +// EOF are freed. +func (frs *FileRangeSet) Truncate(end uint64, mem platform.Memory) { + pgendaddr, ok := usermem.Addr(end).RoundUp() + if ok { + pgend := uint64(pgendaddr) + + // Free truncated pages. + frs.SplitAt(pgend) + seg := frs.LowerBoundSegment(pgend) + for seg.Ok() { + mem.DecRef(seg.FileRange()) + seg = frs.Remove(seg).NextSegment() + } + + if end == pgend { + return + } + } + + // Here we know end < end.RoundUp(). If the new EOF lands in the + // middle of a page that we have, zero out its contents beyond the new + // length. + seg := frs.FindSegment(end) + if seg.Ok() { + fr := seg.FileRange() + fr.Start += end - seg.Start() + ims, err := mem.MapInternal(fr, usermem.Write) + if err != nil { + // There's no good recourse from here. This means + // that we can't keep cached memory consistent with + // the new end of file. The caller may have already + // updated the file size on their backing file system. + // + // We don't want to risk blindly continuing onward, + // so in the extremely rare cases this does happen, + // we abandon ship. + panic(fmt.Sprintf("Failed to map %v: %v", fr, err)) + } + if _, err := safemem.ZeroSeq(ims); err != nil { + panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err)) + } + } +} diff --git a/pkg/sentry/fs/fsutil/frame_ref_set.go b/pkg/sentry/fs/fsutil/frame_ref_set.go new file mode 100644 index 000000000..14dece315 --- /dev/null +++ b/pkg/sentry/fs/fsutil/frame_ref_set.go @@ -0,0 +1,50 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +type frameRefSetFunctions struct{} + +// MinKey implements segment.Functions.MinKey. +func (frameRefSetFunctions) MinKey() uint64 { + return 0 +} + +// MaxKey implements segment.Functions.MaxKey. +func (frameRefSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +// ClearValue implements segment.Functions.ClearValue. +func (frameRefSetFunctions) ClearValue(val *uint64) { +} + +// Merge implements segment.Functions.Merge. +func (frameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.FileRange, val2 uint64) (uint64, bool) { + if val1 != val2 { + return 0, false + } + return val1, true +} + +// Split implements segment.Functions.Split. +func (frameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) { + return val, val +} diff --git a/pkg/sentry/fs/fsutil/fsutil.go b/pkg/sentry/fs/fsutil/fsutil.go new file mode 100644 index 000000000..6fe4ef13d --- /dev/null +++ b/pkg/sentry/fs/fsutil/fsutil.go @@ -0,0 +1,26 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package fsutil provides utilities for implementing fs.InodeOperations +// and fs.FileOperations: +// +// - For embeddable utilities, see inode.go and file.go. +// +// - For fs.Inodes that require a page cache to be memory mapped, see +// inode_cache.go. +// +// - For fs.Files that implement fs.HandleOps, see handle.go. +// +// - For anon fs.Inodes, see anon.go. +package fsutil diff --git a/pkg/sentry/fs/fsutil/handle.go b/pkg/sentry/fs/fsutil/handle.go new file mode 100644 index 000000000..149c0f84a --- /dev/null +++ b/pkg/sentry/fs/fsutil/handle.go @@ -0,0 +1,126 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Handle implements FileOperations. +// +// FIXME: Remove Handle entirely in favor of individual fs.File +// implementations using simple generic utilities. +type Handle struct { + NoopRelease `state:"nosave"` + NoIoctl `state:"nosave"` + HandleOperations fs.HandleOperations + + // dirCursor is the directory cursor. + dirCursor string +} + +// NewHandle returns a File backed by the Dirent and FileFlags. +func NewHandle(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, hops fs.HandleOperations) *fs.File { + if !fs.IsPipe(dirent.Inode.StableAttr) && !fs.IsSocket(dirent.Inode.StableAttr) { + // Allow reading/writing at an arbitrary offset for non-pipes + // and non-sockets. + flags.Pread = true + flags.Pwrite = true + } + + return fs.NewFile(ctx, dirent, flags, &Handle{HandleOperations: hops}) +} + +// Readiness implements waiter.Waitable.Readiness. +func (h *Handle) Readiness(mask waiter.EventMask) waiter.EventMask { + return h.HandleOperations.Readiness(mask) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (h *Handle) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + h.HandleOperations.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (h *Handle) EventUnregister(e *waiter.Entry) { + h.HandleOperations.EventUnregister(e) +} + +// Readdir implements FileOperations.Readdir. +func (h *Handle) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &h.dirCursor, + } + n, err := fs.DirentReaddir(ctx, file.Dirent, h, root, dirCtx, file.Offset()) + return n, err +} + +// Seek implements FileOperations.Seek. +func (h *Handle) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return SeekWithDirCursor(ctx, file, whence, offset, &h.dirCursor) +} + +// IterateDir implements DirIterator.IterateDir. +func (h *Handle) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return h.HandleOperations.DeprecatedReaddir(ctx, dirCtx, offset) +} + +// Read implements FileOperations.Read. +func (h *Handle) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return h.HandleOperations.DeprecatedPreadv(ctx, dst, offset) +} + +// Write implements FileOperations.Write. +func (h *Handle) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return h.HandleOperations.DeprecatedPwritev(ctx, src, offset) +} + +// Fsync implements FileOperations.Fsync. +func (h *Handle) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + // Write out metadata. + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + // Use DeprecatedFsync to sync disks. + return h.HandleOperations.DeprecatedFsync() + } + panic("invalid sync type") +} + +// Flush implements FileOperations.Flush. 
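+//
+// It forwards directly to the wrapped HandleOperations' DeprecatedFlush.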
+func (h *Handle) Flush(context.Context, *fs.File) error { + return h.HandleOperations.DeprecatedFlush() +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (h *Handle) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + mappable := file.Dirent.Inode.Mappable() + if mappable == nil { + return syserror.ENODEV + } + return GenericConfigureMMap(file, mappable, opts) +} diff --git a/pkg/sentry/fs/fsutil/handle_test.go b/pkg/sentry/fs/fsutil/handle_test.go new file mode 100644 index 000000000..d94c3eb0d --- /dev/null +++ b/pkg/sentry/fs/fsutil/handle_test.go @@ -0,0 +1,227 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil_test + +import ( + "io" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type testInodeOperations struct { + fs.InodeOperations + fs.InodeType + FileSize int64 + writes uint + reads uint +} + +func (t *testInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + return fs.UnstableAttr{Size: t.FileSize}, nil +} + +// Check implements InodeOperations.Check. +func (t *testInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +func (t *testInodeOperations) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + t.reads++ + return t.InodeOperations.DeprecatedPreadv(ctx, dst, offset) +} + +func (t *testInodeOperations) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + t.writes++ + return t.InodeOperations.DeprecatedPwritev(ctx, src, offset) +} + +// testHandle returns a handle for a test node. +// +// The size of the node is fixed at 20 bytes. +func testHandle(t *testing.T, flags fs.FileFlags, nt fs.InodeType) (*fs.File, *testInodeOperations) { + ctx := contexttest.Context(t) + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + n := &testInodeOperations{ + InodeOperations: ramfstest.NewFile(ctx, fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}), + FileSize: 20, + } + d := fs.NewDirent(fs.NewInode(n, m, fs.StableAttr{Type: nt}), "test") + return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), n +} + +func TestHandleOps(t *testing.T) { + h, n := testHandle(t, fs.FileFlags{Read: true, Write: true}, fs.RegularFile) + defer h.DecRef() + + // Make sure a write request works. 
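+	// The write is routed through Handle.Write to the test inode's
+	// DeprecatedPwritev, so the inode's write counter should end up at 1.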
+ if n, err := h.Writev(contexttest.Context(t), usermem.BytesIOSequence([]byte("a"))); n != 1 || err != nil { + t.Fatalf("Writev: got (%d, %v), wanted (1, nil)", n, err) + } + if n.writes != 1 { + t.Errorf("found %d writes, expected 1", n.writes) + } + + // Make sure a read request works. + dst := make([]byte, 1) + if n, err := h.Preadv(contexttest.Context(t), usermem.BytesIOSequence(dst), 0); n != 1 || (err != nil && err != io.EOF) { + t.Errorf("Preadv: got (%d, %v), wanted (1, nil or EOF)", n, err) + } + if dst[0] != 'a' { + t.Errorf("Preadv: read %q, wanted 'a'", dst[0]) + } + if n.reads != 1 { + t.Errorf("found %d reads, expected 1", n.reads) + } +} + +type seekTest struct { + whence fs.SeekWhence + offset int64 + result int64 + err error +} + +type seekSuite struct { + nodeType fs.InodeType + cases []seekTest +} + +// FIXME: This is currently missing fs.SeekEnd tests due to the +// fact that NullInodeOperations returns an error on stat. +func TestHandleSeek(t *testing.T) { + ts := []seekSuite{ + { + nodeType: fs.RegularFile, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + {fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + { + nodeType: fs.Directory, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekCurrent, 11, 0, syscall.EINVAL}, + {fs.SeekCurrent, -6, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, -1, 0, syscall.EINVAL}, + {fs.SeekEnd, 2, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Symlink, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.EINVAL}, + {fs.SeekSet, -5, 0, syscall.EINVAL}, + {fs.SeekSet, 0, 0, syscall.EINVAL}, + {fs.SeekCurrent, 5, 0, syscall.EINVAL}, + {fs.SeekCurrent, -5, 0, syscall.EINVAL}, + {fs.SeekCurrent, 0, 0, syscall.EINVAL}, + {fs.SeekEnd, 5, 0, syscall.EINVAL}, + {fs.SeekEnd, -5, 0, syscall.EINVAL}, + {fs.SeekEnd, 0, 0, syscall.EINVAL}, + }, + }, + { + nodeType: fs.Pipe, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.Socket, + cases: []seekTest{ + {fs.SeekSet, 5, 0, syscall.ESPIPE}, + {fs.SeekSet, -5, 0, syscall.ESPIPE}, + {fs.SeekSet, 0, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, -5, 0, syscall.ESPIPE}, + {fs.SeekCurrent, 0, 0, syscall.ESPIPE}, + {fs.SeekEnd, 5, 0, syscall.ESPIPE}, + {fs.SeekEnd, -5, 0, syscall.ESPIPE}, + {fs.SeekEnd, 0, 0, syscall.ESPIPE}, + }, + }, + { + nodeType: fs.CharacterDevice, + cases: []seekTest{ + {fs.SeekSet, 5, 0, nil}, + {fs.SeekSet, -5, 0, nil}, + {fs.SeekSet, 0, 0, nil}, + {fs.SeekCurrent, 5, 0, nil}, + {fs.SeekCurrent, -5, 0, nil}, + {fs.SeekCurrent, 0, 0, nil}, + {fs.SeekEnd, 5, 0, nil}, + {fs.SeekEnd, -5, 0, nil}, + {fs.SeekEnd, 0, 0, nil}, + }, + }, + { + nodeType: fs.BlockDevice, + cases: []seekTest{ + {fs.SeekSet, 0, 0, nil}, + {fs.SeekSet, 10, 10, nil}, + {fs.SeekSet, -5, 10, syscall.EINVAL}, + {fs.SeekCurrent, -1, 9, nil}, + {fs.SeekCurrent, 2, 11, nil}, + 
{fs.SeekCurrent, -12, 11, syscall.EINVAL}, + {fs.SeekEnd, -1, 19, nil}, + {fs.SeekEnd, 0, 20, nil}, + {fs.SeekEnd, 2, 22, nil}, + }, + }, + } + + for _, s := range ts { + h, _ := testHandle(t, fs.FileFlags{Read: true, Write: true}, s.nodeType) + defer h.DecRef() + + for _, c := range s.cases { + // Try the given seek. + offset, err := h.Seek(contexttest.Context(t), c.whence, c.offset) + if err != c.err { + t.Errorf("seek(%s, %d) on %s had unexpected error: expected %v, got %v", c.whence, c.offset, s.nodeType, c.err, err) + } + if err == nil && offset != c.result { + t.Errorf("seek(%s, %d) on %s had bad result: expected %v, got %v", c.whence, c.offset, s.nodeType, c.result, offset) + } + } + } +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper.go b/pkg/sentry/fs/fsutil/host_file_mapper.go new file mode 100644 index 000000000..d0a27fc1c --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper.go @@ -0,0 +1,209 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// HostFileMapper caches mappings of an arbitrary host file descriptor. It is +// used by implementations of memmap.Mappable that represent a host file +// descriptor. +type HostFileMapper struct { + // HostFile conceptually breaks the file into pieces called chunks, of + // size and alignment chunkSize, and caches mappings of the file on a chunk + // granularity. + + refsMu sync.Mutex `state:"nosave"` + + // refs maps chunk start offsets to the sum of reference counts for all + // pages in that chunk. refs is protected by refsMu. + refs map[uint64]int32 + + mapsMu sync.Mutex `state:"nosave"` + + // mappings maps chunk start offsets to mappings of those chunks, + // obtained by calling syscall.Mmap. mappings is protected by + // mapsMu. + mappings map[uint64]mapping `state:"nosave"` +} + +const ( + chunkShift = usermem.HugePageShift + chunkSize = 1 << chunkShift + chunkMask = chunkSize - 1 +) + +func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 { + return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / usermem.PageSize) +} + +type mapping struct { + addr uintptr + writable bool +} + +// NewHostFileMapper returns a HostFileMapper with no references or cached +// mappings. +func NewHostFileMapper() *HostFileMapper { + return &HostFileMapper{ + refs: make(map[uint64]int32), + mappings: make(map[uint64]mapping), + } +} + +// IncRefOn increments the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. 
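+//
+// As a worked example (illustrative only): with 4KB pages and 2MB chunks, a
+// call with mr = [0, 0x3000) adds three page references to the chunk starting
+// at offset 0, since pagesInChunk counts the pages of mr that fall within that
+// chunk.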
+func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + if refs+pgs < refs { + // Would overflow. + panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + f.refs[chunkStart] = refs + pgs + } +} + +// DecRefOn decrements the reference count on all offsets in mr. +// +// Preconditions: mr.Length() != 0. mr.Start and mr.End must be page-aligned. +func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { + f.refsMu.Lock() + defer f.refsMu.Unlock() + for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { + refs := f.refs[chunkStart] + pgs := pagesInChunk(mr, chunkStart) + switch { + case refs > pgs: + f.refs[chunkStart] = refs - pgs + case refs == pgs: + f.mapsMu.Lock() + delete(f.refs, chunkStart) + if m, ok := f.mappings[chunkStart]; ok { + f.unmapAndRemoveLocked(chunkStart, m) + } + f.mapsMu.Unlock() + case refs < pgs: + panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) + } + } +} + +// MapInternal returns a mapping of offsets in fr from fd. The returned +// safemem.BlockSeq is valid as long as at least one reference is held on all +// offsets in fr or until the next call to UnmapAll. +// +// Preconditions: The caller must hold a reference on all offsets in fr. +func (f *HostFileMapper) MapInternal(fr platform.FileRange, fd int, write bool) (safemem.BlockSeq, error) { + chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + if chunks == 1 { + // Avoid an unnecessary slice allocation. + var seq safemem.BlockSeq + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + seq = safemem.BlockSeqOf(b) + }) + return seq, err + } + blocks := make([]safemem.Block, 0, chunks) + err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { + blocks = append(blocks, b) + }) + return safemem.BlockSeqFromSlice(blocks), err +} + +// Preconditions: f.mapsMu must be locked. 
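+//
+// forEachMappingBlockLocked maps each chunk overlapping fr into the sentry
+// address space (creating or upgrading the host mapping as needed) and passes
+// fn one safemem.Block per chunk, trimmed to the portion of that chunk which
+// overlaps fr.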
+func (f *HostFileMapper) forEachMappingBlockLocked(fr platform.FileRange, fd int, write bool, fn func(safemem.Block)) error { + prot := syscall.PROT_READ + if write { + prot |= syscall.PROT_WRITE + } + for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { + m, ok := f.mappings[chunkStart] + if !ok { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + 0, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } else if write && !m.writable { + addr, _, errno := syscall.Syscall6( + syscall.SYS_MMAP, + m.addr, + chunkSize, + uintptr(prot), + syscall.MAP_SHARED|syscall.MAP_FIXED, + uintptr(fd), + uintptr(chunkStart)) + if errno != 0 { + return errno + } + m = mapping{addr, write} + f.mappings[chunkStart] = m + } + var startOff uint64 + if chunkStart < fr.Start { + startOff = fr.Start - chunkStart + } + endOff := uint64(chunkSize) + if chunkStart+chunkSize > fr.End { + endOff = fr.End - chunkStart + } + fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) + } + return nil +} + +// UnmapAll unmaps all cached mappings. Callers are responsible for +// synchronization with mappings returned by previous calls to MapInternal. +func (f *HostFileMapper) UnmapAll() { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + for chunkStart, m := range f.mappings { + f.unmapAndRemoveLocked(chunkStart, m) + } +} + +// Preconditions: f.mapsMu must be locked. f.mappings[chunkStart] == m. +func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { + if _, _, errno := syscall.Syscall(syscall.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { + // This leaks address space and is unexpected, but is otherwise + // harmless, so complain but don't panic. + log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) + } + delete(f.mappings, chunkStart) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_state.go b/pkg/sentry/fs/fsutil/host_file_mapper_state.go new file mode 100644 index 000000000..57705decd --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_state.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +// afterLoad is invoked by stateify. +func (f *HostFileMapper) afterLoad() { + f.mappings = make(map[uint64]mapping) +} diff --git a/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go new file mode 100644 index 000000000..790f3a5a6 --- /dev/null +++ b/pkg/sentry/fs/fsutil/host_file_mapper_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block { + // We don't control the host file's length, so touching its mappings may + // raise SIGBUS. Thus accesses to it must use safecopy. + return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize) +} diff --git a/pkg/sentry/fs/fsutil/inode.go b/pkg/sentry/fs/fsutil/inode.go new file mode 100644 index 000000000..e1ad07df2 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode.go @@ -0,0 +1,380 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// NewSimpleInodeOperations constructs fs.InodeOperations from InodeSimpleAttributes. +func NewSimpleInodeOperations(i InodeSimpleAttributes) fs.InodeOperations { + return &simpleInodeOperations{InodeSimpleAttributes: i} +} + +// simpleInodeOperations is a simple implementation of Inode. +type simpleInodeOperations struct { + DeprecatedFileOperations `state:"nosave"` + InodeNotDirectory `state:"nosave"` + InodeNotSocket `state:"nosave"` + InodeNotRenameable `state:"nosave"` + InodeNotOpenable `state:"nosave"` + InodeNotVirtual `state:"nosave"` + InodeNotSymlink `state:"nosave"` + InodeNoExtendedAttributes `state:"nosave"` + NoMappable `state:"nosave"` + NoopWriteOut `state:"nosave"` + + InodeSimpleAttributes +} + +// InodeSimpleAttributes implements a subset of the Inode interface. It provides +// read-only access to attributes. +type InodeSimpleAttributes struct { + // FSType is the filesystem type reported by StatFS. + FSType uint64 + + // UAttr are the unstable attributes of the Inode. + UAttr fs.UnstableAttr +} + +// Release implements fs.InodeOperations.Release. +func (i *InodeSimpleAttributes) Release(context.Context) {} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *InodeSimpleAttributes) StatFS(context.Context) (fs.Info, error) { + return fs.Info{Type: i.FSType}, nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. 
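+//
+// It returns the UAttr the InodeSimpleAttributes was constructed with; for
+// example (a hypothetical value, not taken from this package), a read-only
+// virtual file might be described by
+//
+//	InodeSimpleAttributes{
+//		FSType: 0x123, // whatever statfs type the filesystem reports
+//		UAttr: fs.UnstableAttr{
+//			Perms: fs.FilePermissions{User: fs.PermMask{Read: true}},
+//		},
+//	}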
+func (i *InodeSimpleAttributes) UnstableAttr(context.Context, *fs.Inode) (fs.UnstableAttr, error) { + return i.UAttr, nil +} + +// Check implements fs.InodeOperations.Check. +func (i *InodeSimpleAttributes) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// AddLink implements fs.InodeOperations.AddLink. +func (*InodeSimpleAttributes) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +func (*InodeSimpleAttributes) DropLink() {} + +// NotifyStatusChange implements fs.fs.InodeOperations. +func (i *InodeSimpleAttributes) NotifyStatusChange(ctx context.Context) { + i.UAttr.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (*InodeSimpleAttributes) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { + return false +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (*InodeSimpleAttributes) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EINVAL +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (*InodeSimpleAttributes) SetTimestamps(context.Context, *fs.Inode, fs.TimeSpec) error { + return syserror.EINVAL +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*InodeSimpleAttributes) Truncate(context.Context, *fs.Inode, int64) error { + return syserror.EINVAL +} + +// InMemoryAttributes implements utilities for updating in-memory unstable +// attributes and extended attributes. It is not thread-safe. +// +// Users need not initialize Xattrs to non-nil (it will be initialized +// when the first extended attribute is set. +type InMemoryAttributes struct { + Unstable fs.UnstableAttr + Xattrs map[string][]byte +} + +// SetPermissions updates the permissions to p. +func (i *InMemoryAttributes) SetPermissions(ctx context.Context, p fs.FilePermissions) bool { + i.Unstable.Perms = p + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner updates the file owner to owner. +func (i *InMemoryAttributes) SetOwner(ctx context.Context, owner fs.FileOwner) error { + if owner.UID.Ok() { + i.Unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + i.Unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps sets the timestamps to ts. +func (i *InMemoryAttributes) SetTimestamps(ctx context.Context, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + i.Unstable.AccessTime = now + } else { + i.Unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + i.Unstable.ModificationTime = now + } else { + i.Unstable.ModificationTime = ts.MTime + } + } + i.Unstable.StatusChangeTime = now + return nil +} + +// TouchAccessTime updates access time to the current time. +func (i *InMemoryAttributes) TouchAccessTime(ctx context.Context) { + i.Unstable.AccessTime = ktime.NowFromContext(ctx) +} + +// TouchModificationTime updates modification and status change +// time to the current time. +func (i *InMemoryAttributes) TouchModificationTime(ctx context.Context) { + now := ktime.NowFromContext(ctx) + i.Unstable.ModificationTime = now + i.Unstable.StatusChangeTime = now +} + +// TouchStatusChangeTime updates status change time to the current time. 
+func (i *InMemoryAttributes) TouchStatusChangeTime(ctx context.Context) { + i.Unstable.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// Getxattr returns the extended attribute at name or ENOATTR if +// it isn't set. +func (i *InMemoryAttributes) Getxattr(name string) ([]byte, error) { + if value, ok := i.Xattrs[name]; ok { + return value, nil + } + return nil, syserror.ENOATTR +} + +// Setxattr sets the extended attribute at name to value. +func (i *InMemoryAttributes) Setxattr(name string, value []byte) error { + if i.Xattrs == nil { + i.Xattrs = make(map[string][]byte) + } + i.Xattrs[name] = value + return nil +} + +// Listxattr returns the set of all currently set extended attributes. +func (i *InMemoryAttributes) Listxattr() (map[string]struct{}, error) { + names := make(map[string]struct{}, len(i.Xattrs)) + for name := range i.Xattrs { + names[name] = struct{}{} + } + return names, nil +} + +// NoMappable returns a nil memmap.Mappable. +type NoMappable struct{} + +// Mappable implements fs.InodeOperations.Mappable. +func (NoMappable) Mappable(*fs.Inode) memmap.Mappable { + return nil +} + +// NoopWriteOut is a no-op implementation of Inode.WriteOut. +type NoopWriteOut struct{} + +// WriteOut is a no-op. +func (NoopWriteOut) WriteOut(context.Context, *fs.Inode) error { + return nil +} + +// InodeNotDirectory can be used by Inodes that are not directories. +type InodeNotDirectory struct{} + +// Lookup implements fs.InodeOperations.Lookup. +func (InodeNotDirectory) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { + return nil, syserror.ENOTDIR +} + +// Create implements fs.InodeOperations.Create. +func (InodeNotDirectory) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { + return nil, syserror.ENOTDIR +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (InodeNotDirectory) CreateLink(context.Context, *fs.Inode, string, string) error { + return syserror.ENOTDIR +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (InodeNotDirectory) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (InodeNotDirectory) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Bind implements fs.InodeOperations.Bind. +func (InodeNotDirectory) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (InodeNotDirectory) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.ENOTDIR +} + +// Remove implements fs.InodeOperations.Remove. +func (InodeNotDirectory) Remove(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (InodeNotDirectory) RemoveDirectory(context.Context, *fs.Inode, string) error { + return syserror.ENOTDIR +} + +// InodeNotSocket can be used by Inodes that are not sockets. +type InodeNotSocket struct{} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (InodeNotSocket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { + return nil +} + +// InodeNotRenameable can be used by Inodes that cannot be renamed. +type InodeNotRenameable struct{} + +// Rename implements fs.InodeOperations.Rename. 
+func (InodeNotRenameable) Rename(context.Context, *fs.Inode, string, *fs.Inode, string) error { + return syserror.EINVAL +} + +// InodeNotOpenable can be used by Inodes that cannot be opened. +type InodeNotOpenable struct{} + +// GetFile implements fs.InodeOperations.GetFile. +func (InodeNotOpenable) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { + return nil, syserror.EIO +} + +// InodeNotVirtual can be used by Inodes that are not virtual. +type InodeNotVirtual struct{} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (InodeNotVirtual) IsVirtual() bool { + return false +} + +// InodeNotSymlink can be used by Inodes that are not symlinks. +type InodeNotSymlink struct{} + +// Readlink implements fs.InodeOperations.Readlink. +func (InodeNotSymlink) Readlink(context.Context, *fs.Inode) (string, error) { + return "", syserror.ENOLINK +} + +// Getlink implements fs.InodeOperations.Getlink. +func (InodeNotSymlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, syserror.ENOLINK +} + +// InodeNoExtendedAttributes can be used by Inodes that do not support +// extended attributes. +type InodeNoExtendedAttributes struct{} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (InodeNoExtendedAttributes) Getxattr(*fs.Inode, string) ([]byte, error) { + return nil, syserror.EOPNOTSUPP +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (InodeNoExtendedAttributes) Setxattr(*fs.Inode, string, []byte) error { + return syserror.EOPNOTSUPP +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (InodeNoExtendedAttributes) Listxattr(*fs.Inode) (map[string]struct{}, error) { + return nil, syserror.EOPNOTSUPP +} + +// DeprecatedFileOperations panics if any deprecated Inode method is called. +type DeprecatedFileOperations struct{} + +// Readiness implements fs.InodeOperations.Waitable.Readiness. +func (DeprecatedFileOperations) Readiness(waiter.EventMask) waiter.EventMask { + panic("not implemented") +} + +// EventRegister implements fs.InodeOperations.Waitable.EventRegister. +func (DeprecatedFileOperations) EventRegister(*waiter.Entry, waiter.EventMask) { + panic("not implemented") +} + +// EventUnregister implements fs.InodeOperations.Waitable.EventUnregister. +func (DeprecatedFileOperations) EventUnregister(*waiter.Entry) { + panic("not implemented") +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (DeprecatedFileOperations) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (DeprecatedFileOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + panic("not implemented") +} + +// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. +func (DeprecatedFileOperations) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { + panic("not implemented") +} + +// DeprecatedFsync implements fs.InodeOperations.DeprecatedFsync. +func (DeprecatedFileOperations) DeprecatedFsync() error { + panic("not implemented") +} + +// DeprecatedFlush implements fs.InodeOperations.DeprecatedFlush. +func (DeprecatedFileOperations) DeprecatedFlush() error { + panic("not implemented") +} + +// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. 
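+//
+// Like the other DeprecatedFileOperations methods, it panics; inodes embedding
+// this type are not expected to have these legacy paths invoked.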
+func (DeprecatedFileOperations) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { + panic("not implemented") +} diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go new file mode 100644 index 000000000..484668735 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached.go @@ -0,0 +1,845 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "fmt" + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// Lock order (compare the lock order model in mm/mm.go): +// +// CachingInodeOperations.attrMu ("fs locks") +// CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate") +// CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate") +// CachedFileObject locks + +// CachingInodeOperations caches the metadata and content of a CachedFileObject. +// It implements a subset of InodeOperations. As a utility it can be used to +// implement the full set of InodeOperations. Generally it should not be +// embedded to avoid unexpected inherited behavior. +// +// CachingInodeOperations implements Mappable for the CachedFileObject: +// +// - If CachedFileObject.FD returns a value >= 0 and the current platform shares +// a host fd table with the sentry, then the value of CachedFileObject.FD +// will be memory mapped on the host. +// +// - Otherwise, the contents of CachedFileObject are buffered into memory +// managed by the CachingInodeOperations. +// +// Implementations of FileOperations for a CachedFileObject must read and +// write through CachingInodeOperations using Read and Write respectively. +// +// Implementations of InodeOperations.WriteOut must call Sync to write out +// in-memory modifications of data and metadata to the CachedFileObject. +type CachingInodeOperations struct { + // backingFile is a handle to a cached file object. + backingFile CachedFileObject + + // platform is used to allocate memory that caches backingFile's contents. + platform platform.Platform + + // forcePageCache indicates the sentry page cache should be used regardless + // of whether the platform supports host mapped I/O or not. This must not be + // modified after inode creation. + forcePageCache bool + + attrMu sync.Mutex `state:"nosave"` + + // attr is unstable cached metadata. + // + // attr is protected by attrMu. attr.Size is protected by both attrMu and + // dataMu; reading it requires locking either mutex, while mutating it + // requires locking both. 
+ attr fs.UnstableAttr + + // dirtyAttr is metadata that was updated in-place but hasn't yet + // been successfully written out. + // + // dirtyAttr is protected by attrMu. + dirtyAttr fs.AttrMask + + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the cached file object into + // memmap.MappingSpaces. + // + // mappings is protected by mapsMu. + mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // cache maps offsets into the cached file to offsets into + // platform.Memory() that store the file's data. + // + // cache is protected by dataMu. + cache FileRangeSet + + // dirty tracks dirty segments in cache. + // + // dirty is protected by dataMu. + dirty DirtySet + + // hostFileMapper caches internal mappings of backingFile.FD(). + hostFileMapper *HostFileMapper + + // refs tracks active references to data in the cache. + // + // refs is protected by dataMu. + refs frameRefSet +} + +// CachedFileObject is a file that may require caching. +type CachedFileObject interface { + // ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts, + // starting at offset, and returns the number of bytes read. ReadToBlocksAt + // may return a partial read without an error. + ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) + + // WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the + // file, starting at offset, and returns the number of bytes written. + // WriteFromBlocksAt may return a partial write without an error. + WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) + + // SetMaskedAttributes sets the attributes in attr that are true in mask + // on the backing file. + // + // SetMaskedAttributes may be called at any point, regardless of whether + // the file was opened. + SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error + + // Sync instructs the remote filesystem to sync the file to stable storage. + Sync(ctx context.Context) error + + // FD returns a host file descriptor. Return value must be -1 or not -1 + // for the lifetime of the CachedFileObject. + // + // FD is called iff the file has been memory mapped. This implies that + // the file was opened (see fs.InodeOperations.GetFile). + // + // FIXME: This interface seems to be + // fundamentally broken. We should clarify CachingInodeOperation's + // behavior with metadata. + FD() int +} + +// NewCachingInodeOperations returns a new CachingInodeOperations backed by +// a CachedFileObject and its initial unstable attributes. +func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations { + p := platform.FromContext(ctx) + if p == nil { + panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, platform.CtxPlatform)) + } + return &CachingInodeOperations{ + backingFile: backingFile, + platform: p, + forcePageCache: forcePageCache, + attr: uattr, + hostFileMapper: NewHostFileMapper(), + } +} + +// Release implements fs.InodeOperations.Release. +func (c *CachingInodeOperations) Release() { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.dataMu.Lock() + defer c.dataMu.Unlock() + // The cache should be empty (something has gone terribly wrong if we're + // releasing an inode that is still memory-mapped). 
+ if !c.mappings.IsEmpty() || !c.cache.IsEmpty() || !c.dirty.IsEmpty() { + panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s\ncache contents:\n%s\ndirty segments:\n%s", &c.mappings, &c.cache, &c.dirty)) + } +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + c.attrMu.Lock() + defer c.attrMu.Unlock() + return c.attr, nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{Perms: true} + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil { + return false + } + c.attr.Perms = perms + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Perms = true + c.touchStatusChangeTimeLocked(ctx) + return true + +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + masked := fs.AttrMask{ + UID: owner.UID.Ok(), + GID: owner.GID.Ok(), + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil { + return err + } + if owner.UID.Ok() { + c.attr.Owner.UID = owner.UID + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.UID = true + } + if owner.GID.Ok() { + c.attr.Owner.GID = owner.GID + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.GID = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // Replace requests to use the "system time" with the current time to + // ensure that cached timestamps remain consistent with the remote + // filesystem. + now := ktime.NowFromContext(ctx) + if ts.ATimeSetSystemTime { + ts.ATime = now + } + if ts.MTimeSetSystemTime { + ts.MTime = now + } + masked := fs.AttrMask{ + AccessTime: !ts.ATimeOmit, + ModificationTime: !ts.MTimeOmit, + } + if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil { + return err + } + if !ts.ATimeOmit { + c.attr.AccessTime = ts.ATime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.AccessTime = true + } + if !ts.MTimeOmit { + c.attr.ModificationTime = ts.MTime + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.ModificationTime = true + } + c.touchStatusChangeTimeLocked(ctx) + return nil +} + +// Truncate implements fs.InodeOperations.Truncate. +func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + c.attrMu.Lock() + defer c.attrMu.Unlock() + + // c.attr.Size is protected by both c.attrMu and c.dataMu. 
+ c.dataMu.Lock() + if err := c.backingFile.SetMaskedAttributes(ctx, fs.AttrMask{ + Size: true, + }, fs.UnstableAttr{ + Size: size, + }); err != nil { + c.dataMu.Unlock() + return err + } + oldSize := c.attr.Size + if oldSize != size { + c.attr.Size = size + // FIXME: Clarify CachingInodeOperations behavior with metadata. + c.dirtyAttr.Size = true + c.touchModificationTimeLocked(ctx) + } + // We drop c.dataMu here so that we can lock c.mapsMu and invalidate + // mappings below. This allows concurrent calls to Read/Translate/etc. + // These functions synchronize with an in-progress Truncate by refusing to + // use cache contents beyond the new c.attr.Size. (We are still holding + // c.attrMu, so we can't race with Truncate/Write.) + c.dataMu.Unlock() + + // Nothing left to do unless shrinking the file. + if size >= oldSize { + return nil + } + + oldpgend := fs.OffsetPageEnd(oldSize) + newpgend := fs.OffsetPageEnd(size) + + // Invalidate past translations of truncated pages. + if newpgend != oldpgend { + c.mapsMu.Lock() + c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/truncate.c:truncate_setsize() => + // truncate_pagecache() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + c.mapsMu.Unlock() + } + + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them from the cache. Since truncated pages have been + // removed from the backing file, they should be dropped without being + // written back. + c.dataMu.Lock() + defer c.dataMu.Unlock() + c.cache.Truncate(uint64(size), c.platform.Memory()) + c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend}) + + return nil +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + c.attrMu.Lock() + + // Write dirty pages back. + c.dataMu.RLock() + err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt) + c.dataMu.RUnlock() + if err != nil { + c.attrMu.Unlock() + return err + } + + // Write out cached attributes. + if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil { + c.attrMu.Unlock() + return err + } + c.dirtyAttr = fs.AttrMask{} + + c.attrMu.Unlock() + + // Fsync the remote file. + return c.backingFile.Sync(ctx) +} + +// IncLinks increases the link count and updates cached access time. +func (c *CachingInodeOperations) IncLinks(ctx context.Context) { + c.attrMu.Lock() + c.attr.Links++ + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// DecLinks decreases the link count and updates cached access time. +func (c *CachingInodeOperations) DecLinks(ctx context.Context) { + c.attrMu.Lock() + c.attr.Links-- + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// TouchAccessTime updates the cached access time in-place to the +// current time. It does not update status change time in-place. See +// mm/filemap.c:do_generic_file_read -> include/linux/h:file_accessed. +func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) { + if inode.MountSource.Flags.NoAtime { + return + } + + c.attrMu.Lock() + c.touchAccessTimeLocked(ctx) + c.attrMu.Unlock() +} + +// touchAccesstimeLocked updates the cached access time in-place to the current +// time. +// +// Preconditions: c.attrMu is locked for writing. 
+func (c *CachingInodeOperations) touchAccessTimeLocked(ctx context.Context) { + c.attr.AccessTime = ktime.NowFromContext(ctx) + c.dirtyAttr.AccessTime = true +} + +// TouchModificationTime updates the cached modification and status change time +// in-place to the current time. +func (c *CachingInodeOperations) TouchModificationTime(ctx context.Context) { + c.attrMu.Lock() + c.touchModificationTimeLocked(ctx) + c.attrMu.Unlock() +} + +// touchModificationTimeLocked updates the cached modification and status +// change time in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchModificationTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.ModificationTime = now + c.dirtyAttr.ModificationTime = true + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// touchStatusChangeTimeLocked updates the cached status change time +// in-place to the current time. +// +// Preconditions: c.attrMu is locked for writing. +func (c *CachingInodeOperations) touchStatusChangeTimeLocked(ctx context.Context) { + now := ktime.NowFromContext(ctx) + c.attr.StatusChangeTime = now + c.dirtyAttr.StatusChangeTime = true +} + +// Read reads from frames and otherwise directly from the backing file +// into dst starting at offset until dst is full, EOF is reached, or an +// error is encountered. +// +// Read may partially fill dst and return a nil error. +func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? We check for this again in + // inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would + // serialize reads) or c.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO: Separate out c.attr.Size and use atomics instead of + // c.dataMu. + c.dataMu.RLock() + size := c.attr.Size + c.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + c.TouchAccessTime(ctx, file.Dirent.Inode) + return n, err +} + +// Write writes to frames and otherwise directly to the backing file +// from src starting at offset and until src is empty or an error is +// encountered. +// +// If Write partially fills src, a non-nil error is returned. +func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + + c.attrMu.Lock() + defer c.attrMu.Unlock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + c.touchModificationTimeLocked(ctx) + return src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset}) +} + +type inodeReadWriter struct { + ctx context.Context + c *CachingInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.RLock() + defer rw.c.dataMu.RUnlock() + + // Compute the range to read. + if rw.offset >= rw.c.attr.Size { + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size) + if end == rw.offset { // dsts.NumBytes() == 0? 
+ return 0, nil + } + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings from the cache. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Read directly from the backing file. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + // Partial reads are fine. But we must stop reading. + if n != dst.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +// +// Preconditions: rw.c.attrMu must be locked. +func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.c.dataMu.Lock() + defer rw.c.dataMu.Unlock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + return 0, nil + } + + defer func() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.c.attr.Size { + rw.c.attr.Size = rw.offset + rw.c.dirtyAttr.Size = true + } + if rw.offset > rw.c.attr.Usage { + // This is incorrect if CachingInodeOperations is caching a sparse + // file. (In Linux, keeping inode::i_blocks up to date is the + // filesystem's responsibility.) + rw.c.attr.Usage = rw.offset + rw.c.dirtyAttr.Usage = true + } + }() + + mem := rw.c.platform.Memory() + var done uint64 + seg, gap := rw.c.cache.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok() && seg.Start() < mr.End: + // Get internal mappings from the cache. + segMR := seg.Range().Intersect(mr) + ims, err := mem.MapInternal(seg.FileRangeOf(segMR), usermem.Write) + if err != nil { + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + rw.c.dirty.MarkDirty(segMR) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok() && gap.Start() < mr.End: + // Write directly to the backing file. + gapmr := gap.Range().Intersect(mr) + src := srcs.TakeFirst64(gapmr.Length()) + n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + // Partial writes are fine. But we must stop writing. + if n != src.NumBytes() || err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. 
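As the CachingInodeOperations documentation above requires, file operations for a cached file must funnel reads and writes through `Read` and `Write` here, so that the page cache, data written through memory mappings, and the backing file stay coherent. A sketch of what that delegation might look like — `cachedFileOperations`, `myInodeOperations`, and the `cachingOps` field are hypothetical names, and a real implementation would also satisfy the rest of `fs.FileOperations`:

```go
package myfs // hypothetical filesystem package

import (
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
)

// myInodeOperations is a hypothetical fs.InodeOperations that owns the cache.
type myInodeOperations struct {
	cachingOps *fsutil.CachingInodeOperations
}

// cachedFileOperations is (part of) a hypothetical fs.FileOperations for an
// open file backed by myInodeOperations.
type cachedFileOperations struct {
	inodeOps *myInodeOperations
}

// Read forwards to the cache so that reads observe cached pages, including
// data written through memory mappings, and fall back to the backing file
// only for uncached ranges.
func (f *cachedFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	return f.inodeOps.cachingOps.Read(ctx, file, dst, offset)
}

// Write forwards to the cache so that writes land in cached pages where they
// exist and update the cached size and timestamps.
func (f *cachedFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
	return f.inodeOps.cachingOps.Write(ctx, src, offset)
}
```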
+func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + mapped := c.mappings.AddMapping(ms, ar, offset) + // Do this unconditionally since whether we have c.backingFile.FD() >= 0 + // can change across save/restore. + for _, r := range mapped { + c.hostFileMapper.IncRefOn(r) + } + if !usage.IncrementalMappedAccounting && !c.forcePageCache && c.backingFile.FD() >= 0 { + for _, r := range mapped { + usage.MemoryAccounting.Inc(r.Length(), usage.Mapped) + } + } + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + unmapped := c.mappings.RemoveMapping(ms, ar, offset) + for _, r := range unmapped { + c.hostFileMapper.DecRefOn(r) + } + if !c.forcePageCache && c.backingFile.FD() >= 0 { + if !usage.IncrementalMappedAccounting { + for _, r := range unmapped { + usage.MemoryAccounting.Dec(r.Length(), usage.Mapped) + } + } + return + } + + // Writeback dirty mapped memory now that there are no longer any + // mappings that reference it. This is our naive memory eviction + // strategy. + mem := c.platform.Memory() + c.dataMu.Lock() + defer c.dataMu.Unlock() + for _, r := range unmapped { + if err := SyncDirty(ctx, r, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + log.Warningf("Failed to writeback cached data %v: %v", r, err) + } + c.cache.Drop(r, mem) + c.dirty.KeepClean(r) + } +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return c.AddMapping(ctx, ms, dstAR, offset) +} + +// Translate implements memmap.Mappable.Translate. +func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + if !c.forcePageCache && c.backingFile.FD() >= 0 { + return []memmap.Translation{ + { + Source: optional, + File: c, + Offset: optional.Start, + }, + }, nil + } + + c.dataMu.Lock() + defer c.dataMu.Unlock() + + // Constrain translations to c.attr.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(c.attr.Size) + var beyondEOF bool + if required.End > pgend { + if required.Start >= pgend { + return nil, &memmap.BusError{io.EOF} + } + beyondEOF = true + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mem := c.platform.Memory() + cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mem, usage.PageCache, c.backingFile.ReadToBlocksAt) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mem, + Offset: seg.FileRangeOf(segMR).Start, + }) + if at.Write { + // From this point forward, this memory can be dirtied through the + // mapping at any time. + c.dirty.KeepDirty(segMR) + } + translatedEnd = segMR.End + } + + // Don't return the error returned by c.cache.Fill if it occurred outside + // of required. 
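+ // If the cache covers all of required, any error from c.cache.Fill only
+ // limited readahead into optional and can safely be ignored; otherwise it
+ // is reported below as a memmap.BusError.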
+ if translatedEnd < required.End && cerr != nil { + return ts, &memmap.BusError{cerr} + } + if beyondEOF { + return ts, &memmap.BusError{io.EOF} + } + return ts, nil +} + +func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange { + const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily + if required.Length() >= maxReadahead { + return required + } + if optional.Length() <= maxReadahead { + return optional + } + optional.Start = required.Start + if optional.Length() <= maxReadahead { + return optional + } + optional.End = optional.Start + maxReadahead + return optional +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error { + // Whether we have a host fd (and consequently what platform.File is + // mapped) can change across save/restore, so invalidate all translations + // unconditionally. + c.mapsMu.Lock() + defer c.mapsMu.Unlock() + c.mappings.InvalidateAll(memmap.InvalidateOpts{}) + + // Sync the cache's contents so that if we have a host fd after restore, + // the remote file's contents are coherent. + c.dataMu.Lock() + defer c.dataMu.Unlock() + if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.platform.Memory(), c.backingFile.WriteFromBlocksAt); err != nil { + return err + } + + // Discard the cache so that it's not stored in saved state. This is safe + // because per InvalidateUnsavable invariants, no new translations can have + // been returned after we invalidated all existing translations above. + c.cache.DropAll(c.platform.Memory()) + c.dirty.RemoveAll() + + return nil +} + +// MapInto implements platform.File.MapInto. This is used when we directly map +// an underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) MapInto(as platform.AddressSpace, addr usermem.Addr, fr platform.FileRange, at usermem.AccessType, precommit bool) error { + return as.MapFile(addr, c.backingFile.FD(), fr, at, precommit) +} + +// MapInternal implements platform.File.MapInternal. This is used when we +// directly map an underlying host fd and CachingInodeOperations is used as the +// platform.File during translation. +func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) { + return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write) +} + +// IncRef implements platform.File.IncRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. +func (c *CachingInodeOperations) IncRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg, gap := c.refs.Find(fr.Start) + for { + switch { + case seg.Ok() && seg.Start() < fr.End: + seg = c.refs.Isolate(seg, fr) + seg.SetValue(seg.Value() + 1) + seg, gap = seg.NextNonEmpty() + case gap.Ok() && gap.Start() < fr.End: + newRange := gap.Range().Intersect(fr) + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped) + } + seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty() + default: + c.refs.MergeAdjacent(fr) + return + } + } +} + +// DecRef implements platform.File.DecRef. This is used when we directly map an +// underlying host fd and CachingInodeOperations is used as the platform.File +// during translation. 
+func (c *CachingInodeOperations) DecRef(fr platform.FileRange) { + c.dataMu.Lock() + defer c.dataMu.Unlock() + + seg := c.refs.FindSegment(fr.Start) + + for seg.Ok() && seg.Start() < fr.End { + seg = c.refs.Isolate(seg, fr) + if old := seg.Value(); old == 1 { + if usage.IncrementalMappedAccounting { + usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped) + } + seg = c.refs.Remove(seg).NextSegment() + } else { + seg.SetValue(old - 1) + seg = seg.NextSegment() + } + } + c.refs.MergeAdjacent(fr) +} diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go new file mode 100644 index 000000000..996c91849 --- /dev/null +++ b/pkg/sentry/fs/fsutil/inode_cached_test.go @@ -0,0 +1,403 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fsutil + +import ( + "bytes" + "io" + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type noopBackingFile struct{} + +func (noopBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + return dsts.NumBytes(), nil +} + +func (noopBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return srcs.NumBytes(), nil +} + +func (noopBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (noopBackingFile) Sync(context.Context) error { + return nil +} + +func (noopBackingFile) FD() int { + return -1 +} + +func TestSetPermissions(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Perms: fs.FilePermsFromMode(0444), + }) + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + perms := fs.FilePermsFromMode(0777) + if !iops.SetPermissions(ctx, nil, perms) { + t.Fatalf("SetPermissions failed, want success") + } + + // Did permissions change? + if !iops.dirtyAttr.Perms { + t.Fatalf("got perms not dirty, want dirty") + } + if iops.attr.Perms != perms { + t.Fatalf("got perms +%v, want +%v", iops.attr.Perms, perms) + } + + // Did status change time change? 
+ if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("got status change time not dirty, want dirty") + } + if iops.attr.StatusChangeTime.Equal(uattr.StatusChangeTime) { + t.Fatalf("got status change time unchanged") + } +} + +func TestSetTimestamps(t *testing.T) { + ctx := contexttest.Context(t) + for _, test := range []struct { + desc string + ts fs.TimeSpec + wantDirty fs.AttrMask + }{ + { + desc: "noop", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{}, + }, + { + desc: "access time only", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTimeOmit: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "modification time only", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "access and modification time", + ts: fs.TimeSpec{ + ATime: ktime.NowFromContext(ctx), + MTime: ktime.NowFromContext(ctx), + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + { + desc: "system time access and modification time", + ts: fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + }, + wantDirty: fs.AttrMask{ + AccessTime: true, + ModificationTime: true, + StatusChangeTime: true, + }, + }, + } { + t.Run(test.desc, func(t *testing.T) { + ctx := contexttest.Context(t) + + epoch := ktime.ZeroTime + uattr := fs.UnstableAttr{ + AccessTime: epoch, + ModificationTime: epoch, + StatusChangeTime: epoch, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil { + t.Fatalf("SetTimestamps got error %v, want nil", err) + } + if !reflect.DeepEqual(iops.dirtyAttr, test.wantDirty) { + t.Fatalf("dirty got %+v, want %+v", iops.dirtyAttr, test.wantDirty) + } + if iops.dirtyAttr.AccessTime { + if !iops.attr.AccessTime.After(uattr.AccessTime) { + t.Fatalf("diritied access time did not advance, want %v > %v", iops.attr.AccessTime, uattr.AccessTime) + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty access time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + if iops.dirtyAttr.ModificationTime { + if !iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("diritied modification time did not advance") + } + if !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("dirty modification time requires dirty status change time") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not advance") + } + } + }) + } +} + +func TestTruncate(t *testing.T) { + ctx := contexttest.Context(t) + + uattr := fs.UnstableAttr{ + Size: 0, + } + iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/) + defer iops.Release() + + if err := iops.Truncate(ctx, nil, uattr.Size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to be dirtied") + } + var size int64 = 4096 + if err := iops.Truncate(ctx, nil, size); err != nil { + t.Fatalf("Truncate got error %v, want nil", err) + } + if !iops.dirtyAttr.Size { + t.Fatalf("Truncate caused size to not be dirtied") + } + if iops.attr.Size != size { + t.Fatalf("Truncate got %d, want 
%d", iops.attr.Size, size) + } + if !iops.dirtyAttr.ModificationTime || !iops.dirtyAttr.StatusChangeTime { + t.Fatalf("Truncate did not dirty modification and status change time") + } + if !iops.attr.ModificationTime.After(uattr.ModificationTime) { + t.Fatalf("dirtied modification time did not change") + } + if !iops.attr.StatusChangeTime.After(uattr.StatusChangeTime) { + t.Fatalf("dirtied status change time did not change") + } +} + +type sliceBackingFile struct { + data []byte +} + +func newSliceBackingFile(data []byte) *sliceBackingFile { + return &sliceBackingFile{data} +} + +func (f *sliceBackingFile) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + r := safemem.BlockSeqReader{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return r.ReadToBlocks(dsts) +} + +func (f *sliceBackingFile) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + w := safemem.BlockSeqWriter{safemem.BlockSeqOf(safemem.BlockFromSafeSlice(f.data)).DropFirst64(offset)} + return w.WriteFromBlocks(srcs) +} + +func (*sliceBackingFile) SetMaskedAttributes(context.Context, fs.AttrMask, fs.UnstableAttr) error { + return nil +} + +func (*sliceBackingFile) Sync(context.Context) error { + return nil +} + +func (*sliceBackingFile) FD() int { + return -1 +} + +type noopMappingSpace struct{} + +// Invalidate implements memmap.MappingSpace.Invalidate. +func (noopMappingSpace) Invalidate(ar usermem.AddrRange, opts memmap.InvalidateOpts) { +} + +func anonInode(ctx context.Context) *fs.Inode { + return fs.NewInode(NewSimpleInodeOperations(InodeSimpleAttributes{ + UAttr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: fs.FilePermissions{ + User: fs.PermMask{Read: true, Write: true}, + }, + Links: 1, + }), + }), fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), fs.StableAttr{ + Type: fs.Anonymous, + BlockSize: usermem.PageSize, + }) +} + +func pagesOf(bs ...byte) []byte { + buf := make([]byte, 0, len(bs)*usermem.PageSize) + for _, b := range bs { + buf = append(buf, bytes.Repeat([]byte{b}, usermem.PageSize)...) + } + return buf +} + +func TestRead(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 3-page file. + buf := pagesOf('a', 'b', 'c') + file := fs.NewFile(ctx, fs.NewDirent(anonInode(ctx), "anon"), fs.FileFlags{}, nil) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second page (as CachingInodeOperations + // expects to only cache mapped pages), then call Translate to force it to + // be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 2 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + mr := memmap.MappableRange{usermem.PageSize, 2 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, usermem.PageSize) + } + + // Try to read 4 pages. 
The first and third pages should be read directly + // from the "file", the second page should be read from the cache, and only + // 3 pages (the size of the file) should be readable. + rbuf := make([]byte, 4*usermem.PageSize) + dst := usermem.BytesIOSequence(rbuf) + n, err := iops.Read(ctx, file, dst, 0) + if n != 3*usermem.PageSize || (err != nil && err != io.EOF) { + t.Fatalf("Read got (%d, %v), want (%d, nil or EOF)", n, err, 3*usermem.PageSize) + } + rbuf = rbuf[:3*usermem.PageSize] + + // Did we get the bytes we expect? + if !bytes.Equal(rbuf, buf) { + t.Errorf("Read back bytes %v, want %v", rbuf, buf) + } + + // Delete the memory mapping and expect it to cause the cached page to be + // uncached. + iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + if cached := iops.cache.Span(); cached != 0 { + t.Fatalf("Span got %d, want 0", cached) + } +} + +func TestWrite(t *testing.T) { + ctx := contexttest.Context(t) + + // Construct a 4-page file. + buf := pagesOf('a', 'b', 'c', 'd') + orig := append([]byte(nil), buf...) + inode := anonInode(ctx) + uattr := fs.UnstableAttr{ + Size: int64(len(buf)), + } + iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/) + defer iops.Release() + + // Expect the cache to be initially empty. + if cached := iops.cache.Span(); cached != 0 { + t.Errorf("Span got %d, want 0", cached) + } + + // Create a memory mapping of the second and third pages (as + // CachingInodeOperations expects to only cache mapped pages), then call + // Translate to force them to be cached. + var ms noopMappingSpace + ar := usermem.AddrRange{usermem.PageSize, 3 * usermem.PageSize} + if err := iops.AddMapping(ctx, ms, ar, usermem.PageSize); err != nil { + t.Fatalf("AddMapping got %v, want nil", err) + } + defer iops.RemoveMapping(ctx, ms, ar, usermem.PageSize) + mr := memmap.MappableRange{usermem.PageSize, 3 * usermem.PageSize} + if _, err := iops.Translate(ctx, mr, mr, usermem.Read); err != nil { + t.Fatalf("Translate got %v, want nil", err) + } + if cached := iops.cache.Span(); cached != 2*usermem.PageSize { + t.Errorf("SpanRange got %d, want %d", cached, 2*usermem.PageSize) + } + + // Write to the first 2 pages. + wbuf := pagesOf('e', 'f') + src := usermem.BytesIOSequence(wbuf) + n, err := iops.Write(ctx, src, 0) + if n != 2*usermem.PageSize || err != nil { + t.Fatalf("Write got (%d, %v), want (%d, nil)", n, err, 2*usermem.PageSize) + } + + // The first page should have been written directly, since it was not cached. + want := append([]byte(nil), orig...) + copy(want, pagesOf('e')) + if !bytes.Equal(buf, want) { + t.Errorf("File contents are %v, want %v", buf, want) + } + + // Sync back to the "backing file". + if err := iops.WriteOut(ctx, inode); err != nil { + t.Errorf("Sync got %v, want nil", err) + } + + // Now the second page should have been written as well. + copy(want[usermem.PageSize:], pagesOf('f')) + if !bytes.Equal(buf, want) { + t.Errorf("File contents are %v, want %v", buf, want) + } +} diff --git a/pkg/sentry/fs/g3doc/inotify.md b/pkg/sentry/fs/g3doc/inotify.md new file mode 100644 index 000000000..1e99a3357 --- /dev/null +++ b/pkg/sentry/fs/g3doc/inotify.md @@ -0,0 +1,122 @@ +# Inotify + +Inotify implements the like-named filesystem event notification system for the +sentry, see `inotify(7)`. + +## Architecture + +For the most part, the sentry implementation of inotify mirrors the Linux +architecture. Inotify instances (i.e. the fd returned by inotify_init(2)) are +backed by a pseudo-filesystem. 
Events are generated from various places in the +sentry, including the [syscall layer][syscall_dir], the [vfs layer][dirent] and +the [process fd table][fd_map]. Watches are stored in inodes and generated +events are queued to the inotify instance owning the watches for delivery to the +user. + +## Objects + +Here is a brief description of the existing and new objects involved in the +sentry inotify mechanism, and how they interact: + +### [`fs.Inotify`][inotify] + +- An inotify instances, created by inotify_init(2)/inotify_init1(2). +- The inotify fd has a `fs.Dirent`, supports filesystem syscalls to read + events. +- Has multiple `fs.Watch`es, with at most one watch per target inode, per + inotify instance. +- Has an instance `id` which is globally unique. This is *not* the fd number + for this instance, since the fd can be duped. This `id` is not externally + visible. + +### [`fs.Watch`][watch] + +- An inotify watch, created/deleted by + inotify_add_watch(2)/inotify_rm_watch(2). +- Owned by an `fs.Inotify` instance, each watch keeps a pointer to the + `owner`. +- Associated with a single `fs.Inode`, which is the watch `target`. While the + watch is active, it indirectly pins `target` to memory. See the "Reference + Model" section for a detailed explanation. +- Filesystem operations on `target` generate `fs.Event`s. + +### [`fs.Event`][event] + +- A simple struct encapsulating all the fields for an inotify event. +- Generated by `fs.Watch`es and forwarded to the watches' `owner`s. +- Serialized to the user during read(2) syscalls on the associated + `fs.Inotify`'s fd. + +### [`fs.Dirent`][dirent] + +- Many inotify events are generated inside dirent methods. Events are + generated in the dirent methods rather than `fs.Inode` methods because some + events carry the name of the subject node, and node names are generally + unavailable in an `fs.Inode`. +- Dirents do not directly contain state for any watches. Instead, they forward + notifications to the underlying `fs.Inode`. + +### [`fs.Inode`][inode] + +- Interacts with inotify through `fs.Watch`es. +- Inodes contain a map of all active `fs.Watch`es on them. +- An `fs.Inotify` instance can have at most one `fs.Watch` per inode. + `fs.Watch`es on an inode are indexed by their `owner`'s `id`. +- All inotify logic is encapsulated in the [`Watches`][inode_watches] struct + in an inode. Logically, `Watches` is the set of inotify watches on the + inode. + +## Reference Model + +The sentry inotify implementation has a complex reference model. An inotify +watch observes a single inode. For efficient lookup, the state for a watch is +stored directly on the target inode. This state needs to be persistent for the +lifetime of watch. Unlike usual filesystem metadata, the watch state has no +"on-disk" representation, so they cannot be reconstructed by the filesystem if +the inode is flushed from memory. This effectively means we need to keep any +inodes with actives watches pinned to memory. + +We can't just hold an extra ref on the inode to pin it to memory because some +filesystems (such as gofer-based filesystems) don't have persistent inodes. In +such a filesystem, if we just pin the inode, nothing prevents the enclosing +dirent from being GCed. Once the dirent is GCed, the pinned inode is +unreachable -- these filesystems generate a new inode by re-reading the node +state on the next walk. Incidentally, hardlinks also don't work on these +filesystems for this reason. 
+ +To prevent the above scenario, when a new watch is added on an inode, we *pin* +the dirent we used to reach the inode. Note that due to hardlinks, this dirent +may not be the only dirent pointing to the inode. Attempting to set an inotify +watch via multiple hardlinks to the same file results in the same watch being +returned for both links. However, for each new dirent we use to reach the same +inode, we add a new pin. We need a new pin for each new dirent used to reach the +inode because we have no guarantees about the deletion order of the different +links to the inode. + +## Lock Ordering + +There are 4 locks related to the inotify implementation: + +- `Inotify.mu`: the inotify instance lock. +- `Inotify.evMu`: the inotify event queue lock. +- `Watch.mu`: the watch lock, used to protect pins. +- `fs.Watches.mu`: the inode watch set mu, used to protect the collection of + watches on the inode. + +The correct lock ordering for inotify code is: + +`Inotify.mu` -> `fs.Watches.mu` -> `Watch.mu` -> `Inotify.evMu`. + +We need a distinct lock for the event queue because by the time a goroutine +attempts to queue a new event, it is already holding `fs.Watches.mu`. If we used +`Inotify.mu` to also protect the event queue, this would violate the above lock +ordering. + +[dirent]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/dirent.go +[event]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify_event.go +[fd_map]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/kernel/fd_map.go +[inode]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inode.go +[inode_watches]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inode_inotify.go +[inotify]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify.go +[syscall_dir]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/syscalls/linux/ +[watch]: https://gvisor.googlesource.com/gvisor/+/master/pkg/sentry/fs/inotify_watch.go diff --git a/pkg/sentry/fs/gofer/BUILD b/pkg/sentry/fs/gofer/BUILD new file mode 100644 index 000000000..ca42b0a54 --- /dev/null +++ b/pkg/sentry/fs/gofer/BUILD @@ -0,0 +1,90 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "gofer_state", + srcs = [ + "file.go", + "file_state.go", + "fs.go", + "inode.go", + "inode_state.go", + "session.go", + "session_state.go", + ], + out = "gofer_state.go", + package = "gofer", +) + +go_library( + name = "gofer", + srcs = [ + "attr.go", + "context_file.go", + "device.go", + "file.go", + "file_state.go", + "fs.go", + "gofer_state.go", + "handles.go", + "inode.go", + "inode_state.go", + "path.go", + "session.go", + "session_state.go", + "socket.go", + "util.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/metric", + "//pkg/p9", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fdpipe", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/host", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", 
+ "//pkg/unet", + "//pkg/waiter", + ], +) + +go_test( + name = "gofer_test", + size = "small", + srcs = ["gofer_test.go"], + embed = [":gofer"], + deps = [ + "//pkg/log", + "//pkg/p9", + "//pkg/p9/p9test", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/unet", + ], +) diff --git a/pkg/sentry/fs/gofer/attr.go b/pkg/sentry/fs/gofer/attr.go new file mode 100644 index 000000000..5e24767f9 --- /dev/null +++ b/pkg/sentry/fs/gofer/attr.go @@ -0,0 +1,162 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// getattr returns the 9p attributes of the p9.File. On success, Mode, Size, and RDev +// are guaranteed to be masked as valid. +func getattr(ctx context.Context, file contextFile) (p9.QID, p9.AttrMask, p9.Attr, error) { + // Retrieve attributes over the wire. + qid, valid, attr, err := file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + return qid, valid, attr, err + } + + // Require mode, size, and raw device id. + if !valid.Mode || !valid.Size || !valid.RDev { + return qid, valid, attr, syscall.EIO + } + + return qid, valid, attr, nil +} + +func unstable(ctx context.Context, valid p9.AttrMask, pattr p9.Attr, mounter fs.FileOwner, client *p9.Client) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: int64(pattr.Size), + Usage: int64(pattr.Size), + Perms: perms(valid, pattr, client), + Owner: owner(mounter, valid, pattr), + AccessTime: atime(ctx, valid, pattr), + ModificationTime: mtime(ctx, valid, pattr), + StatusChangeTime: ctime(ctx, valid, pattr), + Links: links(valid, pattr), + } +} + +func perms(valid p9.AttrMask, pattr p9.Attr, client *p9.Client) fs.FilePermissions { + if pattr.Mode.IsDir() && !p9.VersionSupportsMultiUser(client.Version()) { + // If user and group permissions bits are not supplied, use + // "other" bits to supplement them. + // + // Older Gofer's fake directories only have "other" permission, + // but will often be accessed via user or group permissions. + if pattr.Mode&0770 == 0 { + other := pattr.Mode & 07 + pattr.Mode = pattr.Mode | other<<3 | other<<6 + } + } + return fs.FilePermsFromP9(pattr.Mode) +} + +func owner(mounter fs.FileOwner, valid p9.AttrMask, pattr p9.Attr) fs.FileOwner { + // Unless the file returned its UID and GID, it belongs to the mounting + // task's EUID/EGID. + owner := mounter + if valid.UID { + owner.UID = auth.KUID(pattr.UID) + } + if valid.GID { + owner.GID = auth.KGID(pattr.GID) + } + return owner +} + +// bsize returns a block size from 9p attributes. 
+func bsize(pattr p9.Attr) int64 { + if pattr.BlockSize > 0 { + return int64(pattr.BlockSize) + } + // Some files may have no clue of their block size. Better not to report + // something misleading or buggy and have a safe default. + return usermem.PageSize +} + +// ntype returns an fs.InodeType from 9p attributes. +func ntype(pattr p9.Attr) fs.InodeType { + switch { + case pattr.Mode.IsNamedPipe(): + return fs.Pipe + case pattr.Mode.IsDir(): + return fs.Directory + case pattr.Mode.IsSymlink(): + return fs.Symlink + case pattr.Mode.IsCharacterDevice(): + return fs.CharacterDevice + case pattr.Mode.IsBlockDevice(): + return fs.BlockDevice + case pattr.Mode.IsSocket(): + return fs.Socket + case pattr.Mode.IsRegular(): + fallthrough + default: + return fs.RegularFile + } +} + +// ctime returns a change time from 9p attributes. +func ctime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.CTime { + return ktime.FromUnix(int64(pattr.CTimeSeconds), int64(pattr.CTimeNanoSeconds)) + } + // Approximate ctime with mtime if ctime isn't available. + return mtime(ctx, valid, pattr) +} + +// atime returns an access time from 9p attributes. +func atime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.ATime { + return ktime.FromUnix(int64(pattr.ATimeSeconds), int64(pattr.ATimeNanoSeconds)) + } + return ktime.NowFromContext(ctx) +} + +// mtime returns a modification time from 9p attributes. +func mtime(ctx context.Context, valid p9.AttrMask, pattr p9.Attr) ktime.Time { + if valid.MTime { + return ktime.FromUnix(int64(pattr.MTimeSeconds), int64(pattr.MTimeNanoSeconds)) + } + return ktime.NowFromContext(ctx) +} + +// links returns a hard link count from 9p attributes. +func links(valid p9.AttrMask, pattr p9.Attr) uint64 { + // For gofer file systems that support link count (such as a local file gofer), + // we return the link count reported by the underlying file system. + if valid.NLink { + return pattr.NLink + } + + // This node is likely backed by a file system that doesn't support links. + // We could readdir() and count children directories to provide an accurate + // link count. However this may be expensive since the gofer may be backed by remote + // storage. Instead, simply return 2 links for directories and 1 for everything else + // since no one relies on an accurate link count for gofer-based file systems. + switch ntype(pattr) { + case fs.Directory: + return 2 + default: + return 1 + } +} diff --git a/pkg/sentry/fs/gofer/context_file.go b/pkg/sentry/fs/gofer/context_file.go new file mode 100644 index 000000000..d4b6f6eb7 --- /dev/null +++ b/pkg/sentry/fs/gofer/context_file.go @@ -0,0 +1,190 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// contextFile is a wrapper around p9.File that notifies the context that +// it's about to sleep before calling the Gofer over P9. +type contextFile struct { + file p9.File +} + +func (c *contextFile) walk(ctx context.Context, names []string) ([]p9.QID, contextFile, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + q, f, err := c.file.Walk(names) + if err != nil { + return nil, contextFile{}, err + } + return q, contextFile{file: f}, nil +} + +func (c *contextFile) statFS(ctx context.Context) (p9.FSStat, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.StatFS() +} + +func (c *contextFile) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.GetAttr(req) +} + +func (c *contextFile) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.SetAttr(valid, attr) +} + +func (c *contextFile) remove(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Remove() +} + +func (c *contextFile) rename(ctx context.Context, directory contextFile, name string) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Rename(directory.file, name) +} + +func (c *contextFile) close(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Close() +} + +func (c *contextFile) open(ctx context.Context, mode p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Open(mode) +} + +func (c *contextFile) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.ReadAt(p, offset) +} + +func (c *contextFile) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.WriteAt(p, offset) +} + +func (c *contextFile) fsync(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.FSync() +} + +func (c *contextFile) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + fd, _, _, _, err := c.file.Create(name, flags, permissions, uid, gid) + return fd, err +} + +func (c *contextFile) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Mkdir(name, permissions, uid, gid) +} + +func (c *contextFile) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + 
return c.file.Symlink(oldName, newName, uid, gid) +} + +func (c *contextFile) link(ctx context.Context, target *contextFile, newName string) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Link(target.file, newName) +} + +func (c *contextFile) mknod(ctx context.Context, name string, permissions p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Mknod(name, permissions, major, minor, uid, gid) +} + +func (c *contextFile) unlinkAt(ctx context.Context, name string, flags uint32) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.UnlinkAt(name, flags) +} + +func (c *contextFile) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Readdir(offset, count) +} + +func (c *contextFile) readlink(ctx context.Context) (string, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Readlink() +} + +func (c *contextFile) flush(ctx context.Context) error { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Flush() +} + +func (c *contextFile) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, contextFile, p9.AttrMask, p9.Attr, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + q, f, m, a, err := c.file.WalkGetAttr(names) + if err != nil { + return nil, contextFile{}, p9.AttrMask{}, p9.Attr{}, err + } + return q, contextFile{file: f}, m, a, nil +} + +func (c *contextFile) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { + ctx.UninterruptibleSleepStart(false) + defer ctx.UninterruptibleSleepFinish(false) + + return c.file.Connect(flags) +} diff --git a/pkg/sentry/fs/gofer/device.go b/pkg/sentry/fs/gofer/device.go new file mode 100644 index 000000000..fac7306d4 --- /dev/null +++ b/pkg/sentry/fs/gofer/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// goferDevice is the gofer virtual device. +var goferDevice = device.NewAnonMultiDevice() diff --git a/pkg/sentry/fs/gofer/file.go b/pkg/sentry/fs/gofer/file.go new file mode 100644 index 000000000..07c9bf01d --- /dev/null +++ b/pkg/sentry/fs/gofer/file.go @@ -0,0 +1,255 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/metric" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +var openedWX = metric.MustCreateNewUint64Metric("/gofer/opened_write_execute_file", true /* sync */, "Number of times a writable+executable file was opened from a gofer.") + +// fileOperations implements fs.FileOperations for a remote file system. +type fileOperations struct { + fsutil.NoIoctl `state:"nosave"` + waiter.AlwaysReady `state:"nosave"` + + // inodeOperations is the inodeOperations backing the file. It is protected + // by a reference held by File.Dirent.Inode which is stable until + // FileOperations.Release is called. + inodeOperations *inodeOperations `state:"wait"` + + // dirCursor is the directory cursor. + dirCursor string + + // handles are the opened remote file system handles, which may + // be shared with other files. + handles *handles `state:"nosave"` + + // flags are the flags used to open handles. + flags fs.FileFlags `state:"wait"` +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile returns a file. NewFile is not appropriate with host pipes and sockets. +func NewFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, i *inodeOperations, handles *handles) *fs.File { + // Remote file systems enforce readability/writability at an offset, + // see fs/9p/vfs_inode.c:v9fs_vfs_atomic_open -> fs/open.c:finish_open. + flags.Pread = true + flags.Pwrite = true + + f := &fileOperations{ + inodeOperations: i, + handles: handles, + flags: flags, + } + if flags.Write { + if err := dirent.Inode.CheckPermission(ctx, fs.PermMask{Execute: true}); err == nil { + name, _ := dirent.FullName(fs.RootFromContext(ctx)) + openedWX.Increment() + log.Warningf("Opened a writable executable: %q", name) + } + } + return fs.NewFile(ctx, dirent, flags, f) +} + +// Release implements fs.FileOpeations.Release. +func (f *fileOperations) Release() { + f.handles.DecRef() +} + +// Readdir implements fs.FileOperations.Readdir. +func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + n, err := fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) + if f.inodeOperations.session().cachePolicy != cacheNone { + f.inodeOperations.cachingInodeOps.TouchAccessTime(ctx, file.Dirent.Inode) + } + return n, err +} + +// IterateDir implements fs.DirIterator.IterateDir. 
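+//
+// Entries are served from readdirCache when possible; with cache=none, or when
+// the cache has not been built yet, it is (re)populated by paging through the
+// gofer with readdirAll before anything is serialized.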
+func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + f.inodeOperations.readdirMu.Lock() + defer f.inodeOperations.readdirMu.Unlock() + + // Fetch directory entries if needed. + if f.inodeOperations.readdirCache == nil || f.inodeOperations.session().cachePolicy == cacheNone { + entries, err := f.readdirAll(ctx) + if err != nil { + return offset, err + } + + // Cache the readdir result. + f.inodeOperations.readdirCache = fs.NewSortedDentryMap(entries) + } + + // Serialize the entries. + n, err := fs.GenericReaddir(dirCtx, f.inodeOperations.readdirCache) + return offset + n, err +} + +// readdirAll fetches fs.DentAttrs for f, using the attributes of g. +func (f *fileOperations) readdirAll(ctx context.Context) (map[string]fs.DentAttr, error) { + entries := make(map[string]fs.DentAttr) + var readOffset uint64 + for { + // We choose some arbitrary high number of directory entries (64k) and call + // Readdir until we've exhausted them all. + dirents, err := f.handles.File.readdir(ctx, readOffset, 64*1024) + if err != nil { + return nil, err + } + if len(dirents) == 0 { + // We're done, we reached EOF. + break + } + + // The last dirent contains the offset into the next set of dirents. The gofer + // returns the offset as an index into directories, not as a byte offset, because + // converting a byte offset to an index into directories entries is a huge pain. + // But everything is fine if we're consistent. + readOffset = dirents[len(dirents)-1].Offset + + for _, dirent := range dirents { + if dirent.Name == "." || dirent.Name == ".." { + // These must not be included in Readdir results. + continue + } + + // Find a best approximation of the type. + var nt fs.InodeType + switch dirent.Type { + case p9.TypeDir: + nt = fs.Directory + case p9.TypeSymlink: + nt = fs.Symlink + default: + nt = fs.RegularFile + } + + // Install the DentAttr. + entries[dirent.Name] = fs.DentAttr{ + Type: nt, + // Construct the key to find the virtual inode. + // Directory entries reside on the same Device + // and SecondaryDevice as their parent. + InodeID: goferDevice.Map(device.MultiDeviceKey{ + Device: f.inodeOperations.fileState.key.Device, + SecondaryDevice: f.inodeOperations.fileState.key.SecondaryDevice, + Inode: dirent.QID.Path, + }), + } + } + } + + return entries, nil +} + +// Write implements fs.FileOperations.Write. +func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + if fs.IsDir(file.Dirent.Inode.StableAttr) { + // Not all remote file systems enforce this so this client does. + return 0, syserror.EISDIR + } + + // Do cached IO for regular files only. Some character devices expect no caching. + isFile := fs.IsFile(file.Dirent.Inode.StableAttr) + if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { + return src.CopyInTo(ctx, f.handles.readWriterAt(ctx, offset)) + } + return f.inodeOperations.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + if fs.IsDir(file.Dirent.Inode.StableAttr) { + // Not all remote file systems enforce this so this client does. + return 0, syserror.EISDIR + } + + // Do cached IO for regular files only. Some character devices expect no caching. 
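+ // With cache=none, or for non-regular files, the read below goes straight to
+ // the gofer through a handleReadWriter; otherwise it is satisfied from the
+ // page cache via cachingInodeOps.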
+ isFile := fs.IsFile(file.Dirent.Inode.StableAttr) + if f.inodeOperations.session().cachePolicy == cacheNone || !isFile { + return dst.CopyOutFrom(ctx, f.handles.readWriterAt(ctx, offset)) + } + return f.inodeOperations.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. +func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + // Sync remote caches. + if f.handles.Host != nil { + // Sync the host fd directly. + return syscall.Fsync(f.handles.Host.FD()) + } + // Otherwise sync on the p9.File handle. + return f.handles.File.fsync(ctx) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(ctx context.Context, file *fs.File) error { + // If this file is not opened writable then there is nothing to flush. + // We do this because some p9 server implementations of Flush are + // over-zealous. + // + // FIXME: weaken these implementations and remove this check. + if !file.Flags().Write { + return nil + } + // Execute the flush. + return f.handles.File.flush(ctx) +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !isFileCachable(f.inodeOperations.session(), file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.inodeOperations.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. +func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} diff --git a/pkg/sentry/fs/gofer/file_state.go b/pkg/sentry/fs/gofer/file_state.go new file mode 100644 index 000000000..1d63e33ec --- /dev/null +++ b/pkg/sentry/fs/gofer/file_state.go @@ -0,0 +1,37 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// afterLoad is invoked by stateify. +func (f *fileOperations) afterLoad() { + load := func() { + f.inodeOperations.fileState.waitForLoad() + + // Manually load the open handles. + var err error + // TODO: Context is not plumbed to save/restore. 
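+ // The handles are reopened with the same flags the file carried before save;
+ // if the gofer cannot provide them again the cached state can no longer be
+ // kept consistent, so failure here is fatal.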
+ f.handles, err = newHandles(context.Background(), f.inodeOperations.fileState.file, f.flags) + if err != nil { + panic("failed to re-open handle: " + err.Error()) + } + f.inodeOperations.fileState.setHandlesForCachedIO(f.flags, f.handles) + } + fs.Async(load) +} diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go new file mode 100644 index 000000000..0a1a49bbd --- /dev/null +++ b/pkg/sentry/fs/gofer/fs.go @@ -0,0 +1,252 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package gofer implements a remote 9p filesystem. +package gofer + +import ( + "errors" + "fmt" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// The following are options defined by the Linux 9p client that we support, +// see Documentation/filesystems/9p.txt. +const ( + // The transport method. + transportKey = "trans" + + // The file tree to access when the file server + // is exporting several file systems. Stands for "attach name". + anameKey = "aname" + + // The caching policy. + cacheKey = "cache" + + // The file descriptor for reading with trans=fd. + readFDKey = "rfdno" + + // The file descriptor for writing with trans=fd. + writeFDKey = "wfdno" + + // The number of bytes to use for a 9p packet payload. + msizeKey = "msize" + + // The 9p protocol version. + versionKey = "version" + + // If set to true allows the creation of unix domain sockets inside the + // sandbox using files backed by the gofer. If set to false, unix sockets + // cannot be bound to gofer files without an overlay on top. + privateUnixSocketKey = "privateunixsocket" +) + +// cachePolicy is a 9p cache policy. +type cachePolicy string + +const ( + // Use virtual file system cache. + cacheAll cachePolicy = "fscache" + + // TODO: fully support cache=none. + cacheNone cachePolicy = "none" + + // defaultCache is cacheAll. Note this diverges from the 9p Linux + // client whose default is "none". See TODO above. + defaultCache = cacheAll +) + +// defaultAname is the default attach name. +const defaultAname = "/" + +// defaultMSize is the message size used for chunking large read and write requests. +// This has been tested to give good enough performance up to 64M. +const defaultMSize = 1024 * 1024 // 1M + +// defaultVersion is the default 9p protocol version. Will negotiate downwards with +// file server if needed. +var defaultVersion = p9.HighestVersionString() + +// Number of names of non-children to cache, preventing unneeded walks. 64 is +// plenty for nodejs, which seems to stat about 4 children on every require(). +const nonChildrenCacheSize = 64 + +var ( + // ErrNoTransport is returned when there is no 'trans' option. + ErrNoTransport = errors.New("missing required option: 'trans='") + + // ErrNoReadFD is returned when there is no 'rfdno' option. 
+ ErrNoReadFD = errors.New("missing required option: 'rfdno='") + + // ErrNoWriteFD is returned when there is no 'wfdno' option. + ErrNoWriteFD = errors.New("missing required option: 'wfdno='") +) + +// filesystem is a 9p client. +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name under which the filesystem is registered. +// The name matches fs/9p/vfs_super.c:v9fs_fs_type.name. +const FilesystemName = "9p" + +// Name is the name of the filesystem. +func (*filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*filesystem) AllowUserMount() bool { + return false +} + +// Flags returns that there is nothing special about this file system. +// +// The 9p Linux client returns FS_RENAME_DOES_D_MOVE, see fs/9p/vfs_super.c. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns an attached 9p client that can be positioned in the vfs. +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // Parse and validate the mount options. + o, err := options(data) + if err != nil { + return nil, err + } + + // Construct the 9p root to mount. We intentionally diverge from Linux in that + // the first Tversion and Tattach requests are done lazily. + return Root(ctx, device, f, flags, o) +} + +// opts are parsed 9p mount options. +type opts struct { + fd int + aname string + policy cachePolicy + msize uint32 + version string + privateunixsocket bool +} + +// options parses mount(2) data into structured options. +func options(data string) (opts, error) { + var o opts + + // Parse generic comma-separated key=value options, this file system expects them. + options := fs.GenericMountSourceOptions(data) + + // Check for the required 'trans=fd' option. + trans, ok := options[transportKey] + if !ok { + return o, ErrNoTransport + } + if trans != "fd" { + return o, fmt.Errorf("unsupported transport: 'trans=%s'", trans) + } + delete(options, transportKey) + + // Check for the required 'rfdno=' option. + srfd, ok := options[readFDKey] + if !ok { + return o, ErrNoReadFD + } + delete(options, readFDKey) + + // Check for the required 'wfdno=' option. + swfd, ok := options[writeFDKey] + if !ok { + return o, ErrNoWriteFD + } + delete(options, writeFDKey) + + // Parse the read fd. + rfd, err := strconv.Atoi(srfd) + if err != nil { + return o, fmt.Errorf("invalid fd for 'rfdno=%s': %v", srfd, err) + } + + // Parse the write fd. + wfd, err := strconv.Atoi(swfd) + if err != nil { + return o, fmt.Errorf("invalid fd for 'wfdno=%s': %v", swfd, err) + } + + // Require that the read and write fd are the same. + if rfd != wfd { + return o, fmt.Errorf("fd in 'rfdno=%d' and 'wfdno=%d' must match", rfd, wfd) + } + o.fd = rfd + + // Parse the attach name. + o.aname = defaultAname + if an, ok := options[anameKey]; ok { + o.aname = an + delete(options, anameKey) + } + + // Parse the cache policy. Reject unsupported policies. + o.policy = cacheAll + if cp, ok := options[cacheKey]; ok { + if cachePolicy(cp) != cacheAll && cachePolicy(cp) != cacheNone { + return o, fmt.Errorf("unsupported cache mode: 'cache=%s'", cp) + } + o.policy = cachePolicy(cp) + delete(options, cacheKey) + } + + // Parse the message size. Reject malformed options. 
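+ // For reference, a typical data string accepted by this parser looks like
+ // (illustrative values only):
+ //
+ //   trans=fd,rfdno=7,wfdno=7,aname=/,cache=fscache,msize=1048576,version=9P2000.L
+ //
+ // The message size below falls back to defaultMSize when 'msize=' is absent.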
+ o.msize = uint32(defaultMSize) + if m, ok := options[msizeKey]; ok { + i, err := strconv.ParseUint(m, 10, 32) + if err != nil { + return o, fmt.Errorf("invalid message size for 'msize=%s': %v", m, err) + } + o.msize = uint32(i) + delete(options, msizeKey) + } + + // Parse the protocol version. + o.version = defaultVersion + if v, ok := options[versionKey]; ok { + o.version = v + delete(options, versionKey) + } + + // Parse the unix socket policy. Reject non-booleans. + if v, ok := options[privateUnixSocketKey]; ok { + b, err := strconv.ParseBool(v) + if err != nil { + return o, fmt.Errorf("invalid boolean value for '%s=%s': %v", privateUnixSocketKey, v, err) + } + o.privateunixsocket = b + delete(options, privateUnixSocketKey) + } + + // Fail to attach if the caller wanted us to do something that we + // don't support. + if len(options) > 0 { + return o, fmt.Errorf("unsupported mount options: %v", options) + } + + return o, nil +} diff --git a/pkg/sentry/fs/gofer/gofer_test.go b/pkg/sentry/fs/gofer/gofer_test.go new file mode 100644 index 000000000..58a2e2ef5 --- /dev/null +++ b/pkg/sentry/fs/gofer/gofer_test.go @@ -0,0 +1,776 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "errors" + "fmt" + "io" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/p9/p9test" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// A errMock is an error that comes from bad usage of the mock. +var errMock = errors.New("mock error") + +// goodMockFile returns a file that can be Walk'ed to and created. +func goodMockFile(mode p9.FileMode, size uint64) *p9test.FileMock { + return &p9test.FileMock{ + GetAttrMock: p9test.GetAttrMock{ + Valid: p9.AttrMask{Mode: true, Size: true, RDev: true}, + Attr: p9.Attr{Mode: mode, Size: size, RDev: 0}, + }, + } +} + +func newClosedSocket() (*unet.Socket, error) { + fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + return nil, err + } + + s, err := unet.NewSocket(fd) + if err != nil { + syscall.Close(fd) + return nil, err + } + + return s, s.Close() +} + +// root returns a p9 file mock and an fs.InodeOperations created from that file. Any +// functions performed on fs.InodeOperations will use the p9 file mock. +func root(ctx context.Context, mode p9.FileMode, size uint64) (*p9test.FileMock, *fs.Inode, error) { + sock, err := newClosedSocket() + if err != nil { + return nil, nil, err + } + + // Construct a dummy session that we can destruct. 
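+ // conn is a socket that has already been closed, so nothing ever goes over
+ // the wire; cacheNone keeps every operation pointed at the p9 mocks rather
+ // than at cached state.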
+ s := &session{ + conn: sock, + mounter: fs.RootOwner, + cachePolicy: cacheNone, + } + + rootFile := goodMockFile(mode, size) + sattr, rootInodeOperations := newInodeOperations(ctx, s, contextFile{file: rootFile}, p9.QID{}, rootFile.GetAttrMock.Valid, rootFile.GetAttrMock.Attr) + m := fs.NewMountSource(s, &filesystem{}, fs.MountSourceFlags{}) + return rootFile, fs.NewInode(rootInodeOperations, m, sattr), nil +} + +func TestLookup(t *testing.T) { + // Test parameters. + type lookupTest struct { + // Name of the test. + name string + + // Function input parameters. + fileName string + + // Expected return value. + want error + } + + tests := []lookupTest{ + { + name: "mock Walk passes (function succeeds)", + fileName: "ppp", + want: nil, + }, + { + name: "mock Walk fails (function fails)", + fileName: "ppp", + want: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestWalk %s failed: root error got %v, want nil", test.name, err) + } + + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.Err = test.want + rootFile.WalkGetAttrMock.File = goodMockFile(p9.PermissionsMask, 0) + + // Call function. + dirent, err := rootInode.Lookup(ctx, test.fileName) + + // Unwrap the InodeOperations. + var newInodeOperations fs.InodeOperations + if dirent != nil { + if dirent.IsNegative() { + err = syscall.ENOENT + } else { + newInodeOperations = dirent.Inode.InodeOperations + } + } + + // Check return values. + if err != test.want { + t.Errorf("TestWalk %s failed: got %v, want %v", test.name, err, test.want) + } + if err == nil && newInodeOperations == nil { + t.Errorf("TestWalk %s failed: expected either non-nil err or non-nil node, but both are nil", test.name) + } + + // Check mock parameters. + if !rootFile.WalkGetAttrMock.Called { + t.Errorf("TestWalk %s failed: GetAttr not called; error: %v", test.name, err) + } else if rootFile.WalkGetAttrMock.Names[0] != test.fileName { + t.Errorf("TestWalk %s failed: file name not set", test.name) + } + } +} + +func TestSetTimestamps(t *testing.T) { + // Test parameters. + type setTimestampsTest struct { + // Name of the test. + name string + + // Function input parameters. + ts fs.TimeSpec + } + + ctx := contexttest.Context(t) + now := ktime.NowFromContext(ctx) + tests := []setTimestampsTest{ + { + name: "mock SetAttr passes (function succeeds)", + ts: fs.TimeSpec{ + ATime: now, + MTime: now, + }, + }, + { + name: "mock SetAttr passes, times are 0 (function succeeds)", + ts: fs.TimeSpec{}, + }, + { + name: "mock SetAttr passes, times are 0 and not system time (function succeeds)", + ts: fs.TimeSpec{ + ATimeSetSystemTime: false, + MTimeSetSystemTime: false, + }, + }, + { + name: "mock SetAttr passes, times are set to system time (function succeeds)", + ts: fs.TimeSpec{ + ATimeSetSystemTime: true, + MTimeSetSystemTime: true, + }, + }, + { + name: "mock SetAttr passes, times are omitted (function succeeds)", + ts: fs.TimeSpec{ + ATimeOmit: true, + MTimeOmit: true, + }, + }, + } + + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestSetTimestamps %s failed: root error got %v, want nil", test.name, err) + } + + // Call function. + err = rootInode.SetTimestamps(ctx, nil /* Dirent */, test.ts) + + // Check return values. 
+ if err != nil { + t.Errorf("TestSetTimestamps %s failed: got %v, want nil", test.name, err) + } + + // Check mock parameters. + if !(test.ts.ATimeOmit && test.ts.MTimeOmit) && !rootFile.SetAttrMock.Called { + t.Errorf("TestSetTimestamps %s failed: SetAttr not called", test.name) + continue + } + + // Check what was passed to the mock function. + attr := rootFile.SetAttrMock.Attr + atimeGiven := ktime.FromUnix(int64(attr.ATimeSeconds), int64(attr.ATimeNanoSeconds)) + if test.ts.ATimeOmit { + if rootFile.SetAttrMock.Valid.ATime { + t.Errorf("TestSetTimestamps %s failed: ATime got set true in mask, wanted false", test.name) + } + } else { + if got, want := rootFile.SetAttrMock.Valid.ATimeNotSystemTime, !test.ts.ATimeSetSystemTime; got != want { + t.Errorf("TestSetTimestamps %s failed: got ATimeNotSystemTime %v, want %v", test.name, got, want) + } + if !test.ts.ATimeSetSystemTime && !test.ts.ATime.Equal(atimeGiven) { + t.Errorf("TestSetTimestamps %s failed: ATime got %v, want %v", test.name, atimeGiven, test.ts.ATime) + } + } + + mtimeGiven := ktime.FromUnix(int64(attr.MTimeSeconds), int64(attr.MTimeNanoSeconds)) + if test.ts.MTimeOmit { + if rootFile.SetAttrMock.Valid.MTime { + t.Errorf("TestSetTimestamps %s failed: MTime got set true in mask, wanted false", test.name) + } + } else { + if got, want := rootFile.SetAttrMock.Valid.MTimeNotSystemTime, !test.ts.MTimeSetSystemTime; got != want { + t.Errorf("TestSetTimestamps %s failed: got MTimeNotSystemTime %v, want %v", test.name, got, want) + } + if !test.ts.MTimeSetSystemTime && !test.ts.MTime.Equal(mtimeGiven) { + t.Errorf("TestSetTimestamps %s failed: MTime got %v, want %v", test.name, mtimeGiven, test.ts.MTime) + } + } + + } +} + +func TestSetPermissions(t *testing.T) { + // Test parameters. + type setPermissionsTest struct { + // Name of the test. + name string + + // SetPermissions input parameters. + perms fs.FilePermissions + + // Error that SetAttr mock should return. + setAttrErr error + + // Expected return value. + want bool + } + + tests := []setPermissionsTest{ + { + name: "SetAttr mock succeeds (function succeeds)", + perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true, Execute: true}}, + want: true, + setAttrErr: nil, + }, + { + name: "SetAttr mock fails (function fails)", + perms: fs.FilePermissions{User: fs.PermMask{Read: true, Write: true}}, + want: false, + setAttrErr: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, 0, 0) + if err != nil { + t.Errorf("TestSetPermissions %s failed: root error got %v, want nil", test.name, err) + } + rootFile.SetAttrMock.Err = test.setAttrErr + + ok := rootInode.SetPermissions(ctx, nil /* Dirent */, test.perms) + + // Check return value. + if ok != test.want { + t.Errorf("TestSetPermissions %s failed: got %v, want %v", test.name, ok, test.want) + } + + // Check mock parameters. 
+ pattr := rootFile.SetAttrMock.Attr + if !rootFile.SetAttrMock.Called { + t.Errorf("TestSetPermissions %s failed: SetAttr not called", test.name) + continue + } + if !rootFile.SetAttrMock.Valid.Permissions { + t.Errorf("TestSetPermissions %s failed: SetAttr did not get right request (got false, expected SetAttrMask.Permissions true)", + test.name) + } + if got := fs.FilePermsFromP9(pattr.Permissions); got != test.perms { + t.Errorf("TestSetPermissions %s failed: SetAttr did not get right permissions -- got %v, want %v", + test.name, got, test.perms) + } + } +} + +func TestClose(t *testing.T) { + ctx := contexttest.Context(t) + // Set up mock. + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestClose failed: root error got %v, want nil", err) + } + + // Call function. + rootInode.InodeOperations.Release(ctx) + + // Check mock parameters. + if !rootFile.CloseMock.Called { + t.Errorf("TestClose failed: Close not called") + } +} + +func TestRename(t *testing.T) { + // Test parameters. + type renameTest struct { + // Name of the test. + name string + + // Input parameters. + newParent *fs.Inode + newName string + + // Rename mock parameters. + renameErr error + renameCalled bool + + // Error want to return given the parameters. (Same as what + // we expect and tell rename to return.) + want error + } + ctx := contexttest.Context(t) + rootFile, rootInode, err := root(ctx, p9.PermissionsMask, 0) + if err != nil { + t.Errorf("TestRename failed: root error got %v, want nil", err) + } + + tests := []renameTest{ + { + name: "mock Rename succeeds (function succeeds)", + newParent: rootInode, + newName: "foo2", + want: nil, + renameErr: nil, + renameCalled: true, + }, + { + name: "mock Rename fails (function fails)", + newParent: rootInode, + newName: "foo2", + want: syscall.ENOENT, + renameErr: syscall.ENOENT, + renameCalled: true, + }, + { + name: "newParent is not inodeOperations but should be (function fails)", + newParent: fs.NewMockInode(ctx, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}), + newName: "foo2", + want: syscall.EXDEV, + renameErr: nil, + renameCalled: false, + }, + } + + for _, test := range tests { + mockFile := goodMockFile(p9.PermissionsMask, 0) + rootFile.WalkGetAttrMock.QIDs = []p9.QID{{}} + rootFile.WalkGetAttrMock.File = mockFile + + dirent, err := rootInode.Lookup(ctx, "foo") + if err != nil { + t.Fatalf("root.Walk failed: %v", err) + } + mockFile.RenameMock.Err = test.renameErr + mockFile.RenameMock.Called = false + + // Use a dummy oldParent to acquire write access to that directory. + oldParent := &inodeOperations{ + readdirCache: fs.NewSortedDentryMap(nil), + } + oldInode := fs.NewInode(oldParent, fs.NewMockMountSource(nil), fs.StableAttr{Type: fs.Directory}) + + // Call function. + err = dirent.Inode.InodeOperations.Rename(ctx, oldInode, "", test.newParent, test.newName) + + // Check return value. + if err != test.want { + t.Errorf("TestRename %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if got, want := mockFile.RenameMock.Called, test.renameCalled; got != want { + t.Errorf("TestRename %s failed: renameCalled got %v want %v", test.name, got, want) + } + } +} + +// This file is read from in TestPreadv. +type readAtFileFake struct { + p9test.FileMock + + // Parameters for faking ReadAt. 
+ FileLength int + Err error + ChunkSize int + Called bool + LengthRead int +} + +func (r *readAtFileFake) ReadAt(p []byte, offset uint64) (int, error) { + r.Called = true + log.Warningf("ReadAt fake: length read so far = %d, len(p) = %d, offset = %d", r.LengthRead, len(p), offset) + if int(offset) != r.LengthRead { + return 0, fmt.Errorf("offset got %d; expected %d", offset, r.LengthRead) + } + + if r.Err != nil { + return 0, r.Err + } + + if r.LengthRead >= r.FileLength { + return 0, io.EOF + } + + // Read at most ChunkSize and read at most what's left in the file. + toBeRead := len(p) + if r.LengthRead+toBeRead >= r.FileLength { + toBeRead = r.FileLength - int(offset) + } + if toBeRead > r.ChunkSize { + toBeRead = r.ChunkSize + } + + r.LengthRead += toBeRead + if r.LengthRead == r.FileLength { + return toBeRead, io.EOF + } + return toBeRead, nil +} + +func TestPreadv(t *testing.T) { + // Test parameters. + type preadvTest struct { + // Name of the test. + name string + + // Mock parameters + mode p9.FileMode + + // Buffer to read into. + buffer [512]byte + sliceSize int + + // How much readAt returns at a time. + chunkSize int + + // Whether or not we expect ReadAt to be called. + readAtCalled bool + readAtErr error + + // Expected return values. + want error + } + + tests := []preadvTest{ + { + name: "fake ReadAt succeeds, 512 bytes requested, 512 byte chunks (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + { + name: "fake ReadAt succeeds, 512 bytes requested, 200 byte chunks (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 200, + }, + { + name: "fake ReadAt succeeds, 0 bytes requested (function succeeds)", + want: nil, + readAtErr: nil, + mode: p9.PermissionsMask, + readAtCalled: false, + sliceSize: 0, + chunkSize: 100, + }, + { + name: "fake ReadAt returns 0 bytes and EOF (function fails)", + want: io.EOF, + readAtErr: io.EOF, + mode: p9.PermissionsMask, + readAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, test.mode, 1024) + if err != nil { + t.Errorf("TestPreadv %s failed: root error got %v, want nil", test.name, err) + } + + // Set up the read buffer. + dst := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This file will be read from. + openFile := &readAtFileFake{ + Err: test.readAtErr, + FileLength: test.sliceSize, + ChunkSize: test.chunkSize, + } + rootFile.WalkGetAttrMock.File = openFile + rootFile.WalkGetAttrMock.Attr.Mode = test.mode + rootFile.WalkGetAttrMock.Valid.Mode = true + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + fs.FileFlags{Read: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Preadv(ctx, dst, 0) + + // Check return value. + if err != test.want { + t.Errorf("TestPreadv %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if test.readAtCalled != openFile.Called { + t.Errorf("TestPreadv %s failed: ReadAt called: %v, but expected opposite", test.name, openFile.Called) + } + } +} + +func TestReadlink(t *testing.T) { + // Test parameters. + type readlinkTest struct { + // Name of the test. 
+ name string + + // Mock parameters + mode p9.FileMode + + // Whether or not we expect ReadAt to be called and what error + // it shall return. + readlinkCalled bool + readlinkErr error + + // Expected return values. + want error + } + + tests := []readlinkTest{ + { + name: "file is not symlink (function fails)", + want: syscall.ENOLINK, + mode: p9.PermissionsMask, + readlinkCalled: false, + readlinkErr: nil, + }, + { + name: "mock Readlink succeeds (function succeeds)", + want: nil, + mode: p9.PermissionsMask | p9.ModeSymlink, + readlinkCalled: true, + readlinkErr: nil, + }, + { + name: "mock Readlink fails (function fails)", + want: syscall.ENOENT, + mode: p9.PermissionsMask | p9.ModeSymlink, + readlinkCalled: true, + readlinkErr: syscall.ENOENT, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + rootFile, rootInode, err := root(ctx, test.mode, 0) + if err != nil { + t.Errorf("TestReadlink %s failed: root error got %v, want nil", test.name, err) + } + + openFile := goodMockFile(test.mode, 0) + rootFile.WalkMock.File = openFile + rootFile.ReadlinkMock.Err = test.readlinkErr + + // Call function. + _, err = rootInode.Readlink(ctx) + + // Check return value. + if err != test.want { + t.Errorf("TestReadlink %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if test.readlinkCalled && !rootFile.ReadlinkMock.Called { + t.Errorf("TestReadlink %s failed: Readlink not called", test.name) + } + } +} + +// This file is write from in TestPwritev. +type writeAtFileFake struct { + p9test.FileMock + + // Parameters for faking WriteAt. + Err error + ChunkSize int + Called bool + LengthWritten int +} + +func (r *writeAtFileFake) WriteAt(p []byte, offset uint64) (int, error) { + r.Called = true + log.Warningf("WriteAt fake: length written so far = %d, len(p) = %d, offset = %d", r.LengthWritten, len(p), offset) + if int(offset) != r.LengthWritten { + return 0, fmt.Errorf("offset got %d; want %d", offset, r.LengthWritten) + } + + if r.Err != nil { + return 0, r.Err + } + + // Write at most ChunkSize. + toBeWritten := len(p) + if toBeWritten > r.ChunkSize { + toBeWritten = r.ChunkSize + } + r.LengthWritten += toBeWritten + return toBeWritten, nil +} + +func TestPwritev(t *testing.T) { + // Test parameters. + type pwritevTest struct { + // Name of the test. + name string + + // Mock parameters + mode p9.FileMode + + allowWrite bool + + // Buffer to write into. + buffer [512]byte + sliceSize int + chunkSize int + + // Whether or not we expect writeAt to be called. + writeAtCalled bool + writeAtErr error + + // Expected return values. 
+ want error + } + + tests := []pwritevTest{ + { + name: "fake writeAt succeeds, one chunk (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + { + name: "fake writeAt fails, short write (function fails)", + want: io.ErrShortWrite, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 200, + }, + { + name: "fake writeAt succeeds, len 0 (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: true, + writeAtCalled: false, + sliceSize: 0, + chunkSize: 0, + }, + { + name: "writeAt can still write despite file permissions read only (function succeeds)", + want: nil, + writeAtErr: nil, + mode: p9.PermissionsMask, + allowWrite: false, + writeAtCalled: true, + sliceSize: 512, + chunkSize: 512, + }, + } + + ctx := contexttest.Context(t) + for _, test := range tests { + // Set up mock. + _, rootInode, err := root(ctx, test.mode, 0) + if err != nil { + t.Errorf("TestPwritev %s failed: root error got %v, want nil", test.name, err) + } + + src := usermem.BytesIOSequence(test.buffer[:test.sliceSize]) + + // This is the file that will be used for writing. + openFile := &writeAtFileFake{ + Err: test.writeAtErr, + ChunkSize: test.chunkSize, + } + + f := NewFile( + ctx, + fs.NewDirent(rootInode, ""), + fs.FileFlags{Write: true}, + rootInode.InodeOperations.(*inodeOperations), + &handles{File: contextFile{file: openFile}}, + ) + + // Call function. + _, err = f.Pwritev(ctx, src, 0) + + // Check return value. + if err != test.want { + t.Errorf("TestPwritev %s failed: got %v, want %v", test.name, err, test.want) + } + + // Check mock parameters. + if test.writeAtCalled != openFile.Called { + t.Errorf("TestPwritev %s failed: WriteAt called: %v, but expected opposite", test.name, openFile.Called) + continue + } + if openFile.Called && test.writeAtErr != nil && openFile.LengthWritten != test.sliceSize { + t.Errorf("TestPwritev %s failed: wrote %d bytes, expected %d bytes written", test.name, openFile.LengthWritten, test.sliceSize) + } + } +} diff --git a/pkg/sentry/fs/gofer/handles.go b/pkg/sentry/fs/gofer/handles.go new file mode 100644 index 000000000..a660c9230 --- /dev/null +++ b/pkg/sentry/fs/gofer/handles.go @@ -0,0 +1,144 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" +) + +// handles are the open handles of a gofer file. They are reference counted to +// support open handle sharing between files for read only filesystems. 
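+// A single handles instance can therefore back many fs.Files at once; it is
+// torn down only when the final reference is dropped (see DecRef below).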
+// +// If Host != nil then it will be used exclusively over File. +type handles struct { + refs.AtomicRefCount + + // File is a p9.File handle. Must not be nil. + File contextFile + + // Host is an *fd.FD handle. May be nil. + Host *fd.FD +} + +// DecRef drops a reference on handles. +func (h *handles) DecRef() { + h.DecRefWithDestructor(func() { + if h.Host != nil { + if err := h.Host.Close(); err != nil { + log.Warningf("error closing host file: %v", err) + } + } + // FIXME: Context is not plumbed here. + if err := h.File.close(context.Background()); err != nil { + log.Warningf("error closing p9 file: %v", err) + } + }) +} + +func newHandles(ctx context.Context, file contextFile, flags fs.FileFlags) (*handles, error) { + _, newFile, err := file.walk(ctx, nil) + if err != nil { + return nil, err + } + + switch { + case flags.Read && flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.ReadWrite) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + case flags.Read && !flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.ReadOnly) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + case !flags.Read && flags.Write: + hostFile, _, _, err := newFile.open(ctx, p9.WriteOnly) + if err != nil { + newFile.close(ctx) + return nil, err + } + h := &handles{ + File: newFile, + Host: hostFile, + } + return h, nil + default: + panic("impossible fs.FileFlags") + } +} + +type handleReadWriter struct { + ctx context.Context + h *handles + off int64 +} + +func (h *handles) readWriterAt(ctx context.Context, offset int64) *handleReadWriter { + return &handleReadWriter{ctx, h, offset} +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *handleReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + var r io.Reader + if rw.h.Host != nil { + r = secio.NewOffsetReader(rw.h.Host, rw.off) + } else { + r = &p9.ReadWriterFile{File: rw.h.File.file, Offset: uint64(rw.off)} + } + + rw.ctx.UninterruptibleSleepStart(false) + defer rw.ctx.UninterruptibleSleepFinish(false) + n, err := safemem.FromIOReader{r}.ReadToBlocks(dsts) + rw.off += int64(n) + return n, err +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *handleReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + var w io.Writer + if rw.h.Host != nil { + w = secio.NewOffsetWriter(rw.h.Host, rw.off) + } else { + w = &p9.ReadWriterFile{File: rw.h.File.file, Offset: uint64(rw.off)} + } + + rw.ctx.UninterruptibleSleepStart(false) + defer rw.ctx.UninterruptibleSleepFinish(false) + n, err := safemem.FromIOWriter{w}.WriteFromBlocks(srcs) + rw.off += int64(n) + return n, err +} diff --git a/pkg/sentry/fs/gofer/inode.go b/pkg/sentry/fs/gofer/inode.go new file mode 100644 index 000000000..454242923 --- /dev/null +++ b/pkg/sentry/fs/gofer/inode.go @@ -0,0 +1,554 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "errors" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations implements fs.InodeOperations. +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). + fileState *inodeFileState `state:"wait"` + + // cachingInodeOps implement memmap.Mappable for inodeOperations. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects readdirCache and concurrent Readdirs. + readdirMu sync.Mutex `state:"nosave"` + + // readdirCache is a cache of readdir results in the form of + // a fs.SortedDentryMap. + // + // Starts out as nil, and is initialized under readdirMu lazily; + // invalidating the cache means setting it to nil. + readdirCache *fs.SortedDentryMap `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +type inodeFileState struct { + // s is common file system state for Gofers. + s *session `state:"wait"` + + // MultiDeviceKey consists of: + // + // * Device: file system device from a specific gofer. + // * SecondaryDevice: unique identifier of the attach point. + // * Inode: the inode of this resource, unique per Device.= + // + // These fields combined enable consistent hashing of virtual inodes + // on goferDevice. + key device.MultiDeviceKey `state:"nosave"` + + // file is the p9 file that contains a single unopened fid. + file contextFile `state:"nosave"` + + // sattr caches the stable attributes. + sattr fs.StableAttr `state:"wait"` + + // handlesMu protects the below fields. + handlesMu sync.RWMutex `state:"nosave"` + + // Do minimal open handle caching: only for read only filesystems. + readonly *handles `state:"nosave"` + + // Maintain readthrough handles for populating page caches. + readthrough *handles `state:"nosave"` + + // Maintain writeback handles for syncing from page caches. + writeback *handles `state:"nosave"` + + // writebackRW indicates whether writeback is opened read-write. If + // it is not and a read-write handle could replace writeback (above), + // then writeback is replaced with the read-write handle. 
This + // ensures that files that were first opened write-only and then + // later are opened read-write to be mapped can in fact be mapped. + writebackRW bool + + // loading is acquired when the inodeFileState begins an asynchronous + // load. It releases when the load is complete. Callers that require all + // state to be available should call waitForLoad() to ensure that. + loading sync.Mutex `state:".(struct{})"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// Release releases file handles. +func (i *inodeFileState) Release(ctx context.Context) { + i.file.close(ctx) + if i.readonly != nil { + i.readonly.DecRef() + } + if i.readthrough != nil { + i.readthrough.DecRef() + } + if i.writeback != nil { + i.writeback.DecRef() + } +} + +// setHandlesForCachedIO installs file handles for reading and writing +// through fs.CachingInodeOperations. +func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) { + i.handlesMu.Lock() + defer i.handlesMu.Unlock() + + if flags.Read { + if i.readthrough == nil { + h.IncRef() + i.readthrough = h + } + } + if flags.Write { + if i.writeback == nil { + h.IncRef() + i.writeback = h + } else if !i.writebackRW && flags.Read { + i.writeback.DecRef() + h.IncRef() + i.writeback = h + } + if flags.Read { + i.writebackRW = true + } + } +} + +// getCachedHandles returns any cached handles which would accelerate +// performance generally. These handles should only be used if the mount +// supports caching. This is distinct from fs.CachingInodeOperations +// which is used for a limited set of file types (those that can be mapped). +func (i *inodeFileState) getCachedHandles(ctx context.Context, flags fs.FileFlags, msrc *fs.MountSource) (*handles, bool) { + i.handlesMu.Lock() + defer i.handlesMu.Unlock() + + if flags.Read && !flags.Write && msrc.Flags.ReadOnly { + if i.readonly != nil { + i.readonly.IncRef() + return i.readonly, true + } + h, err := newHandles(ctx, i.file, flags) + if err != nil { + return nil, false + } + i.readonly = h + i.readonly.IncRef() + return i.readonly, true + } + + return nil, false +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.readthrough.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + return i.writeback.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. 
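+//
+// Note that ATimeNotSystemTime and MTimeNotSystemTime are always set, so the
+// gofer applies exactly the timestamps computed here rather than its own
+// clock.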
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + as, ans := attr.AccessTime.Unix() + ms, mns := attr.ModificationTime.Unix() + // An update of status change time is implied by mask.AccessTime + // or mask.ModificationTime. Updating status change time to a + // time earlier than the system time is not possible. + return i.file.setAttr( + ctx, + p9.SetAttrMask{ + Permissions: mask.Perms, + Size: mask.Size, + UID: mask.UID, + GID: mask.GID, + ATime: mask.AccessTime, + ATimeNotSystemTime: true, + MTime: mask.ModificationTime, + MTimeNotSystemTime: true, + }, p9.SetAttr{ + Permissions: p9.FileMode(attr.Perms.LinuxMode()), + UID: p9.UID(attr.Owner.UID), + GID: p9.GID(attr.Owner.GID), + Size: uint64(attr.Size), + ATimeSeconds: uint64(as), + ATimeNanoSeconds: uint64(ans), + MTimeSeconds: uint64(ms), + MTimeNanoSeconds: uint64(mns), + }) +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + if i.writeback == nil { + return nil + } + return i.writeback.File.fsync(ctx) +} + +// FD implements fsutil.CachedFileObject.FD. +// +// FD meets the requirements of fsutil.CachedFileObject.FD because p9.File.Open +// returns a host file descriptor to back _both_ readthrough and writeback or +// not at all (e.g. both are nil). +func (i *inodeFileState) FD() int { + i.handlesMu.RLock() + defer i.handlesMu.RUnlock() + + // Assert that the file was actually opened. + if i.writeback == nil && i.readthrough == nil { + panic("cannot get host FD for a file that was never opened") + } + // If this file is mapped, then it must have been opened + // read-write and i.writeback was upgraded to a read-write + // handle. Prefer that to map. + if i.writeback != nil { + if i.writeback.Host == nil { + return -1 + } + return int(i.writeback.Host.FD()) + } + // Otherwise the file may only have been opened readable + // so far. That's the only way it can be accessed. + if i.readthrough.Host == nil { + return -1 + } + return int(i.readthrough.Host.FD()) +} + +// waitForLoad makes sure any restore-issued loading is done. +func (i *inodeFileState) waitForLoad() { + // This is not a no-op. The loading mutex is hold upon restore until + // all loading actions are done. + i.loading.Lock() + i.loading.Unlock() +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + _, valid, pattr, err := getattr(ctx, i.file) + if err != nil { + return fs.UnstableAttr{}, err + } + return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil +} + +// session extracts the gofer's session from the MountSource. +func (i *inodeOperations) session() *session { + return i.fileState.s +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(ctx context.Context) { + i.fileState.Release(ctx) + i.cachingInodeOps.Release() +} + +// Mappable implements fs.InodeOperations.Mappable. 
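+//
+// A nil return means the inode cannot be memory mapped; that is the case
+// whenever caching is disabled or the inode is not a regular file.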
+func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if i.session().cachePolicy == cacheNone || !fs.IsFile(inode.StableAttr) { + return nil + } + return i.cachingInodeOps +} + +func isCachable(session *session, inode *fs.Inode) bool { + return session.cachePolicy != cacheNone && (fs.IsFile(inode.StableAttr) || fs.IsDir(inode.StableAttr)) +} + +func isFileCachable(session *session, inode *fs.Inode) bool { + return session.cachePolicy != cacheNone && fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.UnstableAttr(ctx, inode) + } + return i.fileState.unstableAttr(ctx) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + switch d.Inode.StableAttr.Type { + case fs.Socket: + return i.getFileSocket(ctx, d, flags) + case fs.Pipe: + return i.getFilePipe(ctx, d, flags) + default: + return i.getFileDefault(ctx, d, flags) + } +} + +func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + f, err := i.fileState.file.connect(ctx, p9.AnonymousSocket) + if err != nil { + return nil, syscall.EIO + } + fsf, err := host.NewSocketWithDirent(ctx, d, f, flags) + if err != nil { + f.Close() + return nil, err + } + return fsf, nil +} + +func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + // Try to open as a host pipe. + if pipeOps, err := fdpipe.Open(ctx, i, flags); err != errNotHostFile { + return fs.NewFile(ctx, d, flags, pipeOps), err + } + + // If the error is due to the fact that this was never a host pipe, then back + // this file with its dirent. + h, err := newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + return NewFile(ctx, d, flags, i, h), nil +} + +// errNotHostFile indicates that the file is not a host file. +var errNotHostFile = errors.New("not a host file") + +// NonBlockingOpen implements fdpipe.NonBlockingOpener for opening host named pipes. +func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*fd.FD, error) { + i.fileState.waitForLoad() + + // Get a cloned fid which we will open. + _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + log.Warningf("Open Walk failed: %v", err) + return nil, err + } + defer newFile.close(ctx) + + flags, err := openFlagsFromPerms(p) + if err != nil { + log.Warningf("Open flags %s parsing failed: %v", p, err) + return nil, err + } + hostFile, _, _, err := newFile.open(ctx, flags) + // If the host file returned is nil and the error is nil, + // then this was never a host file to begin with, and should + // be treated like a remote file. 
+ if hostFile == nil && err == nil { + return nil, errNotHostFile + } + return hostFile, err +} + +func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + if !isFileCachable(i.session(), d.Inode) { + h, err := newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + return NewFile(ctx, d, flags, i, h), nil + } + + h, ok := i.fileState.getCachedHandles(ctx, flags, d.Inode.MountSource) + if !ok { + var err error + h, err = newHandles(ctx, i.fileState.file, flags) + if err != nil { + return nil, err + } + } + i.fileState.setHandlesForCachedIO(flags, h) + + return NewFile(ctx, d, flags, i, h), nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetPermissions(ctx, inode, p) + } + + mask := p9.SetAttrMask{Permissions: true} + pattr := p9.SetAttr{Permissions: p9.FileMode(p.LinuxMode())} + // Execute the chmod. + return i.fileState.file.setAttr(ctx, mask, pattr) == nil +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + // Save the roundtrip. + if !owner.UID.Ok() && !owner.GID.Ok() { + return nil + } + + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetOwner(ctx, inode, owner) + } + + var mask p9.SetAttrMask + var attr p9.SetAttr + if owner.UID.Ok() { + mask.UID = true + attr.UID = p9.UID(owner.UID) + } + if owner.GID.Ok() { + mask.GID = true + attr.GID = p9.GID(owner.GID) + } + return i.fileState.file.setAttr(ctx, mask, attr) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if isCachable(i.session(), inode) { + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) + } + + return utimes(ctx, i.fileState.file, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { + // This can only be called for files anyway. + if isFileCachable(i.session(), inode) { + return i.cachingInodeOps.Truncate(ctx, inode, length) + } + + return i.fileState.file.setAttr(ctx, p9.SetAttrMask{Size: true}, p9.SetAttr{Size: uint64(length)}) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + if !isCachable(i.session(), inode) { + return nil + } + + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !fs.IsSymlink(inode.StableAttr) { + return "", syscall.ENOLINK + } + return i.fileState.file.readlink(ctx) +} + +// Getlink implementfs fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS makes a StatFS request. +func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + fsstat, err := i.fileState.file.statFS(ctx) + if err != nil { + return fs.Info{}, err + } + + info := fs.Info{ + // This is primarily for distinguishing a gofer file system in + // tests. 
Testing is important, so instead of defining + // something completely random, use a standard value. + Type: linux.V9FS_MAGIC, + TotalBlocks: fsstat.Blocks, + FreeBlocks: fsstat.BlocksFree, + TotalFiles: fsstat.Files, + FreeFiles: fsstat.FilesFree, + } + + // If blocks available is non-zero, prefer that. + if fsstat.BlocksAvailable != 0 { + info.FreeBlocks = fsstat.BlocksAvailable + } + + return info, nil +} + +func init() { + syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) { + if _, ok := err.(p9.ErrSocket); ok { + // Treat as an I/O error. + return syscall.EIO, true + } + return 0, false + }) +} + +// AddLink implements InodeOperations.AddLink, but is currently a noop. +// FIXME: Remove this from InodeOperations altogether. +func (*inodeOperations) AddLink() {} + +// DropLink implements InodeOperations.DropLink, but is currently a noop. +// FIXME: Remove this from InodeOperations altogether. +func (*inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} diff --git a/pkg/sentry/fs/gofer/inode_state.go b/pkg/sentry/fs/gofer/inode_state.go new file mode 100644 index 000000000..997a7d1c1 --- /dev/null +++ b/pkg/sentry/fs/gofer/inode_state.go @@ -0,0 +1,141 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +// Some fs implementations may not support atime, ctime, or mtime in getattr. +// The unstable() logic would try to use clock time for them. However, we do not +// want to use such time during S/R as that would cause restore timestamp +// checking failure. Hence a dummy stable-time clock is needed. +// +// Note that application-visible UnstableAttrs either come from CachingInodeOps +// (in which case they are saved), or they are requested from the gofer on each +// stat (for non-caching), so the dummy time only affects the modification +// timestamp check. +type dummyClock struct { + time.Clock +} + +// Now returns a stable dummy time. +func (d *dummyClock) Now() time.Time { + return time.Time{} +} + +type dummyClockContext struct { + context.Context +} + +// Value implements context.Context +func (d *dummyClockContext) Value(key interface{}) interface{} { + switch key { + case time.CtxRealtimeClock: + return &dummyClock{} + default: + return d.Context.Value(key) + } +} + +// beforeSave is invoked by stateify. +func (i *inodeFileState) beforeSave() { + if _, ok := i.s.inodeMappings[i.sattr.InodeID]; !ok { + panic(fmt.Sprintf("failed to find path for inode number %d. 
Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))) + } + if i.sattr.Type == fs.RegularFile { + uattr, err := i.unstableAttr(&dummyClockContext{context.Background()}) + if err != nil { + panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.s.inodeMappings[i.sattr.InodeID], err)) + } + i.savedUAttr = &uattr + } +} + +// saveLoading is invoked by stateify. +func (i *inodeFileState) saveLoading() struct{} { + return struct{}{} +} + +// loadLoading is invoked by stateify. +func (i *inodeFileState) loadLoading(_ struct{}) { + i.loading.Lock() +} + +// afterLoad is invoked by stateify. +func (i *inodeFileState) afterLoad() { + load := func() { + // See comment on i.loading(). + defer i.loading.Unlock() + + // Manually restore the p9.File. + name, ok := i.s.inodeMappings[i.sattr.InodeID] + if !ok { + // This should be impossible, see assertion in + // beforeSave. + panic(fmt.Sprintf("failed to find path for inode number %d. Device %s contains %s", i.sattr.InodeID, i.s.connID, fs.InodeMappings(i.s.inodeMappings))) + } + // TODO: Context is not plumbed to save/restore. + ctx := &dummyClockContext{context.Background()} + var err error + _, i.file, err = i.s.attach.walk(ctx, strings.Split(name, "/")) + if err != nil { + panic(fmt.Sprintf("failed to walk to %q: %v", name, err)) + } + + // Remap the saved inode number into the gofer device using the + // actual device and actual inode that exists in our new + // environment. + qid, mask, attrs, err := i.file.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + panic(fmt.Sprintf("failed to get file attributes of %s: %v", name, err)) + } + if !mask.RDev { + panic(fmt.Sprintf("file %s lacks device", name)) + } + i.key = device.MultiDeviceKey{ + Device: attrs.RDev, + SecondaryDevice: i.s.connID, + Inode: qid.Path, + } + if !goferDevice.Load(i.key, i.sattr.InodeID) { + panic(fmt.Sprintf("gofer device %s -> %d conflict in gofer device mappings: %s", i.key, i.sattr.InodeID, goferDevice)) + } + + if i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstable(ctx, mask, attrs, i.s.mounter, i.s.client) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.s.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + } + i.savedUAttr = nil + } + } + + fs.Async(load) +} diff --git a/pkg/sentry/fs/gofer/path.go b/pkg/sentry/fs/gofer/path.go new file mode 100644 index 000000000..d696f1561 --- /dev/null +++ b/pkg/sentry/fs/gofer/path.go @@ -0,0 +1,331 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
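+
+// A minimal sketch (hypothetical values, not part of this change) of the
+// inode-number invariant that inodeFileState.afterLoad in inode_state.go
+// re-establishes: the saved virtual inode number is re-associated with the
+// file's post-restore host identity, so numbers observed before save stay
+// stable.
+//
+//	key := device.MultiDeviceKey{
+//		Device:          attrs.RDev, // host device after restore
+//		SecondaryDevice: connID,     // per-connection identifier
+//		Inode:           qid.Path,   // host inode after restore
+//	}
+//	if goferDevice.Load(key, savedInodeID) {
+//		// Map now yields savedInodeID for this key, matching pre-save state.
+//		_ = goferDevice.Map(key)
+//	}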
+ +package gofer + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Lookup loads an Inode at name into a Dirent based on the session's cache +// policy. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + if i.session().cachePolicy != cacheNone { + // Check to see if we have readdirCache that indicates the + // child does not exist. Avoid holding readdirMu longer than + // we need to. + i.readdirMu.Lock() + if i.readdirCache != nil && !i.readdirCache.Contains(name) { + // No such child. Return a negative dirent. + i.readdirMu.Unlock() + return fs.NewNegativeDirent(name), nil + } + i.readdirMu.Unlock() + } + + // Get a p9.File for name. + qids, newFile, mask, p9attr, err := i.fileState.file.walkGetAttr(ctx, []string{name}) + if err != nil { + if err == syscall.ENOENT { + if i.session().cachePolicy != cacheNone { + // Return a negative Dirent. It will stay cached until something + // is created over it. + return fs.NewNegativeDirent(name), nil + } + return nil, syserror.ENOENT + } + return nil, err + } + + // Construct the Inode operations. + sattr, node := newInodeOperations(ctx, i.fileState.s, newFile, qids[0], mask, p9attr) + + // Construct a positive Dirent. + return fs.NewDirent(fs.NewInode(node, dir.MountSource, sattr), name), nil +} + +// Creates a new Inode at name and returns its File based on the session's cache policy. +// +// Ownership is currently ignored. +func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create replaces the directory fid with the newly created/opened + // file, so clone this directory so it doesn't change out from under + // this node. + _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return nil, err + } + + // Map the FileFlags to p9 OpenFlags. + var openFlags p9.OpenFlags + switch { + case flags.Read && flags.Write: + openFlags = p9.ReadWrite + case flags.Read: + openFlags = p9.ReadOnly + case flags.Write: + openFlags = p9.WriteOnly + default: + panic(fmt.Sprintf("Create called with unknown or unset open flags: %v", flags)) + } + + owner := fs.FileOwnerFromContext(ctx) + hostFile, err := newFile.create(ctx, name, openFlags, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + // Could not create the file. + return nil, err + } + + i.touchModificationTime(ctx) + + // Get the attributes of the file. + qid, mask, p9attr, err := getattr(ctx, newFile) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Get an unopened p9.File for the file we created so that it can be + // cloned and re-opened multiple times after creation. + _, unopened, err := i.fileState.file.walk(ctx, []string{name}) + if err != nil { + newFile.close(ctx) + return nil, err + } + + // Construct the InodeOperations. + sattr, iops := newInodeOperations(ctx, i.fileState.s, unopened, qid, mask, p9attr) + + // Construct the positive Dirent. + d := fs.NewDirent(fs.NewInode(iops, dir.MountSource, sattr), name) + defer d.DecRef() + + // Construct the new file, caching the handles if allowed. 
+ h := &handles{ + File: newFile, + Host: hostFile, + } + if isFileCachable(iops.session(), d.Inode) { + iops.fileState.setHandlesForCachedIO(flags, h) + } + return NewFile(ctx, d, flags, iops, h), nil +} + +// CreateLink uses Create to create a symlink between oldname and newname. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + owner := fs.FileOwnerFromContext(ctx) + if _, err := i.fileState.file.symlink(ctx, oldname, newname, p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + return err + } + i.touchModificationTime(ctx) + return nil +} + +// CreateHardLink implements InodeOperations.CreateHardLink. +func (i *inodeOperations) CreateHardLink(ctx context.Context, _ *fs.Inode, target *fs.Inode, newName string) error { + targetOpts, ok := target.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + if err := i.fileState.file.link(ctx, &targetOpts.fileState.file, newName); err != nil { + return err + } + // TODO: Don't increase link count because we can't properly accounts for links + // with gofers. + i.touchModificationTime(ctx) + return nil +} + +// CreateDirectory uses Create to create a directory named s under inodeOperations. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, s string, perm fs.FilePermissions) error { + owner := fs.FileOwnerFromContext(ctx) + if _, err := i.fileState.file.mkdir(ctx, s, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)); err != nil { + return err + } + if i.session().cachePolicy == cacheAll { + // Increase link count. + i.cachingInodeOps.IncLinks(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } + return nil +} + +// Bind implements InodeOperations. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perm fs.FilePermissions) error { + if i.session().endpoints == nil { + return syscall.EOPNOTSUPP + } + + // Create replaces the directory fid with the newly created/opened + // file, so clone this directory so it doesn't change out from under + // this node. + _, newFile, err := i.fileState.file.walk(ctx, nil) + if err != nil { + return err + } + + // Stabilize the endpoint map while creation is in progress. + unlock := i.session().endpoints.lock() + defer unlock() + + // Create a regular file in the gofer and then mark it as a socket by + // adding this inode key in the 'endpoints' map. + owner := fs.FileOwnerFromContext(ctx) + hostFile, err := newFile.create(ctx, name, p9.ReadWrite, p9.FileMode(perm.LinuxMode()), p9.UID(owner.UID), p9.GID(owner.GID)) + if err != nil { + return err + } + // We're not going to use this file. + hostFile.Close() + + i.touchModificationTime(ctx) + + // Get the attributes of the file to create inode key. + qid, _, attr, err := getattr(ctx, newFile) + if err != nil { + newFile.close(ctx) + return err + } + + key := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: i.session().connID, + Inode: qid.Path, + } + i.session().endpoints.add(key, ep) + + return nil +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. Gofer nodes do not support the +// creation of fifos and always returns EOPNOTSUPP. +func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syscall.EOPNOTSUPP +} + +// Remove implements InodeOperations.Remove. 
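+//
+// When the name being removed refers to a socket created by Bind above, Remove
+// also drops its entry from the session's endpoint map. An illustrative sketch
+// (hypothetical values, not part of this change) of that bookkeeping, using
+// the endpointMap helpers defined in session.go:
+//
+//	key := device.MultiDeviceKey{Device: attr.RDev, SecondaryDevice: connID, Inode: qid.Path}
+//	unlock := s.endpoints.lock() // stabilize the map while the file is unlinked
+//	defer unlock()
+//	s.endpoints.remove(key)      // the path no longer stats as a socket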
+func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + var key device.MultiDeviceKey + removeSocket := false + if i.session().endpoints != nil { + // Find out if file being deleted is a socket that needs to be + // removed from endpoint map. + if d, err := i.Lookup(ctx, dir, name); err == nil { + defer d.DecRef() + if fs.IsSocket(d.Inode.StableAttr) { + child := d.Inode.InodeOperations.(*inodeOperations) + key = child.fileState.key + removeSocket = true + + // Stabilize the endpoint map while deletion is in progress. + unlock := i.session().endpoints.lock() + defer unlock() + } + } + } + + if err := i.fileState.file.unlinkAt(ctx, name, 0); err != nil { + return err + } + if removeSocket { + i.session().endpoints.remove(key) + } + i.touchModificationTime(ctx) + + return nil +} + +// Remove implements InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + // 0x200 = AT_REMOVEDIR. + if err := i.fileState.file.unlinkAt(ctx, name, 0x200); err != nil { + return err + } + if i.session().cachePolicy == cacheAll { + // Decrease link count and updates atime. + i.cachingInodeOps.DecLinks(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } + return nil +} + +// Rename renames this node. +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + // Unwrap the new parent to a *inodeOperations. + newParentInodeOperations, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + // Unwrap the old parent to a *inodeOperations. + oldParentInodeOperations, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + + // Do the rename. + if err := i.fileState.file.rename(ctx, newParentInodeOperations.fileState.file, newName); err != nil { + return err + } + + // Update cached state. + if i.session().cachePolicy == cacheAll { + // Is the renamed entity a directory? Fix link counts. + if fs.IsDir(i.fileState.sattr) { + oldParentInodeOperations.cachingInodeOps.DecLinks(ctx) + newParentInodeOperations.cachingInodeOps.IncLinks(ctx) + } + + // Mark old directory dirty. + oldParentInodeOperations.markDirectoryDirty() + if oldParent != newParent { + // Mark new directory dirty. + newParentInodeOperations.markDirectoryDirty() + } + } + return nil +} + +func (i *inodeOperations) touchModificationTime(ctx context.Context) { + if i.session().cachePolicy == cacheAll { + i.cachingInodeOps.TouchModificationTime(ctx) + + // Invalidate readdir cache. + i.markDirectoryDirty() + } +} + +// markDirectoryDirty marks any cached data dirty for this directory. This is necessary in order +// to ensure that this node does not retain stale state throughout its lifetime across multiple +// open directory handles. +// +// Currently this means invalidating any readdir caches. +func (i *inodeOperations) markDirectoryDirty() { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + i.readdirCache = nil +} diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go new file mode 100644 index 000000000..ab3b964e0 --- /dev/null +++ b/pkg/sentry/fs/gofer/session.go @@ -0,0 +1,251 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +type endpointMap struct { + mu sync.RWMutex + m map[device.MultiDeviceKey]unix.BoundEndpoint +} + +// add adds the endpoint to the map. +// +// Precondition: map must have been locked with 'lock'. +func (e *endpointMap) add(key device.MultiDeviceKey, ep unix.BoundEndpoint) { + e.m[key] = ep +} + +// remove deletes the key from the map. +// +// Precondition: map must have been locked with 'lock'. +func (e *endpointMap) remove(key device.MultiDeviceKey) { + delete(e.m, key) +} + +// lock blocks other addition and removal operations from happening while +// the backing file is being created or deleted. Returns a function that unlocks +// the endpoint map. +func (e *endpointMap) lock() func() { + e.mu.Lock() + return func() { e.mu.Unlock() } +} + +func (e *endpointMap) get(key device.MultiDeviceKey) unix.BoundEndpoint { + e.mu.RLock() + ep := e.m[key] + e.mu.RUnlock() + return ep +} + +// session holds state for each 9p session established during sys_mount. +type session struct { + refs.AtomicRefCount + + // conn is a unet.Socket that wraps the readFD/writeFD mount option, + // see fs/gofer/fs.go. + conn *unet.Socket `state:"nosave"` + + // msize is the value of the msize mount option, see fs/gofer/fs.go. + msize uint32 `state:"wait"` + + // version is the value of the version mount option, see fs/gofer/fs.go. + version string `state:"wait"` + + // cachePolicy is the cache policy. It may be either cacheAll or cacheNone. + cachePolicy cachePolicy `state:"wait"` + + // aname is the value of the aname mount option, see fs/gofer/fs.go. + aname string `state:"wait"` + + // The client associated with this session. This will be initialized lazily. + client *p9.Client `state:"nosave"` + + // The p9.File pointing to attachName via the client. This will be initialized + // lazily. + attach contextFile `state:"nosave"` + + // Flags provided to the mount. + superBlockFlags fs.MountSourceFlags `state:"wait"` + + // connID is a unique identifier for the session connection. + connID string `state:"wait"` + + // inodeMappings contains mappings of fs.Inodes associated with this session + // to paths relative to the attach point, where inodeMappings is keyed by + // Inode.StableAttr.InodeID. + inodeMappings map[uint64]string `state:"wait"` + + // mounter is the EUID/EGID that mounted this file system. + mounter fs.FileOwner `state:"wait"` + + // endpoints is used to map inodes that represent socket files to their + // corresponding endpoint. Socket files are created as regular files in the + // gofer and their presence in this map indicate that they should indeed be + // socket files. 
This allows unix domain sockets to be used with paths that + // belong to a gofer. + // + // TODO: there are few possible races with someone stat'ing the + // file and another deleting it concurrently, where the file will not be + // reported as socket file. + endpoints *endpointMap `state:"wait"` +} + +// Destroy tears down the session. +func (s *session) Destroy() { + s.conn.Close() +} + +// Revalidate returns true if the cache policy is does not allow for VFS caching. +func (s *session) Revalidate(*fs.Dirent) bool { + return s.cachePolicy == cacheNone +} + +// TakeRefs takes an extra reference on dirent if possible. +func (s *session) Keep(dirent *fs.Dirent) bool { + // NOTE: Only cache files and directories. + sattr := dirent.Inode.StableAttr + return s.cachePolicy != cacheNone && (fs.IsFile(sattr) || fs.IsDir(sattr)) +} + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (s *session) ResetInodeMappings() { + s.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (s *session) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + s.inodeMappings[sattr.InodeID] = path +} + +// newInodeOperations creates a new 9p fs.InodeOperations backed by a p9.File and attributes +// (p9.QID, p9.AttrMask, p9.Attr). +func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p9.QID, valid p9.AttrMask, attr p9.Attr) (fs.StableAttr, *inodeOperations) { + deviceKey := device.MultiDeviceKey{ + Device: attr.RDev, + SecondaryDevice: s.connID, + Inode: qid.Path, + } + + sattr := fs.StableAttr{ + Type: ntype(attr), + DeviceID: goferDevice.DeviceID(), + InodeID: goferDevice.Map(deviceKey), + BlockSize: bsize(attr), + } + + if s.endpoints != nil { + // If unix sockets are allowed on this filesystem, check if this file is + // supposed to be a socket file. + if s.endpoints.get(deviceKey) != nil { + sattr.Type = fs.Socket + } + } + + fileState := &inodeFileState{ + s: s, + file: file, + sattr: sattr, + key: deviceKey, + } + + uattr := unstable(ctx, valid, attr, s.mounter, s.client) + return sattr, &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, s.superBlockFlags.ForcePageCache), + } +} + +// Root returns the root of a 9p mount. This mount is bound to a 9p server +// based on conn. Otherwise configuration parameters are: +// +// * dev: connection id +// * filesystem: the filesystem backing the mount +// * superBlockFlags: the mount flags describing general mount options +// * opts: parsed 9p mount options +func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockFlags fs.MountSourceFlags, o opts) (*fs.Inode, error) { + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that the Gofer owns. + mounter := fs.FileOwnerFromContext(ctx) + + conn, err := unet.NewSocket(o.fd) + if err != nil { + return nil, err + } + + // Construct the session. 
+ s := &session{ + connID: dev, + conn: conn, + msize: o.msize, + version: o.version, + cachePolicy: o.policy, + aname: o.aname, + superBlockFlags: superBlockFlags, + mounter: mounter, + } + + if o.privateunixsocket { + s.endpoints = &endpointMap{m: make(map[device.MultiDeviceKey]unix.BoundEndpoint)} + } + + // Construct the MountSource with the session and superBlockFlags. + m := fs.NewMountSource(s, filesystem, superBlockFlags) + + // Send the Tversion request. + s.client, err = p9.NewClient(s.conn, s.msize, s.version) + if err != nil { + // Drop our reference on the session, it needs to be torn down. + s.DecRef() + return nil, err + } + + // Notify that we're about to call the Gofer and block. + ctx.UninterruptibleSleepStart(false) + // Send the Tattach request. + s.attach.file, err = s.client.Attach(s.aname) + ctx.UninterruptibleSleepFinish(false) + if err != nil { + // Same as above. + s.DecRef() + return nil, err + } + + qid, valid, attr, err := s.attach.getAttr(ctx, p9.AttrMaskAll()) + if err != nil { + s.attach.close(ctx) + // Same as above, but after we execute the Close request. + s.DecRef() + return nil, err + } + + sattr, iops := newInodeOperations(ctx, s, s.attach, qid, valid, attr) + return fs.NewInode(iops, m, sattr), nil +} diff --git a/pkg/sentry/fs/gofer/session_state.go b/pkg/sentry/fs/gofer/session_state.go new file mode 100644 index 000000000..4d993a219 --- /dev/null +++ b/pkg/sentry/fs/gofer/session_state.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/unet" +) + +// afterLoad is invoked by stateify. +func (s *session) afterLoad() { + // The restore environment contains the 9p connection of this mount. + fsys := filesystem{} + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("failed to find restore environment") + } + mounts, ok := env.MountSources[fsys.Name()] + if !ok { + panic("failed to find mounts for filesystem type " + fsys.Name()) + } + var args fs.MountArgs + var found bool + for _, mount := range mounts { + if mount.Dev == s.connID { + args = mount + found = true + } + } + if !found { + panic(fmt.Sprintf("no connection for connection id %q", s.connID)) + } + + // Validate the mount flags and options. 
+ opts, err := options(args.Data) + if err != nil { + panic("failed to parse mount options: " + err.Error()) + } + if opts.msize != s.msize { + panic(fmt.Sprintf("new message size %v, want %v", opts.msize, s.msize)) + } + if opts.version != s.version { + panic(fmt.Sprintf("new version %v, want %v", opts.version, s.version)) + } + if opts.policy != s.cachePolicy { + panic(fmt.Sprintf("new cache policy %v, want %v", opts.policy, s.cachePolicy)) + } + if opts.aname != s.aname { + panic(fmt.Sprintf("new attach name %v, want %v", opts.aname, s.aname)) + } + if opts.privateunixsocket != (s.endpoints != nil) { + panic(fmt.Sprintf("new privateunixsocket option %v, want %v", opts.privateunixsocket, s.endpoints != nil)) + } + if args.Flags != s.superBlockFlags { + panic(fmt.Sprintf("new mount flags %v, want %v", args.Flags, s.superBlockFlags)) + } + + // Manually restore the connection. + s.conn, err = unet.NewSocket(opts.fd) + if err != nil { + panic(fmt.Sprintf("failed to create Socket for FD %d: %v", opts.fd, err)) + } + + // Manually restore the client. + s.client, err = p9.NewClient(s.conn, s.msize, s.version) + if err != nil { + panic(fmt.Sprintf("failed to connect client to server: %v", err)) + } + + // Manually restore the attach point. + s.attach.file, err = s.client.Attach(s.aname) + if err != nil { + panic(fmt.Sprintf("failed to attach to aname: %v", err)) + } +} diff --git a/pkg/sentry/fs/gofer/socket.go b/pkg/sentry/fs/gofer/socket.go new file mode 100644 index 000000000..954000ef0 --- /dev/null +++ b/pkg/sentry/fs/gofer/socket.go @@ -0,0 +1,127 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// BoundEndpoint returns a gofer-backed unix.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { + if !fs.IsSocket(i.fileState.sattr) { + return nil + } + + if i.session().endpoints != nil { + ep := i.session().endpoints.get(i.fileState.key) + if ep != nil { + return ep + } + + // Not found in endpoints map, it may be a gofer backed unix socket... + } + + inode.IncRef() + return &endpoint{inode, i.fileState.file.file, path} +} + +// endpoint is a Gofer-backed unix.BoundEndpoint. +// +// An endpoint's lifetime is the time between when InodeOperations.BoundEndpoint() +// is called and either BoundEndpoint.BidirectionalConnect or +// BoundEndpoint.UnidirectionalConnect is called. +type endpoint struct { + // inode is the filesystem inode which produced this endpoint. + inode *fs.Inode + + // file is the p9 file that contains a single unopened fid. + file p9.File + + // path is the sentry path where this endpoint is bound. 
+ path string +} + +func unixSockToP9(t unix.SockType) (p9.ConnectFlags, bool) { + switch t { + case unix.SockStream: + return p9.StreamSocket, true + case unix.SockSeqpacket: + return p9.SeqpacketSocket, true + case unix.SockDgram: + return p9.DgramSocket, true + } + return 0, false +} + +// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect. +func (e *endpoint) BidirectionalConnect(ce unix.ConnectingEndpoint, returnConnect func(unix.Receiver, unix.ConnectedEndpoint)) *tcpip.Error { + cf, ok := unixSockToP9(ce.Type()) + if !ok { + return tcpip.ErrConnectionRefused + } + + // No lock ordering required as only the ConnectingEndpoint has a mutex. + ce.Lock() + defer ce.Unlock() + + // Check connecting state. + if ce.Connected() { + return tcpip.ErrAlreadyConnected + } + if ce.Listening() { + return tcpip.ErrInvalidEndpointState + } + + hostFile, err := e.file.Connect(cf) + if err != nil { + return tcpip.ErrConnectionRefused + } + + r, c, terr := host.NewConnectedEndpoint(hostFile, ce.WaiterQueue(), e.path) + if terr != nil { + return terr + } + returnConnect(r, c) + return nil +} + +// UnidirectionalConnect implements unix.BoundEndpoint.UnidirectionalConnect. +func (e *endpoint) UnidirectionalConnect() (unix.ConnectedEndpoint, *tcpip.Error) { + hostFile, err := e.file.Connect(p9.DgramSocket) + if err != nil { + return nil, tcpip.ErrConnectionRefused + } + + r, c, terr := host.NewConnectedEndpoint(hostFile, &waiter.Queue{}, e.path) + if terr != nil { + return nil, terr + } + + // We don't need the receiver. + r.CloseRecv() + r.Release() + + return c, nil +} + +// Release implements unix.BoundEndpoint.Release. +func (e *endpoint) Release() { + e.inode.DecRef() +} diff --git a/pkg/sentry/fs/gofer/util.go b/pkg/sentry/fs/gofer/util.go new file mode 100644 index 000000000..d9ed8c81e --- /dev/null +++ b/pkg/sentry/fs/gofer/util.go @@ -0,0 +1,60 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package gofer + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/p9" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func utimes(ctx context.Context, file contextFile, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + mask := p9.SetAttrMask{ + ATime: !ts.ATimeOmit, + ATimeNotSystemTime: !ts.ATimeSetSystemTime, + MTime: !ts.MTimeOmit, + MTimeNotSystemTime: !ts.MTimeSetSystemTime, + } + as, ans := ts.ATime.Unix() + ms, mns := ts.MTime.Unix() + attr := p9.SetAttr{ + ATimeSeconds: uint64(as), + ATimeNanoSeconds: uint64(ans), + MTimeSeconds: uint64(ms), + MTimeNanoSeconds: uint64(mns), + } + // 9p2000.L SetAttr: "If a time bit is set without the corresponding SET bit, + // the current system time on the server is used instead of the value sent + // in the request." 
+ return file.setAttr(ctx, mask, attr) +} + +func openFlagsFromPerms(p fs.PermMask) (p9.OpenFlags, error) { + switch { + case p.Read && p.Write: + return p9.ReadWrite, nil + case p.Write: + return p9.WriteOnly, nil + case p.Read: + return p9.ReadOnly, nil + default: + return 0, syscall.EINVAL + } +} diff --git a/pkg/sentry/fs/host/BUILD b/pkg/sentry/fs/host/BUILD new file mode 100644 index 000000000..97b64daed --- /dev/null +++ b/pkg/sentry/fs/host/BUILD @@ -0,0 +1,104 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "host_state", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "file.go", + "fs.go", + "inode.go", + "inode_state.go", + "socket.go", + "socket_state.go", + ], + out = "host_state.go", + package = "host", +) + +go_library( + name = "host", + srcs = [ + "control.go", + "descriptor.go", + "descriptor_state.go", + "device.go", + "file.go", + "fs.go", + "host_state.go", + "inode.go", + "inode_state.go", + "ioctl_unsafe.go", + "socket.go", + "socket_state.go", + "socket_unsafe.go", + "util.go", + "util_unsafe.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/fd", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/lock", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/socket", + "//pkg/sentry/socket/control", + "//pkg/sentry/socket/unix", + "//pkg/sentry/uniqueid", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip", + "//pkg/tcpip/link/rawfile", + "//pkg/tcpip/transport/unix", + "//pkg/unet", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + "@org_golang_x_sys//unix:go_default_library", + ], +) + +go_test( + name = "host_test", + size = "small", + srcs = [ + "fs_test.go", + "inode_test.go", + "socket_test.go", + "wait_test.go", + ], + embed = [":host"], + deps = [ + "//pkg/fd", + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/socket", + "//pkg/sentry/usermem", + "//pkg/syserr", + "//pkg/tcpip", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + "//pkg/waiter/fdnotifier", + ], +) diff --git a/pkg/sentry/fs/host/control.go b/pkg/sentry/fs/host/control.go new file mode 100644 index 000000000..d2b007ab2 --- /dev/null +++ b/pkg/sentry/fs/host/control.go @@ -0,0 +1,90 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +type scmRights struct { + fds []int +} + +func newSCMRights(fds []int) control.SCMRights { + return &scmRights{fds} +} + +// Files implements control.SCMRights.Files. +func (c *scmRights) Files(ctx context.Context, max int) control.RightsFiles { + n := max + if l := len(c.fds); n > l { + n = l + } + + rf := control.RightsFiles(fdsToFiles(ctx, c.fds[:n])) + + // Only consume converted FDs (fdsToFiles may convert fewer than n FDs). + c.fds = c.fds[len(rf):] + return rf +} + +// Clone implements unix.RightsControlMessage.Clone. +func (c *scmRights) Clone() unix.RightsControlMessage { + // Host rights never need to be cloned. + return nil +} + +// Release implements unix.RightsControlMessage.Release. +func (c *scmRights) Release() { + for _, fd := range c.fds { + syscall.Close(fd) + } + c.fds = nil +} + +// If an error is encountered, only files created before the error will be +// returned. This is what Linux does. +func fdsToFiles(ctx context.Context, fds []int) []*fs.File { + files := make([]*fs.File, 0, len(fds)) + for _, fd := range fds { + // Get flags. We do it here because they may be modified + // by subsequent functions. + fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if errno != 0 { + ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) + break + } + + // Create the file backed by hostFD. + file, err := NewFile(ctx, fd, fs.FileOwnerFromContext(ctx)) + if err != nil { + ctx.Warningf("Error creating file from host FD: %v", err) + break + } + + // Set known flags. + file.SetFlags(fs.SettableFileFlags{ + NonBlocking: fileFlags&syscall.O_NONBLOCK != 0, + }) + + files = append(files, file) + } + return files +} diff --git a/pkg/sentry/fs/host/descriptor.go b/pkg/sentry/fs/host/descriptor.go new file mode 100644 index 000000000..613bd06e8 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor.go @@ -0,0 +1,118 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// descriptor wraps a host fd. +type descriptor struct { + // donated is true if the host fd was donated by another process. + donated bool + + // If origFD >= 0, it is the host fd that this file was + // originally created from, which must be available at time + // of restore. Only valid if donated is true. + origFD int + + // wouldBlock is true if value (below) points to a file that can + // return EWOULDBLOCK for operations that would block. + wouldBlock bool + + // value is the wrapped host fd. It is never saved or restored + // directly. 
How it is restored depends on whether it was + // donated and the fs.MountSource it was originally + // opened/created from. + value int `state:"nosave"` +} + +// newDescriptor returns a wrapped host file descriptor. On success, +// the descriptor is registered for event notifications with queue. +func newDescriptor(fd int, donated bool, saveable bool, wouldBlock bool, queue *waiter.Queue) (*descriptor, error) { + ownedFD := fd + origFD := -1 + if saveable { + var err error + ownedFD, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + origFD = fd + } + if wouldBlock { + if err := syscall.SetNonblock(ownedFD, true); err != nil { + return nil, err + } + if err := fdnotifier.AddFD(int32(ownedFD), queue); err != nil { + return nil, err + } + } + return &descriptor{ + donated: donated, + origFD: origFD, + wouldBlock: wouldBlock, + value: ownedFD, + }, nil +} + +// initAfterLoad initializes the value of the descriptor after Load. +func (d *descriptor) initAfterLoad(mo *superOperations, id uint64, queue *waiter.Queue) error { + if d.donated { + var err error + d.value, err = syscall.Dup(d.origFD) + if err != nil { + return fmt.Errorf("failed to dup restored fd %d: %v", d.origFD, err) + } + } else { + name, ok := mo.inodeMappings[id] + if !ok { + return fmt.Errorf("failed to find path for inode number %d", id) + } + fullpath := path.Join(mo.root, name) + + var err error + d.value, err = open(nil, fullpath) + if err != nil { + return fmt.Errorf("failed to open %q: %v", fullpath, err) + } + } + if d.wouldBlock { + if err := syscall.SetNonblock(d.value, true); err != nil { + return err + } + if err := fdnotifier.AddFD(int32(d.value), queue); err != nil { + return err + } + } + return nil +} + +// Release releases all resources held by descriptor. +func (d *descriptor) Release() { + if d.wouldBlock { + fdnotifier.RemoveFD(int32(d.value)) + } + if err := syscall.Close(d.value); err != nil { + log.Warningf("error closing fd %d: %v", d.value, err) + } + d.value = -1 +} diff --git a/pkg/sentry/fs/host/descriptor_state.go b/pkg/sentry/fs/host/descriptor_state.go new file mode 100644 index 000000000..7fb274451 --- /dev/null +++ b/pkg/sentry/fs/host/descriptor_state.go @@ -0,0 +1,29 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +// beforeSave is invoked by stateify. +func (d *descriptor) beforeSave() { + if d.donated && d.origFD < 0 { + panic("donated file descriptor cannot be saved") + } +} + +// afterLoad is invoked by stateify. +func (d *descriptor) afterLoad() { + // value must be manually restored by the descriptor's parent using + // initAfterLoad. + d.value = -1 +} diff --git a/pkg/sentry/fs/host/device.go b/pkg/sentry/fs/host/device.go new file mode 100644 index 000000000..f2a0b6b15 --- /dev/null +++ b/pkg/sentry/fs/host/device.go @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// hostFileDevice is the host file virtual device. +var hostFileDevice = device.NewAnonMultiDevice() + +// hostPipeDevice is the host pipe virtual device. +var hostPipeDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/host/file.go b/pkg/sentry/fs/host/file.go new file mode 100644 index 000000000..bdf844337 --- /dev/null +++ b/pkg/sentry/fs/host/file.go @@ -0,0 +1,371 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "golang.org/x/sys/unix" + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// fileOperations implements fs.FileOperations for a host file descriptor. +type fileOperations struct { + fsutil.NoopRelease `state:"nosave"` + + // iops are the Inode operations for this file. + iops *inodeOperations `state:"wait"` + + // a scratch buffer for reading directory entries. + dirinfo *dirInfo `state:"nosave"` + + // dirCursor is the directory cursor. + dirCursor string + + // allowIoctl determines whether ioctls should be passed through to the + // host. + allowIoctl bool +} + +// fileOperations implements fs.FileOperations. +var _ fs.FileOperations = (*fileOperations)(nil) + +// NewFile creates a new File backed by the provided host file descriptor. If +// NewFile succeeds, ownership of the fd is transferred to the returned File. +// +// The returned File cannot be saved, since there is no guarantee that the same +// fd will exist or represent the same file at time of restore. If such a +// guarantee does exist, use ImportFile instead. +func NewFile(ctx context.Context, fd int, mounter fs.FileOwner) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, false, false) +} + +// ImportFile creates a new File backed by the provided host file descriptor. 
+// Unlike NewFile, the file descriptor used by the File is duped from fd to +// ensure that later changes to fd are not reflected by the fs.File. +// +// If the returned file is saved, it will be restored by re-importing the fd +// originally passed to ImportFile. It is the restorer's responsibility to +// ensure that the fd represents the same file. +func ImportFile(ctx context.Context, fd int, mounter fs.FileOwner, allowIoctl bool) (*fs.File, error) { + return newFileFromDonatedFD(ctx, fd, mounter, true, allowIoctl) +} + +// newFileFromDonatedFD returns an fs.File from a donated fd. If the fd is +// saveable, then saveable is true. +func newFileFromDonatedFD(ctx context.Context, donated int, mounter fs.FileOwner, saveable, allowIoctl bool) (*fs.File, error) { + var s syscall.Stat_t + if err := syscall.Fstat(donated, &s); err != nil { + return nil, err + } + switch s.Mode & syscall.S_IFMT { + case syscall.S_IFSOCK: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + s, err := newSocket(ctx, donated, saveable) + if err != nil { + return nil, err + } + s.SetFlags(fs.SettableFileFlags{ + NonBlocking: flags.NonBlocking, + }) + return s, nil + default: + flags, err := fileFlagsFromDonatedFD(donated) + if err != nil { + return nil, err + } + msrc := newMountSource(ctx, "/", mounter, &Filesystem{}, fs.MountSourceFlags{}, false /* dontTranslateOwnership */) + inode, err := newInode(ctx, msrc, donated, saveable, true /* donated */) + if err != nil { + return nil, err + } + iops := inode.InodeOperations.(*inodeOperations) + + name := fmt.Sprintf("host:[%d]", inode.StableAttr.InodeID) + dirent := fs.NewDirent(inode, name) + defer dirent.DecRef() + + return newFile(ctx, dirent, flags, iops, allowIoctl), nil + } +} + +func fileFlagsFromDonatedFD(donated int) (fs.FileFlags, error) { + flags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(donated), syscall.F_GETFL, 0) + if errno != 0 { + log.Warningf("Failed to get file flags for donated fd %d (errno=%d)", donated, errno) + return fs.FileFlags{}, syscall.EIO + } + accmode := flags & syscall.O_ACCMODE + return fs.FileFlags{ + Direct: flags&syscall.O_DIRECT != 0, + NonBlocking: flags&syscall.O_NONBLOCK != 0, + Sync: flags&syscall.O_SYNC != 0, + Append: flags&syscall.O_APPEND != 0, + Read: accmode == syscall.O_RDONLY || accmode == syscall.O_RDWR, + Write: accmode == syscall.O_WRONLY || accmode == syscall.O_RDWR, + }, nil +} + +// newFile returns a new fs.File. +func newFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags, iops *inodeOperations, allowIoctl bool) *fs.File { + if !iops.ReturnsWouldBlock() { + // Allow reading/writing at an arbitrary offset for files + // that support it. + flags.Pread = true + flags.Pwrite = true + } + return fs.NewFile(ctx, dirent, flags, &fileOperations{ + iops: iops, + allowIoctl: allowIoctl, + }) +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (f *fileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + f.iops.fileState.queue.EventRegister(e, mask) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (f *fileOperations) EventUnregister(e *waiter.Entry) { + f.iops.fileState.queue.EventUnregister(e) + fdnotifier.UpdateFD(int32(f.iops.fileState.FD())) +} + +// Readiness uses the poll() syscall to check the status of the underlying FD. 
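+//
+// An illustrative use of the returned mask (not part of this change) when
+// deciding whether a read would block:
+//
+//	if f.Readiness(waiter.EventIn)&waiter.EventIn == 0 {
+//		// No data is currently available; reading the host fd would
+//		// return EWOULDBLOCK.
+//	}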
+func (f *fileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(f.iops.fileState.FD()), mask) +} + +// Readdir implements fs.FileOperations.Readdir. +func (f *fileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &f.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, f, root, dirCtx, file.Offset()) +} + +// IterateDir implements fs.DirIterator.IterateDir. +func (f *fileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + if f.dirinfo == nil { + f.dirinfo = new(dirInfo) + f.dirinfo.buf = make([]byte, usermem.PageSize) + } + entries, err := f.iops.readdirAll(f.dirinfo) + if err != nil { + return offset, err + } + count, err := fs.GenericReaddir(dirCtx, fs.NewSortedDentryMap(entries)) + return offset + count, err +} + +// Write implements fs.FileOperations.Write. +func (f *fileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that writes do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support writing at + // an arbitrary offset. + writer := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + if isBlockError(err) { + err = syserror.ErrWouldBlock + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + writer := secio.NewOffsetWriter(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return src.CopyInTo(ctx, safemem.FromIOWriter{writer}) + } + return f.iops.cachingInodeOps.Write(ctx, src, offset) +} + +// Read implements fs.FileOperations.Read. +func (f *fileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + // Would this file block? + if f.iops.ReturnsWouldBlock() { + // These files can't be memory mapped, assert this. This also + // means that reads do not need to synchronize with memory + // mappings nor metadata cached by this file's fs.Inode. + if canMap(file.Dirent.Inode) { + panic("files that can return EWOULDBLOCK cannot be memory mapped") + } + // Ignore the offset, these files don't support reading at + // an arbitrary offset. + reader := fd.NewReadWriter(f.iops.fileState.FD()) + n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + if isBlockError(err) { + // If we got any data at all, return it as a "completed" partial read + // rather than retrying until complete. + if n != 0 { + err = nil + } else { + err = syserror.ErrWouldBlock + } + } + return n, err + } + if !file.Dirent.Inode.MountSource.Flags.ForcePageCache { + reader := secio.NewOffsetReader(fd.NewReadWriter(f.iops.fileState.FD()), offset) + return dst.CopyOutFrom(ctx, safemem.FromIOReader{reader}) + } + return f.iops.cachingInodeOps.Read(ctx, file, dst, offset) +} + +// Fsync implements fs.FileOperations.Fsync. 
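+//
+// SyncAll and SyncData first write back cached state via the inode's WriteOut
+// and then fall through to SyncBackingStorage, which issues fsync(2) on the
+// host fd. A hedged usage sketch (start and end are placeholder byte offsets,
+// not part of this change):
+//
+//	// Flush cached pages and durably sync the backing host file.
+//	err := f.Fsync(ctx, file, start, end, fs.SyncAll)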
+func (f *fileOperations) Fsync(ctx context.Context, file *fs.File, start int64, end int64, syncType fs.SyncType) error { + switch syncType { + case fs.SyncAll, fs.SyncData: + if err := file.Dirent.Inode.WriteOut(ctx); err != nil { + return err + } + fallthrough + case fs.SyncBackingStorage: + return syscall.Fsync(f.iops.fileState.FD()) + } + panic("invalid sync type") +} + +// Flush implements fs.FileOperations.Flush. +func (f *fileOperations) Flush(context.Context, *fs.File) error { + // This is a no-op because flushing the resource backing this + // file would mean closing it. We can't do that because other + // open files may depend on the backing host fd. + return nil +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (f *fileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + if !canMap(file.Dirent.Inode) { + return syserror.ENODEV + } + return fsutil.GenericConfigureMMap(file, f.iops.cachingInodeOps, opts) +} + +// Seek implements fs.FileOperations.Seek. +func (f *fileOperations) Seek(ctx context.Context, file *fs.File, whence fs.SeekWhence, offset int64) (int64, error) { + return fsutil.SeekWithDirCursor(ctx, file, whence, offset, &f.dirCursor) +} + +// Ioctl implements fs.FileOperations.Iocotl. +func (f *fileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + if !f.allowIoctl { + return 0, syserror.ENOTTY + } + // Ignore arg[0]. This is the real FD: + fd := f.iops.fileState.FD() + ioctl := args[1].Uint64() + switch ioctl { + case unix.TCGETS: + termios, err := ioctlGetTermios(fd) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TCSETS, unix.TCSETSW: + var termios linux.Termios + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := ioctlSetTermios(fd, ioctl, &termios) + return 0, err + + case unix.TIOCGPGRP: + // Args: pid_t *argp + // When successful, equivalent to *argp = tcgetpgrp(fd). + // Get the process group ID of the foreground process group on + // this terminal. + + t := kernel.TaskFromContext(ctx) + if t == nil { + panic(fmt.Sprintf("cannot get thread group from context %v", ctx)) + } + tid := t.ThreadID() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &tid, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSPGRP: + // Args: const pid_t *argp + // Equivalent to tcsetpgrp(fd, *argp). + // Set the foreground process group ID of this terminal. + + // Not much we can do with this one at the moment, so we just + // lie and pretend everything is great. Bash and Sh seem fine + // with this. + log.Warningf("Ignoring application ioctl(TIOCSPGRP) call") + return 0, nil + + case unix.TIOCGWINSZ: + // Args: struct winsize *argp + // Get window size. + winsize, err := unix.IoctlGetWinsize(fd, unix.TIOCGWINSZ) + if err != nil { + return 0, err + } + _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + + case unix.TIOCSWINSZ: + // Args: const struct winsize *argp + // Set window size. 
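+ // As with TCSETS above, the argument is copied in from application
+ // memory and then applied directly to the host terminal FD via
+ // ioctl(TIOCSWINSZ).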
+ var winsize unix.Winsize + if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{ + AddressSpaceActive: true, + }); err != nil { + return 0, err + } + err := unix.IoctlSetWinsize(fd, unix.TIOCSWINSZ, &winsize) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/host/fs.go b/pkg/sentry/fs/host/fs.go new file mode 100644 index 000000000..ffd55a5ab --- /dev/null +++ b/pkg/sentry/fs/host/fs.go @@ -0,0 +1,327 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package host implements an fs.Filesystem for files backed by host +// file descriptors. +package host + +import ( + "fmt" + "path" + "path/filepath" + "strconv" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// FilesystemName is the name under which Filesystem is registered. +const FilesystemName = "whitelistfs" + +const ( + // whitelistKey is the mount option containing a comma-separated list + // of host paths to whitelist. + whitelistKey = "whitelist" + + // rootPathKey is the mount option containing the root path of the + // mount. + rootPathKey = "root" + + // dontTranslateOwnershipKey is the key to superOperations.dontTranslateOwnership. + dontTranslateOwnershipKey = "dont_translate_ownership" +) + +// maxTraversals determines link traversals in building the whitelist. +const maxTraversals = 10 + +// Filesystem is a pseudo file system that is only available during the setup +// to lock down the configurations. This filesystem should only be mounted at root. +// +// Think twice before exposing this to applications. +type Filesystem struct { + // whitelist is a set of host paths to whitelist. + paths []string +} + +// Name is the identifier of this file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount prohibits users from using mount(2) with this file system. +func (*Filesystem) AllowUserMount() bool { + return false +} + +// Flags returns that there is nothing special about this file system. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns an fs.Inode exposing the host file system. It is intended to be locked +// down in PreExec below. +func (f *Filesystem) Mount(ctx context.Context, _ string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // Parse generic comma-separated key=value options. + options := fs.GenericMountSourceOptions(data) + + // Grab the whitelist if one was specified. + // TODO: require another option "testonly" in order to allow + // no whitelist. + if wl, ok := options[whitelistKey]; ok { + f.paths = strings.Split(wl, "|") + delete(options, whitelistKey) + } + + // If the rootPath was set, use it. Othewise default to the root of the + // host fs. 
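+ //
+ // As a purely illustrative example (the actual string is supplied by
+ // the caller at mount time), data might look like
+ //
+ //	root=/tmp/sandbox-root,whitelist=/etc/passwd|/usr/lib,dont_translate_ownership=true
+ //
+ // with whitelist entries separated by '|' as parsed above.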
+ rootPath := "/" + if rp, ok := options[rootPathKey]; ok { + rootPath = rp + delete(options, rootPathKey) + + // We must relativize the whitelisted paths to the new root. + for i, p := range f.paths { + rel, err := filepath.Rel(rootPath, p) + if err != nil { + return nil, fmt.Errorf("whitelist path %q must be a child of root path %q", p, rootPath) + } + f.paths[i] = path.Join("/", rel) + } + } + fd, err := open(nil, rootPath) + if err != nil { + return nil, fmt.Errorf("failed to find root: %v", err) + } + + var dontTranslateOwnership bool + if v, ok := options[dontTranslateOwnershipKey]; ok { + b, err := strconv.ParseBool(v) + if err != nil { + return nil, fmt.Errorf("invalid value for %q: %v", dontTranslateOwnershipKey, err) + } + dontTranslateOwnership = b + delete(options, dontTranslateOwnershipKey) + } + + // Fail if the caller passed us more options than we know about. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // The mounting EUID/EGID will be cached by this file system. This will + // be used to assign ownership to files that we own. + owner := fs.FileOwnerFromContext(ctx) + + // Construct the host file system mount and inode. + msrc := newMountSource(ctx, rootPath, owner, f, flags, dontTranslateOwnership) + return newInode(ctx, msrc, fd, false /* saveable */, false /* donated */) +} + +// InstallWhitelist locks down the MountNamespace to only the currently installed +// Dirents and the given paths. +func (f *Filesystem) InstallWhitelist(ctx context.Context, m *fs.MountNamespace) error { + return installWhitelist(ctx, m, f.paths) +} + +func installWhitelist(ctx context.Context, m *fs.MountNamespace, paths []string) error { + if len(paths) == 0 || (len(paths) == 1 && paths[0] == "") { + // Warning will be logged during filter installation if the empty + // whitelist matters (allows for host file access). + return nil + } + + // Done tracks entries already added. + done := make(map[string]bool) + root := m.Root() + defer root.DecRef() + + for i := 0; i < len(paths); i++ { + // Make sure the path is absolute. This is a sanity check. + if !path.IsAbs(paths[i]) { + return fmt.Errorf("path %q is not absolute", paths[i]) + } + + // We need to add all the intermediate paths, in case one of + // them is a symlink that needs to be resolved. + for j := 1; j <= len(paths[i]); j++ { + if j < len(paths[i]) && paths[i][j] != '/' { + continue + } + current := paths[i][:j] + + // Lookup the given component in the tree. + d, err := m.FindLink(ctx, root, nil, current, maxTraversals) + if err != nil { + log.Warningf("populate failed for %q: %v", current, err) + continue + } + + // It's critical that this DecRef happens after the + // freeze below. This ensures that the dentry is in + // place to be frozen. Otherwise, we freeze without + // these entries. + defer d.DecRef() + + // Expand the last component if necessary. + if current == paths[i] { + // Is it a directory or symlink? + sattr := d.Inode.StableAttr + if fs.IsDir(sattr) { + for name := range childDentAttrs(ctx, d) { + paths = append(paths, path.Join(current, name)) + } + } + if fs.IsSymlink(sattr) { + // Only expand symlinks once. The + // folder structure may contain + // recursive symlinks and we don't want + // to end up infinitely expanding this + // symlink. This is safe because this + // is the last component. If a later + // path wants to symlink something + // beneath this symlink that will still + // be handled by the FindLink above. 
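+ // For example, whitelisting /a/link where link is a
+ // symlink to /b/target records /a and /a/link, appends
+ // /b/target to paths, and a later pass of the outer
+ // loop then records /b and /b/target as well.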
+ if done[current] { + continue + } + + s, err := d.Inode.Readlink(ctx) + if err != nil { + log.Warningf("readlink failed for %q: %v", current, err) + continue + } + if path.IsAbs(s) { + paths = append(paths, s) + } else { + target := path.Join(path.Dir(current), s) + paths = append(paths, target) + } + } + } + + // Only report this one once even though we may look + // it up more than once. If we whitelist /a/b,/a then + // /a will be "done" when it is looked up for /a/b, + // however we still need to expand all of its contents + // when whitelisting /a. + if !done[current] { + log.Debugf("whitelisted: %s", current) + } + done[current] = true + } + } + + // Freeze the mount tree in place. This prevents any new paths from + // being opened and any old ones from being removed. If we do provide + // tmpfs mounts, we'll want to freeze/thaw those separately. + m.Freeze() + return nil +} + +func childDentAttrs(ctx context.Context, d *fs.Dirent) map[string]fs.DentAttr { + dirname, _ := d.FullName(nil /* root */) + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + log.Warningf("failed to open directory %q: %v", dirname, err) + return nil + } + dir.DecRef() + var stubSerializer fs.CollectEntriesSerializer + if err := dir.Readdir(ctx, &stubSerializer); err != nil { + log.Warningf("failed to iterate on host directory %q: %v", dirname, err) + return nil + } + delete(stubSerializer.Entries, ".") + delete(stubSerializer.Entries, "..") + return stubSerializer.Entries +} + +// newMountSource constructs a new host fs.MountSource +// relative to a root path. The root should match the mount point. +func newMountSource(ctx context.Context, root string, mounter fs.FileOwner, filesystem fs.Filesystem, flags fs.MountSourceFlags, dontTranslateOwnership bool) *fs.MountSource { + return fs.NewMountSource(&superOperations{ + root: root, + inodeMappings: make(map[uint64]string), + mounter: mounter, + dontTranslateOwnership: dontTranslateOwnership, + }, filesystem, flags) +} + +// superOperations implements fs.MountSourceOperations. +type superOperations struct { + fs.SimpleMountSourceOperations `state:"nosave"` + + // root is the path of the mount point. All inode mappings + // are relative to this root. + root string + + // inodeMappings contains mappings of fs.Inodes associated + // with this MountSource to paths under root. + inodeMappings map[uint64]string + + // mounter is the cached EUID/EGID that mounted this file system. + mounter fs.FileOwner + + // dontTranslateOwnership indicates whether to not translate file + // ownership. + // + // By default, files/directories owned by the sandbox uses UID/GID + // of the mounter. For files/directories that are not owned by the + // sandbox, file UID/GID is translated to a UID/GID which cannot + // be mapped in the sandboxed application's user namespace. The + // UID/GID will look like the nobody UID/GID (65534) but is not + // strictly owned by the user "nobody". + // + // If whitelistfs is a lower filesystem in an overlay, set + // dont_translate_ownership=true in mount options. + dontTranslateOwnership bool +} + +var _ fs.MountSourceOperations = (*superOperations)(nil) + +// ResetInodeMappings implements fs.MountSourceOperations.ResetInodeMappings. +func (m *superOperations) ResetInodeMappings() { + m.inodeMappings = make(map[uint64]string) +} + +// SaveInodeMapping implements fs.MountSourceOperations.SaveInodeMapping. +func (m *superOperations) SaveInodeMapping(inode *fs.Inode, path string) { + // This is very unintuitive. 
We *CANNOT* trust the inode's StableAttrs, + // because overlay copyUp may have changed them out from under us. + // So much for "immutable". + sattr := inode.InodeOperations.(*inodeOperations).fileState.sattr + m.inodeMappings[sattr.InodeID] = path +} + +// Keep implements fs.MountSourceOperations.Keep. +// +// TODO: It is possible to change the permissions on a +// host file while it is in the dirent cache (say from RO to RW), but it is not +// possible to re-open the file with more relaxed permissions, since the host +// FD is already open and stored in the inode. +// +// Using the dirent LRU cache increases the odds that this bug is encountered. +// Since host file access is relatively fast anyways, we disable the LRU cache +// for host fs files. Once we can properly deal with permissions changes and +// re-opening host files, we should revisit whether or not to make use of the +// LRU cache. +func (*superOperations) Keep(*fs.Dirent) bool { + return false +} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} diff --git a/pkg/sentry/fs/host/fs_test.go b/pkg/sentry/fs/host/fs_test.go new file mode 100644 index 000000000..c000afc49 --- /dev/null +++ b/pkg/sentry/fs/host/fs_test.go @@ -0,0 +1,383 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "io/ioutil" + "os" + "path" + "reflect" + "sort" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// newTestMountNamespace creates a MountNamespace with a ramfs root. +// It returns the host folder created, which should be removed when done. +func newTestMountNamespace(t *testing.T) (*fs.MountNamespace, string, error) { + p, err := ioutil.TempDir("", "root") + if err != nil { + return nil, "", err + } + + fd, err := open(nil, p) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + ctx := contexttest.Context(t) + root, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + mm, err := fs.NewMountNamespace(ctx, root) + if err != nil { + os.RemoveAll(p) + return nil, "", err + } + return mm, p, nil +} + +// createTestDirs populates the root with some test files and directories. 
+// /a/a1.txt +// /a/a2.txt +// /b/b1.txt +// /b/c/c1.txt +// /symlinks/normal.txt +// /symlinks/to_normal.txt -> /symlinks/normal.txt +// /symlinks/recursive -> /symlinks +func createTestDirs(ctx context.Context, t *testing.T, m *fs.MountNamespace) error { + r := m.Root() + defer r.DecRef() + + if err := r.CreateDirectory(ctx, r, "a", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + a, err := r.Walk(ctx, r, "a") + if err != nil { + return err + } + defer a.DecRef() + + a1, err := a.Create(ctx, r, "a1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a1.DecRef() + + a2, err := a.Create(ctx, r, "a2.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + a2.DecRef() + + if err := r.CreateDirectory(ctx, r, "b", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + b, err := r.Walk(ctx, r, "b") + if err != nil { + return err + } + defer b.DecRef() + + b1, err := b.Create(ctx, r, "b1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + b1.DecRef() + + if err := b.CreateDirectory(ctx, r, "c", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + c, err := b.Walk(ctx, r, "c") + if err != nil { + return err + } + defer c.DecRef() + + c1, err := c.Create(ctx, r, "c1.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + c1.DecRef() + + if err := r.CreateDirectory(ctx, r, "symlinks", fs.FilePermsFromMode(0777)); err != nil { + return err + } + + symlinks, err := r.Walk(ctx, r, "symlinks") + if err != nil { + return err + } + defer symlinks.DecRef() + + normal, err := symlinks.Create(ctx, r, "normal.txt", fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0666)) + if err != nil { + return err + } + normal.DecRef() + + if err := symlinks.CreateLink(ctx, r, "/symlinks/normal.txt", "to_normal.txt"); err != nil { + return err + } + + if err := symlinks.CreateLink(ctx, r, "/symlinks", "recursive"); err != nil { + return err + } + + return nil +} + +// allPaths returns a slice of all paths of entries visible in the rootfs. +func allPaths(ctx context.Context, t *testing.T, m *fs.MountNamespace, base string) ([]string, error) { + var paths []string + root := m.Root() + defer root.DecRef() + + d, err := m.FindLink(ctx, root, nil, base, 1) + if err != nil { + t.Logf("FindLink failed for %q", base) + return paths, err + } + defer d.DecRef() + + if fs.IsDir(d.Inode.StableAttr) { + dir, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true}) + if err != nil { + return nil, fmt.Errorf("failed to open directory %q: %v", base, err) + } + iter, ok := dir.FileOperations.(fs.DirIterator) + if !ok { + return nil, fmt.Errorf("cannot directly iterate on host directory %q", base) + } + dirCtx := &fs.DirCtx{ + Serializer: noopDentrySerializer{}, + } + if _, err := fs.DirentReaddir(ctx, d, iter, root, dirCtx, 0); err != nil { + return nil, err + } + for name := range dirCtx.DentAttrs() { + if name == "." || name == ".." { + continue + } + + fullName := path.Join(base, name) + paths = append(paths, fullName) + + // Recurse. + subpaths, err := allPaths(ctx, t, m, fullName) + if err != nil { + return paths, err + } + paths = append(paths, subpaths...) 
+ } + } + + return paths, nil +} + +type noopDentrySerializer struct{} + +func (noopDentrySerializer) CopyOut(string, fs.DentAttr) error { + return nil +} +func (noopDentrySerializer) Written() int { + return 4096 +} + +// pathsEqual returns true if the two string slices contain the same entries. +func pathsEqual(got, want []string) bool { + sort.Strings(got) + sort.Strings(want) + + if len(got) != len(want) { + return false + } + + for i := range got { + if got[i] != want[i] { + return false + } + } + + return true +} + +func TestWhitelist(t *testing.T) { + for _, test := range []struct { + // description of the test. + desc string + // paths are the paths to whitelist + paths []string + // want are all of the directory entries that should be + // visible (nothing beyond this set should be visible). + want []string + }{ + { + desc: "root", + paths: []string{"/"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt", "/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt", "/symlinks/recursive"}, + }, + { + desc: "top-level directories", + paths: []string{"/a", "/b"}, + want: []string{"/a", "/a/a1.txt", "/a/a2.txt", "/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (1/2)", + paths: []string{"/b", "/b/c"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "nested directories (2/2)", + paths: []string{"/b/c", "/b"}, + want: []string{"/b", "/b/b1.txt", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file", + paths: []string{"/b/c/c1.txt"}, + want: []string{"/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "single file and directory", + paths: []string{"/a/a1.txt", "/b/c"}, + want: []string{"/a", "/a/a1.txt", "/b", "/b/c", "/b/c/c1.txt"}, + }, + { + desc: "symlink", + paths: []string{"/symlinks/to_normal.txt"}, + want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/to_normal.txt"}, + }, + { + desc: "recursive symlink", + paths: []string{"/symlinks/recursive/normal.txt"}, + want: []string{"/symlinks", "/symlinks/normal.txt", "/symlinks/recursive"}, + }, + } { + t.Run(test.desc, func(t *testing.T) { + m, p, err := newTestMountNamespace(t) + if err != nil { + t.Errorf("Failed to create MountNamespace: %v", err) + } + defer os.RemoveAll(p) + + ctx := withRoot(contexttest.RootContext(t), m.Root()) + if err := createTestDirs(ctx, t, m); err != nil { + t.Errorf("Failed to create test dirs: %v", err) + } + + if err := installWhitelist(ctx, m, test.paths); err != nil { + t.Errorf("installWhitelist(%v) err got %v want nil", test.paths, err) + } + + got, err := allPaths(ctx, t, m, "/") + if err != nil { + t.Fatalf("Failed to lookup paths (whitelisted: %v): %v", test.paths, err) + } + + if !pathsEqual(got, test.want) { + t.Errorf("For paths %v got %v want %v", test.paths, got, test.want) + } + }) + } +} + +func TestRootPath(t *testing.T) { + // Create a temp dir, which will be the root of our mounted fs. + rootPath, err := ioutil.TempDir(os.TempDir(), "root") + if err != nil { + t.Fatalf("TempDir failed: %v", err) + } + defer os.RemoveAll(rootPath) + + // Create two files inside the new root, one which will be whitelisted + // and one not. + whitelisted, err := ioutil.TempFile(rootPath, "white") + if err != nil { + t.Fatalf("TempFile failed: %v", err) + } + if _, err := ioutil.TempFile(rootPath, "black"); err != nil { + t.Fatalf("TempFile failed: %v", err) + } + + // Create a mount with a root path and single whitelisted file. 
+ hostFS := &Filesystem{} + ctx := contexttest.Context(t) + data := fmt.Sprintf("%s=%s,%s=%s", rootPathKey, rootPath, whitelistKey, whitelisted.Name()) + inode, err := hostFS.Mount(ctx, "", fs.MountSourceFlags{}, data) + if err != nil { + t.Fatalf("Mount failed: %v", err) + } + mm, err := fs.NewMountNamespace(ctx, inode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + if err := hostFS.InstallWhitelist(ctx, mm); err != nil { + t.Fatalf("InstallWhitelist failed: %v", err) + } + + // Get the contents of the root directory. + rootDir := mm.Root() + rctx := withRoot(ctx, rootDir) + f, err := rootDir.Inode.GetFile(rctx, rootDir, fs.FileFlags{}) + if err != nil { + t.Fatalf("GetFile failed: %v", err) + } + c := &fs.CollectEntriesSerializer{} + if err := f.Readdir(rctx, c); err != nil { + t.Fatalf("Readdir failed: %v", err) + } + + // We should have only our whitelisted file, plus the dots. + want := []string{path.Base(whitelisted.Name()), ".", ".."} + got := c.Order + sort.Strings(want) + sort.Strings(got) + if !reflect.DeepEqual(got, want) { + t.Errorf("Readdir got %v, wanted %v", got, want) + } +} + +type rootContext struct { + context.Context + root *fs.Dirent +} + +// withRoot returns a copy of ctx with the given root. +func withRoot(ctx context.Context, root *fs.Dirent) context.Context { + return &rootContext{ + Context: ctx, + root: root, + } +} + +// Value implements Context.Value. +func (rc rootContext) Value(key interface{}) interface{} { + switch key { + case fs.CtxRoot: + rc.root.IncRef() + return rc.root + default: + return rc.Context.Value(key) + } +} diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go new file mode 100644 index 000000000..226bc5164 --- /dev/null +++ b/pkg/sentry/fs/host/inode.go @@ -0,0 +1,506 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// inodeOperations implements fs.InodeOperations for an fs.Inodes backed +// by a host file descriptor. +type inodeOperations struct { + fsutil.InodeNotVirtual `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.DeprecatedFileOperations `state:"nosave"` + + // fileState implements fs.CachedFileObject. It exists + // to break a circular load dependency between inodeOperations + // and cachingInodeOps (below). 
+ fileState *inodeFileState `state:"wait"` + + // cachedInodeOps implements memmap.Mappable. + cachingInodeOps *fsutil.CachingInodeOperations + + // readdirMu protects the file offset on the host FD. This is needed + // for readdir because getdents must use the kernel offset, so + // concurrent readdirs must be exclusive. + // + // All read/write functions pass the offset directly to the kernel and + // thus don't need a lock. + readdirMu sync.Mutex `state:"nosave"` +} + +// inodeFileState implements fs.CachedFileObject and otherwise fully +// encapsulates state that needs to be manually loaded on restore for +// this file object. +// +// This unfortunate structure exists because fs.CachingInodeOperations +// defines afterLoad and therefore cannot be lazily loaded (to break a +// circular load dependency between it and inodeOperations). Even with +// lazy loading, this approach defines the dependencies between objects +// and the expected load behavior more concretely. +type inodeFileState struct { + // Common file system state. + mops *superOperations `state:"wait"` + + // descriptor is the backing host fd. + descriptor *descriptor `state:"wait"` + + // Event queue for blocking operations. + queue waiter.Queue `state:"nosave"` + + // sattr is used to restore the inodeOperations. + sattr fs.StableAttr `state:"wait"` + + // savedUAttr is only allocated during S/R. It points to the save-time + // unstable attributes and is used to validate restore-time ones. + // + // Note that these unstable attributes are only used to detect cross-S/R + // external file system metadata changes. They may differ from the + // cached unstable attributes in cachingInodeOps, as that might differ + // from the external file system attributes if there had been WriteOut + // failures. S/R is transparent to Sentry and the latter will continue + // using its cached values after restore. + savedUAttr *fs.UnstableAttr +} + +// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt. +func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { + // TODO: Using safemem.FromIOReader here is wasteful for two + // reasons: + // + // - Using preadv instead of iterated preads saves on host system calls. + // + // - Host system calls can handle destination memory that would fault in + // gr3 (i.e. they can accept safemem.Blocks with NeedSafecopy() == true), + // so the buffering performed by FromIOReader is unnecessary. + // + // This also applies to the write path below. + return safemem.FromIOReader{secio.NewOffsetReader(fd.NewReadWriter(i.FD()), int64(offset))}.ReadToBlocks(dsts) +} + +// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt. +func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { + return safemem.FromIOWriter{secio.NewOffsetWriter(fd.NewReadWriter(i.FD()), int64(offset))}.WriteFromBlocks(srcs) +} + +// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes. 
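+//
+// Each set bit in mask is forwarded to the host FD: Perms via fchmod(2),
+// Size via ftruncate(2), and the timestamps via setTimestamps. Ownership
+// (UID/GID) changes are refused with EPERM; host file ownership is never
+// changed through this layer.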
+func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error { + if mask.Empty() { + return nil + } + if mask.UID || mask.GID { + return syserror.EPERM + } + if mask.Perms { + if err := syscall.Fchmod(i.FD(), uint32(attr.Perms.LinuxMode())); err != nil { + return err + } + } + if mask.Size { + if err := syscall.Ftruncate(i.FD(), attr.Size); err != nil { + return err + } + } + if mask.AccessTime || mask.ModificationTime { + ts := fs.TimeSpec{ + ATime: attr.AccessTime, + ATimeOmit: !mask.AccessTime, + MTime: attr.ModificationTime, + MTimeOmit: !mask.ModificationTime, + } + if err := setTimestamps(i.FD(), ts); err != nil { + return err + } + } + return nil +} + +// Sync implements fsutil.CachedFileObject.Sync. +func (i *inodeFileState) Sync(ctx context.Context) error { + return syscall.Fsync(i.FD()) +} + +// FD implements fsutil.CachedFileObject.FD. +func (i *inodeFileState) FD() int { + return i.descriptor.value +} + +func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) { + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + return fs.UnstableAttr{}, err + } + return unstableAttr(i.mops, &s), nil +} + +// inodeOperations implements fs.InodeOperations. +var _ fs.InodeOperations = (*inodeOperations)(nil) + +// newInode returns a new fs.Inode backed by the host fd. +func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool, donated bool) (*fs.Inode, error) { + // Retrieve metadata. + var s syscall.Stat_t + err := syscall.Fstat(fd, &s) + if err != nil { + return nil, err + } + + fileState := &inodeFileState{ + mops: msrc.MountSourceOperations.(*superOperations), + sattr: stableAttr(&s), + } + + // Initialize the wrapped host file descriptor. + fileState.descriptor, err = newDescriptor( + fd, + donated, + saveable, + wouldBlock(&s), + &fileState.queue, + ) + if err != nil { + return nil, err + } + + // Build the fs.InodeOperations. + uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s) + iops := &inodeOperations{ + fileState: fileState, + cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache), + } + + // Return the fs.Inode. + return fs.NewInode(iops, msrc, fileState.sattr), nil +} + +// Mappable implements fs.InodeOperations.Mappable. +func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { + if !canMap(inode) { + return nil + } + return i.cachingInodeOps +} + +// ReturnsWouldBlock returns true if this host fd can return EWOULDBLOCK +// for operations that would block. +func (i *inodeOperations) ReturnsWouldBlock() bool { + return i.fileState.descriptor.wouldBlock +} + +// Release implements fs.InodeOperations.Release. +func (i *inodeOperations) Release(context.Context) { + i.fileState.descriptor.Release() + i.cachingInodeOps.Release() +} + +// Lookup implements fs.InodeOperations.Lookup. +func (i *inodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Get a new fd relative to i at name. + fd, err := open(i, name) + if err != nil { + if err == syserror.ENOENT { + return nil, syserror.ENOENT + } + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + // Return the fs.Dirent. + return fs.NewDirent(inode, name), nil +} + +// Create implements fs.InodeOperations.Create. 
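+//
+// The file is created on the host via openAt relative to this directory
+// with O_RDWR|O_CREAT|O_EXCL, so creation fails if the name already
+// exists; the resulting host FD then backs a new fs.Inode, as in Lookup.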
+func (i *inodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + // Create a file relative to i at name. + // + // N.B. We always open this file O_RDWR regardless of flags because a + // future GetFile might want more access. Open allows this regardless + // of perm. + fd, err := openAt(i, name, syscall.O_RDWR|syscall.O_CREAT|syscall.O_EXCL, perm.LinuxMode()) + if err != nil { + return nil, err + } + + inode, err := newInode(ctx, dir.MountSource, fd, false /* saveable */, false /* donated */) + if err != nil { + return nil, err + } + + d := fs.NewDirent(inode, name) + defer d.DecRef() + return inode.GetFile(ctx, d, flags) +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (i *inodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syscall.Mkdirat(i.fileState.FD(), name, uint32(perm.LinuxMode())) +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (i *inodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname string, newname string) error { + return createLink(i.fileState.FD(), oldname, newname) +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +func (*inodeOperations) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return syserror.EPERM +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (*inodeOperations) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// Remove implements fs.InodeOperations.Remove. +func (i *inodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, false /* dir */) +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (i *inodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + return unlinkAt(i.fileState.FD(), name, true /* dir */) +} + +// Rename implements fs.InodeOperations.Rename. +func (i *inodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + op, ok := oldParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + np, ok := newParent.InodeOperations.(*inodeOperations) + if !ok { + return syscall.EXDEV + } + return syscall.Renameat(op.fileState.FD(), oldName, np.fileState.FD(), newName) +} + +// Bind implements fs.InodeOperations.Bind. +func (i *inodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { + return syserror.EOPNOTSUPP +} + +// BoundEndpoint implements fs.InodeOperations.BoundEndpoint. +func (i *inodeOperations) BoundEndpoint(inode *fs.Inode, path string) unix.BoundEndpoint { + return nil +} + +// GetFile implements fs.InodeOperations.GetFile. +func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return newFile(ctx, d, flags, i, false), nil +} + +// canMap returns true if this fs.Inode can be memory mapped. +func canMap(inode *fs.Inode) bool { + // FIXME: Some obscure character devices can be mapped. + return fs.IsFile(inode.StableAttr) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. 
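+//
+// In short: attributes come straight from fstat(2) on the host FD unless
+// this inode is both mappable and mounted with ForcePageCache, in which
+// case the cached attributes in cachingInodeOps are authoritative.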
+func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + // When the kernel supports mapping host FDs, we do so to take + // advantage of the host page cache. We forego updating fs.Inodes + // because the host manages consistency of its own inode structures. + // + // For fs.Inodes that can never be mapped we take advantage of + // synchronizing metadata updates through host caches. + // + // So can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just obtain the attributes. + return i.fileState.unstableAttr(ctx) + } + // No, we're maintaining consistency of metadata ourselves. + return i.cachingInodeOps.UnstableAttr(ctx, inode) +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(context.Context, *fs.Inode, fs.FileOwner) error { + return syserror.EPERM +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, f fs.FilePermissions) bool { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Fchmod(i.fileState.FD(), uint32(f.LinuxMode())) == nil + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetPermissions(ctx, inode, f) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + // Can we use host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then just change the timestamps on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return setTimestamps(i.fileState.FD(), ts) + } + // Otherwise update our cached metadata. + return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + // Is the file not memory-mappable? + if !canMap(inode) { + // Then just change the file size on the fd, the host + // will synchronize the metadata update with any host + // inode and page cache. + return syscall.Ftruncate(i.fileState.FD(), size) + } + // Otherwise we need to go through cachingInodeOps, even if the host page + // cache is in use, to invalidate private copies of truncated pages. + return i.cachingInodeOps.Truncate(ctx, inode, size) +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error { + // Have we been using host kernel metadata caches? + if !inode.MountSource.Flags.ForcePageCache || !canMap(inode) { + // Then the metadata is already up to date on the host. + return nil + } + // Otherwise we need to write out cached pages and attributes + // that are dirty. + return i.cachingInodeOps.WriteOut(ctx, inode) +} + +// Readlink implements fs.InodeOperations.Readlink. 
+func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + return readLink(i.fileState.FD()) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + if !fs.IsSymlink(i.fileState.sattr) { + return nil, syserror.ENOLINK + } + return nil, fs.ErrResolveViaReadlink +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *inodeOperations) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syserror.ENOSYS +} + +// AddLink implements fs.InodeOperations.AddLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +// FIXME: Remove this from InodeOperations altogether. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {} + +// readdirAll returns all of the directory entries in i. +func (i *inodeOperations) readdirAll(d *dirInfo) (map[string]fs.DentAttr, error) { + i.readdirMu.Lock() + defer i.readdirMu.Unlock() + + fd := i.fileState.FD() + + // syscall.ReadDirent will use getdents, which will seek the file past + // the last directory entry. To read the directory entries a second + // time, we need to seek back to the beginning. + if _, err := syscall.Seek(fd, 0, 0); err != nil { + if err == syscall.ESPIPE { + // All directories should be seekable. If this file + // isn't seekable, it is not a directory and we should + // return that more sane error. + err = syscall.ENOTDIR + } + return nil, err + } + + names := make([]string, 0, 100) + for { + // Refill the buffer if necessary + if d.bufp >= d.nbuf { + d.bufp = 0 + // ReadDirent will just do a sys_getdents64 to the kernel. + n, err := syscall.ReadDirent(fd, d.buf) + if err != nil { + return nil, err + } + if n == 0 { + break // EOF + } + d.nbuf = n + } + + var nb int + // Parse the dirent buffer we just get and return the directory names along + // with the number of bytes consumed in the buffer. + nb, _, names = syscall.ParseDirent(d.buf[d.bufp:d.nbuf], -1, names) + d.bufp += nb + } + + entries := make(map[string]fs.DentAttr) + for _, filename := range names { + // Lookup the type and host device and inode. + stat, lerr := fstatat(fd, filename, linux.AT_SYMLINK_NOFOLLOW) + if lerr == syscall.ENOENT { + // File disappeared between readdir and lstat. + // Just treat it as if it didn't exist. + continue + } + + // There was a serious problem, we should probably report it. + if lerr != nil { + return nil, lerr + } + + entries[filename] = fs.DentAttr{ + Type: nodeType(&stat), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: stat.Dev, + Inode: stat.Ino, + }), + } + } + return entries, nil +} diff --git a/pkg/sentry/fs/host/inode_state.go b/pkg/sentry/fs/host/inode_state.go new file mode 100644 index 000000000..80066512a --- /dev/null +++ b/pkg/sentry/fs/host/inode_state.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// beforeSave is invoked by stateify. +func (i *inodeFileState) beforeSave() { + if !i.queue.IsEmpty() { + panic("event queue must be empty") + } + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + uattr, err := i.unstableAttr(context.Background()) + if err != nil { + panic(fmt.Sprintf("failed to get unstable atttribute of %s: %v", i.mops.inodeMappings[i.sattr.InodeID], err)) + } + i.savedUAttr = &uattr + } +} + +// afterLoad is invoked by stateify. +func (i *inodeFileState) afterLoad() { + // Initialize the descriptor value. + if err := i.descriptor.initAfterLoad(i.mops, i.sattr.InodeID, &i.queue); err != nil { + panic(fmt.Sprintf("failed to load value of descriptor: %v", err)) + } + + // Remap the inode number. + var s syscall.Stat_t + if err := syscall.Fstat(i.FD(), &s); err != nil { + panic(fmt.Sprintf("failed to get metadata for fd %d: %v", i.FD(), err)) + } + key := device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + } + if !hostFileDevice.Load(key, i.sattr.InodeID) { + // This means there was a conflict at s.Dev and s.Ino with + // another inode mapping: two files that were unique on the + // saved filesystem are no longer unique on this filesystem. + // Since this violates the contract that filesystems cannot + // change across save and restore, error out. + panic(fmt.Sprintf("host %s conflict in host device mappings: %s", key, hostFileDevice)) + } + + if !i.descriptor.donated && i.sattr.Type == fs.RegularFile { + env, ok := fs.CurrentRestoreEnvironment() + if !ok { + panic("missing restore environment") + } + uattr := unstableAttr(i.mops, &s) + if env.ValidateFileSize && uattr.Size != i.savedUAttr.Size { + panic(fmt.Errorf("file size has changed for %s: previously %d, now %d", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.Size, uattr.Size)) + } + if env.ValidateFileTimestamp && uattr.ModificationTime != i.savedUAttr.ModificationTime { + panic(fmt.Errorf("file modification time has changed for %s: previously %v, now %v", i.mops.inodeMappings[i.sattr.InodeID], i.savedUAttr.ModificationTime, uattr.ModificationTime)) + } + i.savedUAttr = nil + } +} diff --git a/pkg/sentry/fs/host/inode_test.go b/pkg/sentry/fs/host/inode_test.go new file mode 100644 index 000000000..0ff87c418 --- /dev/null +++ b/pkg/sentry/fs/host/inode_test.go @@ -0,0 +1,112 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package host + +import ( + "io/ioutil" + "os" + "path" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// TestMultipleReaddir verifies that multiple Readdir calls return the same +// thing if they use different dir contexts. +func TestMultipleReaddir(t *testing.T) { + p, err := ioutil.TempDir("", "readdir") + if err != nil { + t.Fatalf("Failed to create test dir: %v", err) + } + defer os.RemoveAll(p) + + f, err := os.Create(path.Join(p, "a.txt")) + if err != nil { + t.Fatalf("Failed to create a.txt: %v", err) + } + f.Close() + + f, err = os.Create(path.Join(p, "b.txt")) + if err != nil { + t.Fatalf("Failed to create b.txt: %v", err) + } + f.Close() + + fd, err := open(nil, p) + if err != nil { + t.Fatalf("Failed to open %q: %v", p, err) + } + ctx := contexttest.Context(t) + n, err := newInode(ctx, newMountSource(ctx, p, fs.RootOwner, &Filesystem{}, fs.MountSourceFlags{}, false), fd, false, false) + if err != nil { + t.Fatalf("Failed to create inode: %v", err) + } + + dirent := fs.NewDirent(n, "readdir") + openFile, err := n.GetFile(ctx, dirent, fs.FileFlags{Read: true}) + if err != nil { + t.Fatalf("Failed to get file: %v", err) + } + defer openFile.DecRef() + + c1 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c1, 0); err != nil { + t.Fatalf("First Readdir failed: %v", err) + } + + c2 := &fs.DirCtx{DirCursor: new(string)} + if _, err := openFile.FileOperations.(*fileOperations).IterateDir(ctx, c2, 0); err != nil { + t.Errorf("Second Readdir failed: %v", err) + } + + if _, ok := c1.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in first Readdir, got %v", c1.DentAttrs()) + } + if _, ok := c1.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in first Readdir, got %v", c1.DentAttrs()) + } + + if _, ok := c2.DentAttrs()["a.txt"]; !ok { + t.Errorf("want a.txt in second Readdir, got %v", c2.DentAttrs()) + } + if _, ok := c2.DentAttrs()["b.txt"]; !ok { + t.Errorf("want b.txt in second Readdir, got %v", c2.DentAttrs()) + } +} + +// TestCloseFD verifies fds will be closed. +func TestCloseFD(t *testing.T) { + var p [2]int + if err := syscall.Pipe(p[0:]); err != nil { + t.Fatalf("Failed to create pipe %v", err) + } + defer syscall.Close(p[0]) + defer syscall.Close(p[1]) + + // Use the write-end because we will detect if it's closed on the read end. + ctx := contexttest.Context(t) + file, err := NewFile(ctx, p[1], fs.RootOwner) + if err != nil { + t.Fatalf("Failed to create File: %v", err) + } + file.DecRef() + + s := make([]byte, 10) + if c, err := syscall.Read(p[0], s); c != 0 || err != nil { + t.Errorf("want 0, nil (EOF) from read end, got %v, %v", c, err) + } +} diff --git a/pkg/sentry/fs/host/ioctl_unsafe.go b/pkg/sentry/fs/host/ioctl_unsafe.go new file mode 100644 index 000000000..3c07c3850 --- /dev/null +++ b/pkg/sentry/fs/host/ioctl_unsafe.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +func ioctlGetTermios(fd int) (*linux.Termios, error) { + var t linux.Termios + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TCGETS, uintptr(unsafe.Pointer(&t))) + if errno != 0 { + return nil, errno + } + return &t, nil +} + +func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error { + _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t))) + if errno != 0 { + return errno + } + return nil +} diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go new file mode 100644 index 000000000..8e36ed7ee --- /dev/null +++ b/pkg/sentry/fs/host/socket.go @@ -0,0 +1,471 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket/control" + unixsocket "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/link/rawfile" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/unet" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +// endpoint encapsulates the state needed to represent a host Unix socket. +type endpoint struct { + queue waiter.Queue `state:"nosave"` + + // stype is the type of Unix socket. (Ex: unix.SockStream, + // unix.SockSeqpacket, unix.SockDgram) + stype unix.SockType `state:"nosave"` + + // fd is the host fd backing this file. + fd int `state:"nosave"` + + // If srfd >= 0, it is the host fd that fd was imported from. + srfd int `state:"wait"` +} + +func (e *endpoint) init() error { + family, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN) + if err != nil { + return err + } + + if family != syscall.AF_UNIX { + // We only allow Unix sockets. + return syserror.EINVAL + } + + stype, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_TYPE) + if err != nil { + return err + } + + if err := syscall.SetNonblock(e.fd, true); err != nil { + return err + } + + e.stype = unix.SockType(stype) + if err := fdnotifier.AddFD(int32(e.fd), &e.queue); err != nil { + return err + } + return nil +} + +// newEndpoint creates a new host endpoint. +func newEndpoint(fd int, srfd int) (*endpoint, error) { + ep := &endpoint{fd: fd, srfd: srfd} + if err := ep.init(); err != nil { + return nil, err + } + return ep, nil +} + +// newSocket allocates a new unix socket with host endpoint. 
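+//
+// If the socket must be saveable, the donated fd is first duplicated: the
+// endpoint owns the dup, while the original fd is recorded in srfd so the
+// socket can be restored by re-importing that fd (see ImportFile).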
+func newSocket(ctx context.Context, fd int, saveable bool) (*fs.File, error) { + ownedfd := fd + srfd := -1 + if saveable { + var err error + ownedfd, err = syscall.Dup(fd) + if err != nil { + return nil, err + } + srfd = fd + } + ep, err := newEndpoint(ownedfd, srfd) + if err != nil { + if saveable { + syscall.Close(ownedfd) + } + return nil, err + } + return unixsocket.New(ctx, ep), nil +} + +// NewSocketWithDirent allocates a new unix socket with host endpoint. +// +// This is currently only used by unsaveable Gofer nodes. +// +// NewSocketWithDirent takes ownership of f on success. +func NewSocketWithDirent(ctx context.Context, d *fs.Dirent, f *fd.FD, flags fs.FileFlags) (*fs.File, error) { + ep, err := newEndpoint(f.FD(), -1) + if err != nil { + return nil, err + } + + // Take ownship of the FD. + f.Release() + + return unixsocket.NewWithDirent(ctx, d, ep, flags), nil +} + +// Close implements unix.Endpoint.Close. +func (e *endpoint) Close() { + fdnotifier.RemoveFD(int32(e.fd)) + syscall.Close(e.fd) + e.fd = -1 +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (e *endpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { + e.queue.EventRegister(we, mask) + fdnotifier.UpdateFD(int32(e.fd)) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (e *endpoint) EventUnregister(we *waiter.Entry) { + e.queue.EventUnregister(we) + fdnotifier.UpdateFD(int32(e.fd)) +} + +// Readiness implements unix.Endpoint.Readiness. +func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { + return fdnotifier.NonBlockingPoll(int32(e.fd), mask) +} + +// Type implements unix.Endpoint.Type. +func (e *endpoint) Type() unix.SockType { + return e.stype +} + +// Connect implements unix.Endpoint.Connect. +func (e *endpoint) Connect(server unix.BoundEndpoint) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Bind implements unix.Endpoint.Bind. +func (e *endpoint) Bind(address tcpip.FullAddress, commit func() *tcpip.Error) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Listen implements unix.Endpoint.Listen. +func (e *endpoint) Listen(backlog int) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// Accept implements unix.Endpoint.Accept. +func (e *endpoint) Accept() (unix.Endpoint, *tcpip.Error) { + return nil, tcpip.ErrInvalidEndpointState +} + +// Shutdown implements unix.Endpoint.Shutdown. +func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error { + return tcpip.ErrInvalidEndpointState +} + +// GetSockOpt implements unix.Endpoint.GetSockOpt. +func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch o := opt.(type) { + case tcpip.ErrorOption: + _, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_ERROR) + return translateError(err) + case *tcpip.PasscredOption: + // We don't support passcred on host sockets. 
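+ // Report it as always disabled rather than returning an error.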
+ *o = 0 + return nil + case *tcpip.SendBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF) + *o = tcpip.SendBufferSizeOption(v) + return translateError(err) + case *tcpip.ReceiveBufferSizeOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_RCVBUF) + *o = tcpip.ReceiveBufferSizeOption(v) + return translateError(err) + case *tcpip.ReuseAddressOption: + v, err := syscall.GetsockoptInt(e.fd, syscall.SOL_SOCKET, syscall.SO_REUSEADDR) + *o = tcpip.ReuseAddressOption(v) + return translateError(err) + case *tcpip.ReceiveQueueSizeOption: + return tcpip.ErrQueueSizeNotSupported + } + return tcpip.ErrInvalidEndpointState +} + +// SetSockOpt implements unix.Endpoint.SetSockOpt. +func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + return nil +} + +// GetLocalAddress implements unix.Endpoint.GetLocalAddress. +func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// GetRemoteAddress implements unix.Endpoint.GetRemoteAddress. +func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{}, nil +} + +// Passcred returns whether or not the SO_PASSCRED socket option is +// enabled on this end. +func (e *endpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// ConnectedPasscred returns whether or not the SO_PASSCRED socket option +// is enabled on the connected end. +func (e *endpoint) ConnectedPasscred() bool { + // We don't support credential passing for host sockets. + return false +} + +// SendMsg implements unix.Endpoint.SendMsg. +func (e *endpoint) SendMsg(data [][]byte, controlMessages unix.ControlMessages, to unix.BoundEndpoint) (uintptr, *tcpip.Error) { + if to != nil { + return 0, tcpip.ErrInvalidEndpointState + } + return sendMsg(e.fd, data, controlMessages) +} + +func sendMsg(fd int, data [][]byte, controlMessages unix.ControlMessages) (uintptr, *tcpip.Error) { + if !controlMessages.Empty() { + return 0, tcpip.ErrInvalidEndpointState + } + n, err := fdWriteVec(fd, data) + return n, translateError(err) +} + +// RecvMsg implements unix.Endpoint.RecvMsg. +func (e *endpoint) RecvMsg(data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { + return recvMsg(e.fd, data, numRights, peek, addr) +} + +func recvMsg(fd int, data [][]byte, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, unix.ControlMessages, *tcpip.Error) { + var cm unet.ControlMessage + if numRights > 0 { + cm.EnableFDs(int(numRights)) + } + rl, ml, cl, err := fdReadVec(fd, data, []byte(cm), peek) + if err == syscall.EAGAIN { + return 0, 0, unix.ControlMessages{}, tcpip.ErrWouldBlock + } + if err != nil { + return 0, 0, unix.ControlMessages{}, translateError(err) + } + + // Trim the control data if we received less than the full amount. + if cl < uint64(len(cm)) { + cm = cm[:cl] + } + + // Avoid extra allocations in the case where there isn't any control data. 
+ if len(cm) == 0 { + return rl, ml, unix.ControlMessages{}, nil + } + + fds, err := cm.ExtractFDs() + if err != nil { + return 0, 0, unix.ControlMessages{}, translateError(err) + } + + if len(fds) == 0 { + return rl, ml, unix.ControlMessages{}, nil + } + return rl, ml, control.New(nil, nil, newSCMRights(fds)), nil +} + +// NewConnectedEndpoint creates a new unix.Receiver and unix.ConnectedEndpoint +// backed by a host FD that will pretend to be bound at a given sentry path. +func NewConnectedEndpoint(file *fd.FD, queue *waiter.Queue, path string) (unix.Receiver, unix.ConnectedEndpoint, *tcpip.Error) { + if err := fdnotifier.AddFD(int32(file.FD()), queue); err != nil { + return nil, nil, translateError(err) + } + + e := &connectedEndpoint{path: path, queue: queue, file: file} + + // AtomicRefCounters start off with a single reference. We need two. + e.ref.IncRef() + + return e, e, nil +} + +// connectedEndpoint is a host FD backed implementation of +// unix.ConnectedEndpoint and unix.Receiver. +// +// connectedEndpoint does not support save/restore for now. +type connectedEndpoint struct { + queue *waiter.Queue + path string + + // ref keeps track of references to a connectedEndpoint. + ref refs.AtomicRefCount + + // mu protects fd, readClosed and writeClosed. + mu sync.RWMutex + + // file is an *fd.FD containing the FD backing this endpoint. It must be + // set to nil if it has been closed. + file *fd.FD + + // readClosed is true if the FD has read shutdown or if it has been closed. + readClosed bool + + // writeClosed is true if the FD has write shutdown or if it has been + // closed. + writeClosed bool +} + +// Send implements unix.ConnectedEndpoint.Send. +func (c *connectedEndpoint) Send(data [][]byte, controlMessages unix.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *tcpip.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return 0, false, tcpip.ErrClosedForSend + } + n, err := sendMsg(c.file.FD(), data, controlMessages) + // There is no need for the callee to call SendNotify because sendMsg uses + // the host's sendmsg(2) and the host kernel's queue. + return n, false, err +} + +// SendNotify implements unix.ConnectedEndpoint.SendNotify. +func (c *connectedEndpoint) SendNotify() {} + +// CloseSend implements unix.ConnectedEndpoint.CloseSend. +func (c *connectedEndpoint) CloseSend() { + c.mu.Lock() + c.writeClosed = true + c.mu.Unlock() +} + +// CloseNotify implements unix.ConnectedEndpoint.CloseNotify. +func (c *connectedEndpoint) CloseNotify() {} + +// Writable implements unix.ConnectedEndpoint.Writable. +func (c *connectedEndpoint) Writable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.writeClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventOut)&waiter.EventOut != 0 +} + +// Passcred implements unix.ConnectedEndpoint.Passcred. +func (c *connectedEndpoint) Passcred() bool { + // We don't support credential passing for host sockets. + return false +} + +// GetLocalAddress implements unix.ConnectedEndpoint.GetLocalAddress. +func (c *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { + return tcpip.FullAddress{Addr: tcpip.Address(c.path)}, nil +} + +// EventUpdate implements unix.ConnectedEndpoint.EventUpdate. +func (c *connectedEndpoint) EventUpdate() { + c.mu.RLock() + defer c.mu.RUnlock() + if c.file.FD() != -1 { + fdnotifier.UpdateFD(int32(c.file.FD())) + } +} + +// Recv implements unix.Receiver.Recv. 
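
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): recvMsg
// above sizes a control-message buffer (cm.EnableFDs) and pulls any
// SCM_RIGHTS descriptors out of it with ExtractFDs before handing them to
// control.New. The same kernel facility, reduced to the plain syscall
// package (a self-contained toy, not the code path used here):
package main

import (
	"fmt"
	"syscall"
)

func main() {
	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}

	// Send one data byte plus stdin's descriptor as SCM_RIGHTS control data.
	oob := syscall.UnixRights(0)
	if err := syscall.Sendmsg(pair[0], []byte{1}, oob, nil, 0); err != nil {
		panic(err)
	}

	buf := make([]byte, 1)
	oobBuf := make([]byte, syscall.CmsgSpace(4))
	_, oobn, _, _, err := syscall.Recvmsg(pair[1], buf, oobBuf, 0)
	if err != nil {
		panic(err)
	}

	// Parse the control data back into descriptors; the receiver now owns
	// its own dup of the sender's FD.
	msgs, err := syscall.ParseSocketControlMessage(oobBuf[:oobn])
	if err != nil {
		panic(err)
	}
	fds, err := syscall.ParseUnixRights(&msgs[0])
	if err != nil {
		panic(err)
	}
	fmt.Println("received fds:", fds)
}
// --------------------------- end editorial note ----------------------------
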
+func (c *connectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, unix.ControlMessages, tcpip.FullAddress, bool, *tcpip.Error) { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return 0, 0, unix.ControlMessages{}, tcpip.FullAddress{}, false, tcpip.ErrClosedForReceive + } + rl, ml, cm, err := recvMsg(c.file.FD(), data, numRights, peek, nil) + // There is no need for the callee to call RecvNotify because recvMsg uses + // the host's recvmsg(2) and the host kernel's queue. + return rl, ml, cm, tcpip.FullAddress{Addr: tcpip.Address(c.path)}, false, err +} + +// close releases all resources related to the endpoint. +func (c *connectedEndpoint) close() { + fdnotifier.RemoveFD(int32(c.file.FD())) + c.file.Close() + c.file = nil +} + +// RecvNotify implements unix.Receiver.RecvNotify. +func (c *connectedEndpoint) RecvNotify() {} + +// CloseRecv implements unix.Receiver.CloseRecv. +func (c *connectedEndpoint) CloseRecv() { + c.mu.Lock() + c.readClosed = true + c.mu.Unlock() +} + +// Readable implements unix.Receiver.Readable. +func (c *connectedEndpoint) Readable() bool { + c.mu.RLock() + defer c.mu.RUnlock() + if c.readClosed { + return true + } + return fdnotifier.NonBlockingPoll(int32(c.file.FD()), waiter.EventIn)&waiter.EventIn != 0 +} + +// SendQueuedSize implements unix.Receiver.SendQueuedSize. +func (c *connectedEndpoint) SendQueuedSize() int64 { + // SendQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// RecvQueuedSize implements unix.Receiver.RecvQueuedSize. +func (c *connectedEndpoint) RecvQueuedSize() int64 { + // RecvQueuedSize isn't supported for host sockets because we don't allow the + // sentry to call ioctl(2). + return -1 +} + +// SendMaxQueueSize implements unix.Receiver.SendMaxQueueSize. +func (c *connectedEndpoint) SendMaxQueueSize() int64 { + v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_SNDBUF) + if err != nil { + return -1 + } + return int64(v) +} + +// RecvMaxQueueSize implements unix.Receiver.RecvMaxQueueSize. +func (c *connectedEndpoint) RecvMaxQueueSize() int64 { + v, err := syscall.GetsockoptInt(c.file.FD(), syscall.SOL_SOCKET, syscall.SO_RCVBUF) + if err != nil { + return -1 + } + return int64(v) +} + +// Release implements unix.ConnectedEndpoint.Release and unix.Receiver.Release. +func (c *connectedEndpoint) Release() { + c.ref.DecRefWithDestructor(c.close) +} + +func translateError(err error) *tcpip.Error { + if err == nil { + return nil + } + return rawfile.TranslateErrno(err.(syscall.Errno)) +} diff --git a/pkg/sentry/fs/host/socket_state.go b/pkg/sentry/fs/host/socket_state.go new file mode 100644 index 000000000..6acabd55a --- /dev/null +++ b/pkg/sentry/fs/host/socket_state.go @@ -0,0 +1,39 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "fmt" + "syscall" +) + +// beforeSave is invoked by stateify. 
+func (ep *endpoint) beforeSave() { + if ep.srfd < 0 { + panic("only host file descriptors provided at sentry startup can be saved") + } +} + +// afterLoad is invoked by stateify. +func (ep *endpoint) afterLoad() { + fd, err := syscall.Dup(ep.srfd) + if err != nil { + panic(fmt.Sprintf("failed to dup restored fd %d: %v", ep.srfd, err)) + } + ep.fd = fd + if err := ep.init(); err != nil { + panic(fmt.Sprintf("Could not restore host socket fd %d: %v", ep.srfd, err)) + } +} diff --git a/pkg/sentry/fs/host/socket_test.go b/pkg/sentry/fs/host/socket_test.go new file mode 100644 index 000000000..80c46dcfa --- /dev/null +++ b/pkg/sentry/fs/host/socket_test.go @@ -0,0 +1,401 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "reflect" + "syscall" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/fd" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/socket" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserr" + "gvisor.googlesource.com/gvisor/pkg/tcpip" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" + "gvisor.googlesource.com/gvisor/pkg/waiter/fdnotifier" +) + +var ( + // Make sure that connectedEndpoint implements unix.ConnectedEndpoint. + _ = unix.ConnectedEndpoint(new(connectedEndpoint)) + + // Make sure that connectedEndpoint implements unix.Receiver. + _ = unix.Receiver(new(connectedEndpoint)) +) + +func getFl(fd int) (uint32, error) { + fl, _, err := syscall.RawSyscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0) + if err == 0 { + return uint32(fl), nil + } + return 0, err +} + +func TestSocketIsBlocking(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + + fl, err := getFl(pair[0]) + if err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Fatalf("Expected socket %v to be blocking", pair[1]) + } + sock, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) failed => %v", pair[0], err) + } + defer sock.DecRef() + // Test that the socket now is non blocking. 
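
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): the
// assertion that follows depends on endpoint.init() having called
// syscall.SetNonblock on the imported end. The same observation outside the
// test harness, as a standalone toy:
package main

import (
	"fmt"
	"syscall"
)

func main() {
	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}
	if err := syscall.SetNonblock(pair[0], true); err != nil {
		panic(err)
	}
	fl, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(pair[0]), syscall.F_GETFL, 0)
	if errno != 0 {
		panic(errno)
	}
	fmt.Println("O_NONBLOCK set:", fl&syscall.O_NONBLOCK != 0) // true
}
// --------------------------- end editorial note ----------------------------
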
+ if fl, err = getFl(pair[0]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[0], err) + } + if fl&syscall.O_NONBLOCK != syscall.O_NONBLOCK { + t.Errorf("Expected socket %v to have becoming non blocking", pair[0]) + } + if fl, err = getFl(pair[1]); err != nil { + t.Fatalf("getFl: fcntl(%v, GETFL) => %v", pair[1], err) + } + if fl&syscall.O_NONBLOCK == syscall.O_NONBLOCK { + t.Errorf("Did not expect socket %v to become non blocking", pair[1]) + } +} + +func TestSocketWritev(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + buf := []byte("hello world\n") + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(buf)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != int64(len(buf)) { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketWritevLen0(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + socket, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer socket.DecRef() + n, err := socket.Writev(contexttest.Context(t), usermem.BytesIOSequence(nil)) + if err != nil { + t.Fatalf("socket writev failed: %v", err) + } + + if n != 0 { + t.Fatalf("socket writev wrote incorrect bytes: %d", n) + } +} + +func TestSocketSendMsgLen0(t *testing.T) { + // Using socketpair here because it's already connected. + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("host socket creation failed: %v", err) + } + sfile, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile.DecRef() + + s := sfile.FileOperations.(socket.Socket) + n, terr := s.SendMsg(nil, usermem.BytesIOSequence(nil), []byte{}, 0, unix.ControlMessages{}) + if n != 0 { + t.Fatalf("socket sendmsg() failed: %v wrote: %d", terr, n) + } + + if terr != nil { + t.Fatalf("socket sendmsg() failed: %v", terr) + } +} + +func TestListen(t *testing.T) { + pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile1, err := newSocket(contexttest.Context(t), pair[0], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[0], err) + } + defer sfile1.DecRef() + socket1 := sfile1.FileOperations.(socket.Socket) + + sfile2, err := newSocket(contexttest.Context(t), pair[1], false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", pair[1], err) + } + defer sfile2.DecRef() + socket2 := sfile2.FileOperations.(socket.Socket) + + // Socketpairs can not be listened to. 
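
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): the
// assertions that follow mirror host behavior -- listen(2) on a socket that
// is already connected (a socketpair end) or was never bound fails with
// EINVAL, which the sentry surfaces as ErrInvalidEndpointState. A minimal
// host-only demonstration:
package main

import (
	"fmt"
	"syscall"
)

func main() {
	pair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}
	// Already connected and never bound: the host kernel rejects listen(2).
	err = syscall.Listen(pair[0], 64)
	fmt.Println(err == syscall.EINVAL) // true
}
// --------------------------- end editorial note ----------------------------
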
+ if err := socket1.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket1.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + if err := socket2.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket2.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } + + // Create a Unix socket, do not bind it. + sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) + if err != nil { + t.Fatalf("syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM, 0) => %v", err) + } + sfile3, err := newSocket(contexttest.Context(t), sock, false) + if err != nil { + t.Fatalf("newSocket(%v) => %v", sock, err) + } + defer sfile3.DecRef() + socket3 := sfile3.FileOperations.(socket.Socket) + + // This socket is not bound so we can't listen on it. + if err := socket3.Listen(nil, 64); err != syserr.ErrInvalidEndpointState { + t.Fatalf("socket3.Listen(nil, 64) => %v, want syserr.ErrInvalidEndpointState", err) + } +} + +func TestSend(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if _, _, err := e.Send(nil, unix.ControlMessages{}, tcpip.FullAddress{}); err != tcpip.ErrClosedForSend { + t.Errorf("Got %#v.Send() = %v, want = %v", e, err, tcpip.ErrClosedForSend) + } +} + +func TestRecv(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if _, _, _, _, _, err := e.Recv(nil, false, 0, false); err != tcpip.ErrClosedForReceive { + t.Errorf("Got %#v.Recv() = %v, want = %v", e, err, tcpip.ErrClosedForReceive) + } +} + +func TestPasscred(t *testing.T) { + e := connectedEndpoint{} + if got, want := e.Passcred(), false; got != want { + t.Errorf("Got %#v.Passcred() = %t, want = %t", e, got, want) + } +} + +func TestGetLocalAddress(t *testing.T) { + e := connectedEndpoint{path: "foo"} + want := tcpip.FullAddress{Addr: tcpip.Address("foo")} + if got, err := e.GetLocalAddress(); err != nil || got != want { + t.Errorf("Got %#v.GetLocalAddress() = %#v, %v, want = %#v, %v", e, got, err, want, nil) + } +} + +func TestQueuedSize(t *testing.T) { + e := connectedEndpoint{} + tests := []struct { + name string + f func() int64 + }{ + {"SendQueuedSize", e.SendQueuedSize}, + {"RecvQueuedSize", e.RecvQueuedSize}, + } + + for _, test := range tests { + if got, want := test.f(), int64(-1); got != want { + t.Errorf("Got %#v.%s() = %d, want = %d", e, test.name, got, want) + } + } +} + +func TestReadable(t *testing.T) { + e := connectedEndpoint{readClosed: true} + if got, want := e.Readable(), true; got != want { + t.Errorf("Got %#v.Readable() = %t, want = %t", e, got, want) + } +} + +func TestWritable(t *testing.T) { + e := connectedEndpoint{writeClosed: true} + if got, want := e.Writable(), true; got != want { + t.Errorf("Got %#v.Writable() = %t, want = %t", e, got, want) + } +} + +func TestRelease(t *testing.T) { + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + want := &connectedEndpoint{queue: c.queue} + want.ref.DecRef() + fdnotifier.AddFD(int32(c.file.FD()), nil) + c.Release() + if !reflect.DeepEqual(c, want) { + t.Errorf("got = %#v, want = %#v", c, want) + } +} + +func TestClose(t *testing.T) { + type testCase struct { + name string + cep *connectedEndpoint + addFD bool + f func() + want *connectedEndpoint + } + + var tests []testCase + + // nil is the value used by connectedEndpoint to indicate a closed file. 
+ // Non-nil files are used to check if the file gets closed. + + f, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c := &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "Second CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f)} + tests = append(tests, testCase{ + name: "First CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = append(tests, testCase{ + name: "Second CloseSend", + cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), writeClosed: true} + tests = append(tests, testCase{ + name: "CloseSend then CloseRecv", + cep: c, + addFD: true, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true} + tests = append(tests, testCase{ + name: "CloseRecv then CloseSend", + cep: c, + addFD: true, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseRecv", + cep: c, + addFD: false, + f: c.CloseRecv, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + f, err = syscall.Socket(syscall.AF_UNIX, syscall.SOCK_STREAM|syscall.SOCK_NONBLOCK|syscall.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("Creating socket:", err) + } + c = &connectedEndpoint{queue: &waiter.Queue{}, file: fd.New(f), readClosed: true, writeClosed: true} + tests = append(tests, testCase{ + name: "Full close then CloseSend", + 
cep: c, + addFD: false, + f: c.CloseSend, + want: &connectedEndpoint{queue: c.queue, file: c.file, readClosed: true, writeClosed: true}, + }) + + for _, test := range tests { + if test.addFD { + fdnotifier.AddFD(int32(test.cep.file.FD()), nil) + } + if test.f(); !reflect.DeepEqual(test.cep, test.want) { + t.Errorf("%s: got = %#v, want = %#v", test.name, test.cep, test.want) + } + } +} diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go new file mode 100644 index 000000000..bf8da6867 --- /dev/null +++ b/pkg/sentry/fs/host/socket_unsafe.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" +) + +// buildIovec builds an iovec slice from the given []byte slice. +func buildIovec(bufs [][]byte) (uintptr, []syscall.Iovec) { + var length uintptr + iovecs := make([]syscall.Iovec, 0, 10) + for i := range bufs { + if l := len(bufs[i]); l > 0 { + length += uintptr(l) + iovecs = append(iovecs, syscall.Iovec{ + Base: &bufs[i][0], + Len: uint64(l), + }) + } + } + return length, iovecs +} + +func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool) (readLen uintptr, msgLen uintptr, controlLen uint64, err error) { + flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC) + if peek { + flags |= syscall.MSG_PEEK + } + + length, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + if len(control) != 0 { + msg.Control = &control[0] + msg.Controllen = uint64(len(control)) + } + + if len(iovecs) != 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags) + if e != 0 { + return 0, 0, 0, e + } + + if n > length { + return length, n, msg.Controllen, nil + } + + return n, n, msg.Controllen, nil +} + +func fdWriteVec(fd int, bufs [][]byte) (uintptr, error) { + _, iovecs := buildIovec(bufs) + + var msg syscall.Msghdr + if len(iovecs) > 0 { + msg.Iov = &iovecs[0] + msg.Iovlen = uint64(len(iovecs)) + } + n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL) + if e != 0 { + return 0, e + } + + return n, nil +} diff --git a/pkg/sentry/fs/host/util.go b/pkg/sentry/fs/host/util.go new file mode 100644 index 000000000..74c703eb7 --- /dev/null +++ b/pkg/sentry/fs/host/util.go @@ -0,0 +1,197 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
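
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): the
// socket_unsafe.go helpers above (buildIovec, fdReadVec, fdWriteVec) gather
// [][]byte buffers into iovecs and issue a single recvmsg(2)/sendmsg(2). The
// same gather pattern with plain writev(2), as a standalone toy for
// linux/amd64 (where Iovec.Len is a uint64):
package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

func main() {
	bufs := [][]byte{[]byte("hello "), []byte("world\n")}

	// Gather non-empty buffers into iovecs, as buildIovec does.
	iovecs := make([]syscall.Iovec, 0, len(bufs))
	for i := range bufs {
		if len(bufs[i]) == 0 {
			continue
		}
		iovecs = append(iovecs, syscall.Iovec{
			Base: &bufs[i][0],
			Len:  uint64(len(bufs[i])),
		})
	}

	// A single writev(2) writes both buffers to stdout (fd 1).
	n, _, errno := syscall.Syscall(syscall.SYS_WRITEV, uintptr(1),
		uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs)))
	if errno != 0 {
		panic(errno)
	}
	fmt.Println("wrote", n, "bytes")
}
// --------------------------- end editorial note ----------------------------
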
+ +package host + +import ( + "os" + "path" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func open(parent *inodeOperations, name string) (int, error) { + if parent == nil && !path.IsAbs(name) { + return -1, syserror.EINVAL + } + name = path.Clean(name) + + // Don't follow through symlinks. + flags := syscall.O_NOFOLLOW + + if fd, err := openAt(parent, name, flags|syscall.O_RDWR, 0); err == nil { + return fd, nil + } + // Retry as read-only. + if fd, err := openAt(parent, name, flags|syscall.O_RDONLY, 0); err == nil { + return fd, nil + } + + // Retry as write-only. + if fd, err := openAt(parent, name, flags|syscall.O_WRONLY, 0); err == nil { + return fd, nil + } + + // Retry as a symlink, by including O_PATH as an option. + fd, err := openAt(parent, name, linux.O_PATH|flags, 0) + if err == nil { + return fd, nil + } + + // Everything failed. + return -1, err +} + +func openAt(parent *inodeOperations, name string, flags int, perm linux.FileMode) (int, error) { + if parent == nil { + return syscall.Open(name, flags, uint32(perm)) + } + return syscall.Openat(parent.fileState.FD(), name, flags, uint32(perm)) +} + +func nodeType(s *syscall.Stat_t) fs.InodeType { + switch x := (s.Mode & syscall.S_IFMT); x { + case syscall.S_IFLNK: + return fs.Symlink + case syscall.S_IFIFO: + return fs.Pipe + case syscall.S_IFCHR: + return fs.CharacterDevice + case syscall.S_IFBLK: + return fs.BlockDevice + case syscall.S_IFSOCK: + return fs.Socket + case syscall.S_IFDIR: + return fs.Directory + case syscall.S_IFREG: + return fs.RegularFile + default: + // This shouldn't happen, but just in case... + log.Warningf("unknown host file type %d: assuming regular", x) + return fs.RegularFile + } +} + +func wouldBlock(s *syscall.Stat_t) bool { + typ := nodeType(s) + return typ == fs.Pipe || typ == fs.Socket || typ == fs.CharacterDevice +} + +func stableAttr(s *syscall.Stat_t) fs.StableAttr { + return fs.StableAttr{ + Type: nodeType(s), + DeviceID: hostFileDevice.DeviceID(), + InodeID: hostFileDevice.Map(device.MultiDeviceKey{ + Device: s.Dev, + Inode: s.Ino, + }), + BlockSize: int64(s.Blksize), + } +} + +func owner(mo *superOperations, s *syscall.Stat_t) fs.FileOwner { + // User requested no translation, just return actual owner. + if mo.dontTranslateOwnership { + return fs.FileOwner{auth.KUID(s.Uid), auth.KGID(s.Gid)} + } + + // Show only IDs relevant to the sandboxed task. I.e. if we not own the + // file, no sandboxed task can own the file. In that case, we + // use OverflowID for UID, implying that the IDs are not mapped in the + // "root" user namespace. + // + // E.g. + // sandbox's host EUID/EGID is 1/1. + // some_dir's host UID/GID is 2/1. + // Task that mounted this fs has virtualized EUID/EGID 5/5. + // + // If you executed `ls -n` in the sandboxed task, it would show: + // drwxwrxwrx [...] 65534 5 [...] some_dir + + // Files are owned by OverflowID by default. + owner := fs.FileOwner{auth.KUID(auth.OverflowUID), auth.KGID(auth.OverflowGID)} + + // If we own file on host, let mounting task's initial EUID own + // the file. 
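
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): open()
// earlier in this file degrades gracefully -- try O_RDWR, then O_RDONLY,
// then O_WRONLY, and finally O_PATH so symlinks and otherwise unopenable
// nodes can still be represented. A hypothetical condensed sketch of that
// ladder (openBestEffort and oPath are invented names; the real code uses
// openAt and linux.O_PATH):
package main

import (
	"fmt"
	"syscall"
)

// oPath is O_PATH from <asm-generic/fcntl.h>; the legacy syscall package
// does not export it.
const oPath = 0x200000

func openBestEffort(name string) (int, error) {
	base := syscall.O_NOFOLLOW
	var lastErr error
	for _, flags := range []int{
		base | syscall.O_RDWR,
		base | syscall.O_RDONLY,
		base | syscall.O_WRONLY,
		base | oPath, // last resort: a handle good enough for stat/readlinkat.
	} {
		fd, err := syscall.Open(name, flags, 0)
		if err == nil {
			return fd, nil
		}
		lastErr = err
	}
	return -1, lastErr
}

func main() {
	fd, err := openBestEffort("/etc/hostname")
	fmt.Println(fd, err)
}
// --------------------------- end editorial note ----------------------------
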
+ if s.Uid == hostUID { + owner.UID = mo.mounter.UID + } + + // If our group matches file's group, make file's group match + // the mounting task's initial EGID. + for _, gid := range hostGIDs { + if s.Gid == gid { + owner.GID = mo.mounter.GID + break + } + } + return owner +} + +func unstableAttr(mo *superOperations, s *syscall.Stat_t) fs.UnstableAttr { + return fs.UnstableAttr{ + Size: s.Size, + Usage: s.Blocks * 512, + Perms: fs.FilePermsFromMode(linux.FileMode(s.Mode)), + Owner: owner(mo, s), + AccessTime: ktime.FromUnix(s.Atim.Sec, s.Atim.Nsec), + ModificationTime: ktime.FromUnix(s.Mtim.Sec, s.Mtim.Nsec), + StatusChangeTime: ktime.FromUnix(s.Ctim.Sec, s.Ctim.Nsec), + Links: s.Nlink, + } +} + +type dirInfo struct { + buf []byte // buffer for directory I/O. + nbuf int // length of buf; return value from ReadDirent. + bufp int // location of next record in buf. +} + +// isBlockError unwraps os errors and checks if they are caused by EAGAIN or +// EWOULDBLOCK. This is so they can be transformed into syserror.ErrWouldBlock. +func isBlockError(err error) bool { + if err == syserror.EAGAIN || err == syserror.EWOULDBLOCK { + return true + } + if pe, ok := err.(*os.PathError); ok { + return isBlockError(pe.Err) + } + return false +} + +func hostEffectiveKIDs() (uint32, []uint32, error) { + gids, err := os.Getgroups() + if err != nil { + return 0, nil, err + } + egids := make([]uint32, len(gids)) + for i, gid := range gids { + egids[i] = uint32(gid) + } + return uint32(os.Geteuid()), append(egids, uint32(os.Getegid())), nil +} + +var hostUID uint32 +var hostGIDs []uint32 + +func init() { + hostUID, hostGIDs, _ = hostEffectiveKIDs() +} diff --git a/pkg/sentry/fs/host/util_unsafe.go b/pkg/sentry/fs/host/util_unsafe.go new file mode 100644 index 000000000..c38d2392d --- /dev/null +++ b/pkg/sentry/fs/host/util_unsafe.go @@ -0,0 +1,137 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package host + +import ( + "syscall" + "unsafe" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" +) + +func createLink(fd int, name string, linkName string) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + linkNamePtr, err := syscall.BytePtrFromString(linkName) + if err != nil { + return err + } + _, _, errno := syscall.Syscall( + syscall.SYS_SYMLINKAT, + uintptr(unsafe.Pointer(namePtr)), + uintptr(fd), + uintptr(unsafe.Pointer(linkNamePtr))) + if errno != 0 { + return errno + } + return nil +} + +func readLink(fd int) (string, error) { + // Buffer sizing copied from os.Readlink. 
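
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change):
// setTimestamps, a little further below in this file, maps the sentry's
// fs.TimeSpec onto utimensat(2)'s two special tv_nsec values: UTIME_OMIT
// leaves a timestamp untouched, UTIME_NOW lets the kernel stamp the current
// time. A hedged restatement of that mapping (constant values from
// <linux/stat.h>; the real code takes them from the abi/linux package):
package sketch

import "syscall"

const (
	utimeNow  = (1 << 30) - 1 // UTIME_NOW
	utimeOmit = (1 << 30) - 2 // UTIME_OMIT
)

// timespecFor mirrors timespecFromTimestamp: "omit" wins over "set to system
// time", which wins over an explicit nanosecond timestamp.
func timespecFor(nsec int64, omit, now bool) syscall.Timespec {
	switch {
	case omit:
		return syscall.Timespec{Nsec: utimeOmit}
	case now:
		return syscall.Timespec{Nsec: utimeNow}
	default:
		return syscall.NsecToTimespec(nsec)
	}
}
// --------------------------- end editorial note ----------------------------
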
+ for l := 128; ; l *= 2 { + b := make([]byte, l) + n, _, errno := syscall.Syscall6( + syscall.SYS_READLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(syscall.StringBytePtr(""))), + uintptr(unsafe.Pointer(&b[0])), + uintptr(l), + 0, 0) + if n < 0 { + n = 0 + } + if errno != 0 { + return "", errno + } + if n < uintptr(l) { + return string(b[:n]), nil + } + } +} + +func unlinkAt(fd int, name string, dir bool) error { + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return err + } + var flags uintptr + if dir { + flags = linux.AT_REMOVEDIR + } + _, _, errno := syscall.Syscall( + syscall.SYS_UNLINKAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + flags, + ) + if errno != 0 { + return errno + } + return nil +} + +func timespecFromTimestamp(t ktime.Time, omit, setSysTime bool) syscall.Timespec { + if omit { + return syscall.Timespec{0, linux.UTIME_OMIT} + } + if setSysTime { + return syscall.Timespec{0, linux.UTIME_NOW} + } + return syscall.NsecToTimespec(t.Nanoseconds()) +} + +func setTimestamps(fd int, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + var sts [2]syscall.Timespec + sts[0] = timespecFromTimestamp(ts.ATime, ts.ATimeOmit, ts.ATimeSetSystemTime) + sts[1] = timespecFromTimestamp(ts.MTime, ts.MTimeOmit, ts.MTimeSetSystemTime) + _, _, errno := syscall.Syscall6( + syscall.SYS_UTIMENSAT, + uintptr(fd), + 0, /* path */ + uintptr(unsafe.Pointer(&sts)), + 0, /* flags */ + 0, 0) + if errno != 0 { + return errno + } + return nil +} + +func fstatat(fd int, name string, flags int) (syscall.Stat_t, error) { + var stat syscall.Stat_t + namePtr, err := syscall.BytePtrFromString(name) + if err != nil { + return stat, err + } + _, _, errno := syscall.Syscall6( + syscall.SYS_NEWFSTATAT, + uintptr(fd), + uintptr(unsafe.Pointer(namePtr)), + uintptr(unsafe.Pointer(&stat)), + uintptr(flags), + 0, 0) + if errno != 0 { + return stat, errno + } + return stat, nil +} diff --git a/pkg/sentry/fs/host/wait_test.go b/pkg/sentry/fs/host/wait_test.go new file mode 100644 index 000000000..c5f5c9c0d --- /dev/null +++ b/pkg/sentry/fs/host/wait_test.go @@ -0,0 +1,70 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
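
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): TestWait in
// the file below registers a pipe's read end with the sentry's fdnotifier,
// which multiplexes host FDs over an epoll instance and wakes registered
// waiter entries. The raw host mechanism that sits underneath, reduced to a
// standalone toy:
package main

import (
	"fmt"
	"syscall"
)

func main() {
	var fds [2]int
	if err := syscall.Pipe(fds[:]); err != nil {
		panic(err)
	}

	epfd, err := syscall.EpollCreate1(0)
	if err != nil {
		panic(err)
	}
	ev := syscall.EpollEvent{Events: syscall.EPOLLIN, Fd: int32(fds[0])}
	if err := syscall.EpollCtl(epfd, syscall.EPOLL_CTL_ADD, fds[0], &ev); err != nil {
		panic(err)
	}

	events := make([]syscall.EpollEvent, 1)

	// Nothing written yet: a zero-timeout wait reports no readiness.
	n, err := syscall.EpollWait(epfd, events, 0)
	if err != nil {
		panic(err)
	}
	fmt.Println("ready before write:", n)

	// After a write the read end becomes readable and the wait returns.
	syscall.Write(fds[1], []byte{1})
	n, err = syscall.EpollWait(epfd, events, -1)
	if err != nil {
		panic(err)
	}
	fmt.Println("ready after write:", n)
}
// --------------------------- end editorial note ----------------------------
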
+ +package host + +import ( + "syscall" + "testing" + "time" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +func TestWait(t *testing.T) { + var fds [2]int + err := syscall.Pipe(fds[:]) + if err != nil { + t.Fatalf("Unable to create pipe: %v", err) + } + + defer syscall.Close(fds[1]) + + ctx := contexttest.Context(t) + file, err := NewFile(ctx, fds[0], fs.RootOwner) + if err != nil { + syscall.Close(fds[0]) + t.Fatalf("NewFile failed: %v", err) + } + + defer file.DecRef() + + r := file.Readiness(waiter.EventIn) + if r != 0 { + t.Fatalf("File is ready for read when it shouldn't be.") + } + + e, ch := waiter.NewChannelEntry(nil) + file.EventRegister(&e, waiter.EventIn) + defer file.EventUnregister(&e) + + // Check that there are no notifications yet. + if len(ch) != 0 { + t.Fatalf("Channel is non-empty") + } + + // Write to the pipe, so it should be writable now. + syscall.Write(fds[1], []byte{1}) + + // Check that we get a notification. We need to yield the current thread + // so that the fdnotifier can deliver notifications, so we use a + // 1-second timeout instead of just checking the length of the channel. + select { + case <-ch: + case <-time.After(1 * time.Second): + t.Fatalf("Channel not notified") + } +} diff --git a/pkg/sentry/fs/inode.go b/pkg/sentry/fs/inode.go new file mode 100644 index 000000000..b624f4182 --- /dev/null +++ b/pkg/sentry/fs/inode.go @@ -0,0 +1,455 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Inode is a file system object that can be simulatenously referenced by different +// components of the VFS (Dirent, fs.File, etc). +type Inode struct { + // AtomicRefCount is our reference count. + refs.AtomicRefCount + + // InodeOperations is the file system specific behavior of the Inode. + InodeOperations InodeOperations + + // StableAttr are stable cached attributes of the Inode. + StableAttr StableAttr + + // LockCtx is the file lock context. It manages its own sychronization and tracks + // regions of the Inode that have locks held. + LockCtx LockCtx + + // Watches is the set of inotify watches for this inode. + Watches *Watches + + // MountSource is the mount source this Inode is a part of. + MountSource *MountSource + + // overlay is the overlay entry for this Inode. 
+ overlay *overlayEntry +} + +// LockCtx is an Inode's lock context and contains different personalities of locks; both +// Posix and BSD style locks are supported. +// +// Note that in Linux fcntl(2) and flock(2) locks are _not_ cooperative, because race and +// deadlock conditions make merging them prohibitive. We do the same and keep them oblivious +// to each other but provide a "context" as a convenient container. +type LockCtx struct { + // Posix is a set of POSIX-style regional advisory locks, see fcntl(2). + Posix lock.Locks + + // BSD is a set of BSD-style advisory file wide locks, see flock(2). + BSD lock.Locks +} + +// NewInode constructs an Inode from InodeOperations, a MountSource, and stable attributes. +// +// NewInode takes a reference on msrc. +func NewInode(iops InodeOperations, msrc *MountSource, sattr StableAttr) *Inode { + msrc.IncRef() + return &Inode{ + InodeOperations: iops, + StableAttr: sattr, + Watches: newWatches(), + MountSource: msrc, + } +} + +// DecRef drops a reference on the Inode. +func (i *Inode) DecRef() { + i.DecRefWithDestructor(i.destroy) +} + +// destroy releases the Inode and releases the msrc reference taken. +func (i *Inode) destroy() { + // FIXME: Context is not plumbed here. + ctx := context.Background() + if err := i.WriteOut(ctx); err != nil { + // FIXME: Mark as warning again once noatime is + // properly supported. + log.Debugf("Inode %+v, failed to sync all metadata: %v", i.StableAttr, err) + } + + // If this inode is being destroyed because it was unlinked, queue a + // deletion event. This may not be the case for inodes being revalidated. + if i.Watches.unlinked { + i.Watches.Notify("", linux.IN_DELETE_SELF, 0) + } + + // Remove references from the watch owners to the watches on this inode, + // since the watches are about to be GCed. Note that we don't need to worry + // about the watch pins since if there were any active pins, this inode + // wouldn't be in the destructor. + i.Watches.targetDestroyed() + + // Overlay resources should be released synchronously, since they may + // trigger more Inode.destroy calls which must themselves be handled + // synchronously, like the WriteOut call above. + if i.overlay != nil { + i.overlay.release() + i.MountSource.DecRef() + return + } + + // Regular (non-overlay) resources may be released asynchronously. + Async(func() { + i.InodeOperations.Release(ctx) + i.MountSource.DecRef() + }) +} + +// Mappable calls i.InodeOperations.Mappable. +func (i *Inode) Mappable() memmap.Mappable { + if i.overlay != nil { + // In an overlay, Mappable is always implemented by + // the overlayEntry metadata to synchronize memory + // access of files with copy up. But first check if + // the Inodes involved would be mappable in the first + // place. + i.overlay.copyMu.RLock() + ok := i.overlay.isMappableLocked() + i.overlay.copyMu.RUnlock() + if !ok { + return nil + } + return i.overlay + } + return i.InodeOperations.Mappable(i) +} + +// WriteOut calls i.InodeOperations.WriteOut with i as the Inode. +func (i *Inode) WriteOut(ctx context.Context) error { + if i.overlay != nil { + return overlayWriteOut(ctx, i.overlay) + } + return i.InodeOperations.WriteOut(ctx, i) +} + +// Lookup calls i.InodeOperations.Lookup with i as the directory. +func (i *Inode) Lookup(ctx context.Context, name string) (*Dirent, error) { + if i.overlay != nil { + return overlayLookup(ctx, i.overlay, i, name) + } + return i.InodeOperations.Lookup(ctx, i, name) +} + +// Create calls i.InodeOperations.Create with i as the directory. 
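
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change): every method
// on Inode in this file has the same shape -- if the inode participates in an
// overlay, dispatch to the overlay* helper so layer selection and copy-up
// live in one place; otherwise call straight into the filesystem's
// InodeOperations. A hypothetical reduction of that pattern (all names here
// are invented for illustration):
package sketch

import "errors"

type ops interface{ do() error }

// overlayState stands in for overlayEntry: it knows whether the upper or the
// lower layer should service an operation.
type overlayState struct{ upper, lower ops }

func doOverlay(o *overlayState) error {
	if o.upper != nil {
		return o.upper.do()
	}
	if o.lower != nil {
		return o.lower.do()
	}
	return errors.New("empty overlay")
}

type node struct {
	overlay *overlayState
	fsOps   ops
}

// Do mirrors Inode.Lookup/Create/...: overlay dispatch first, otherwise the
// filesystem-specific implementation.
func (n *node) Do() error {
	if n.overlay != nil {
		return doOverlay(n.overlay)
	}
	return n.fsOps.do()
}
// --------------------------- end editorial note ----------------------------
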
+func (i *Inode) Create(ctx context.Context, d *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + if i.overlay != nil { + return overlayCreate(ctx, i.overlay, d, name, flags, perm) + } + return i.InodeOperations.Create(ctx, i, name, flags, perm) +} + +// CreateDirectory calls i.InodeOperations.CreateDirectory with i as the directory. +func (i *Inode) CreateDirectory(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateDirectory(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateDirectory(ctx, i, name, perm) +} + +// CreateLink calls i.InodeOperations.CreateLink with i as the directory. +func (i *Inode) CreateLink(ctx context.Context, d *Dirent, oldname string, newname string) error { + if i.overlay != nil { + return overlayCreateLink(ctx, i.overlay, d, oldname, newname) + } + return i.InodeOperations.CreateLink(ctx, i, oldname, newname) +} + +// CreateHardLink calls i.InodeOperations.CreateHardLink with i as the directory. +func (i *Inode) CreateHardLink(ctx context.Context, d *Dirent, target *Dirent, name string) error { + if i.overlay != nil { + return overlayCreateHardLink(ctx, i.overlay, d, target, name) + } + return i.InodeOperations.CreateHardLink(ctx, i, target.Inode, name) +} + +// CreateFifo calls i.InodeOperations.CreateFifo with i as the directory. +func (i *Inode) CreateFifo(ctx context.Context, d *Dirent, name string, perm FilePermissions) error { + if i.overlay != nil { + return overlayCreateFifo(ctx, i.overlay, d, name, perm) + } + return i.InodeOperations.CreateFifo(ctx, i, name, perm) +} + +// Remove calls i.InodeOperations.Remove/RemoveDirectory with i as the directory. +func (i *Inode) Remove(ctx context.Context, d *Dirent, remove *Dirent) error { + if i.overlay != nil { + return overlayRemove(ctx, i.overlay, d, remove) + } + switch remove.Inode.StableAttr.Type { + case Directory, SpecialDirectory: + return i.InodeOperations.RemoveDirectory(ctx, i, remove.name) + default: + return i.InodeOperations.Remove(ctx, i, remove.name) + } +} + +// Rename calls i.InodeOperations.Rename with the given arguments. +func (i *Inode) Rename(ctx context.Context, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { + if i.overlay != nil { + return overlayRename(ctx, i.overlay, oldParent, renamed, newParent, newName) + } + return i.InodeOperations.Rename(ctx, oldParent.Inode, renamed.name, newParent.Inode, newName) +} + +// Bind calls i.InodeOperations.Bind with i as the directory. +func (i *Inode) Bind(ctx context.Context, name string, data unix.BoundEndpoint, perm FilePermissions) error { + if i.overlay != nil { + return overlayBind(ctx, i.overlay, name, data, perm) + } + return i.InodeOperations.Bind(ctx, i, name, data, perm) +} + +// BoundEndpoint calls i.InodeOperations.BoundEndpoint with i as the Inode. +func (i *Inode) BoundEndpoint(path string) unix.BoundEndpoint { + if i.overlay != nil { + return overlayBoundEndpoint(i.overlay, path) + } + return i.InodeOperations.BoundEndpoint(i, path) +} + +// GetFile calls i.InodeOperations.GetFile with the given arguments. +func (i *Inode) GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) { + if i.overlay != nil { + return overlayGetFile(ctx, i.overlay, d, flags) + } + return i.InodeOperations.GetFile(ctx, d, flags) +} + +// UnstableAttr calls i.InodeOperations.UnstableAttr with i as the Inode. 
+func (i *Inode) UnstableAttr(ctx context.Context) (UnstableAttr, error) { + if i.overlay != nil { + return overlayUnstableAttr(ctx, i.overlay) + } + return i.InodeOperations.UnstableAttr(ctx, i) +} + +// Getxattr calls i.InodeOperations.Getxattr with i as the Inode. +func (i *Inode) Getxattr(name string) ([]byte, error) { + if i.overlay != nil { + return overlayGetxattr(i.overlay, name) + } + return i.InodeOperations.Getxattr(i, name) +} + +// Listxattr calls i.InodeOperations.Listxattr with i as the Inode. +func (i *Inode) Listxattr() (map[string]struct{}, error) { + if i.overlay != nil { + return overlayListxattr(i.overlay) + } + return i.InodeOperations.Listxattr(i) +} + +// CheckPermission will check if the caller may access this file in the +// requested way for reading, writing, or executing. +// +// CheckPermission is like Linux's fs/namei.c:inode_permission. It +// - checks file system mount flags, +// - and utilizes InodeOperations.Check to check capabilities and modes. +func (i *Inode) CheckPermission(ctx context.Context, p PermMask) error { + // First check the outer-most mounted filesystem. + if p.Write && i.MountSource.Flags.ReadOnly { + return syserror.EROFS + } + + if i.overlay != nil { + // CheckPermission requires some special handling for + // an overlay. + // + // Writes will always be redirected to an upper filesystem, + // so ignore all lower layers being read-only. + // + // But still honor the upper-most filesystem's mount flags; + // we should not attempt to modify the writable layer if it + // is mounted read-only. + if p.Write && overlayUpperMountSource(i.MountSource).Flags.ReadOnly { + return syserror.EROFS + } + } + + return i.check(ctx, p) +} + +func (i *Inode) check(ctx context.Context, p PermMask) error { + if i.overlay != nil { + return overlayCheck(ctx, i.overlay, p) + } + if !i.InodeOperations.Check(ctx, i, p) { + return syserror.EACCES + } + return nil +} + +// SetPermissions calls i.InodeOperations.SetPermissions with i as the Inode. +func (i *Inode) SetPermissions(ctx context.Context, d *Dirent, f FilePermissions) bool { + if i.overlay != nil { + return overlaySetPermissions(ctx, i.overlay, d, f) + } + return i.InodeOperations.SetPermissions(ctx, i, f) +} + +// SetOwner calls i.InodeOperations.SetOwner with i as the Inode. +func (i *Inode) SetOwner(ctx context.Context, d *Dirent, o FileOwner) error { + if i.overlay != nil { + return overlaySetOwner(ctx, i.overlay, d, o) + } + return i.InodeOperations.SetOwner(ctx, i, o) +} + +// SetTimestamps calls i.InodeOperations.SetTimestamps with i as the Inode. +func (i *Inode) SetTimestamps(ctx context.Context, d *Dirent, ts TimeSpec) error { + if i.overlay != nil { + return overlaySetTimestamps(ctx, i.overlay, d, ts) + } + return i.InodeOperations.SetTimestamps(ctx, i, ts) +} + +// Truncate calls i.InodeOperations.Truncate with i as the Inode. +func (i *Inode) Truncate(ctx context.Context, d *Dirent, size int64) error { + if i.overlay != nil { + return overlayTruncate(ctx, i.overlay, d, size) + } + return i.InodeOperations.Truncate(ctx, i, size) +} + +// Readlink calls i.InodeOperations.Readlnk with i as the Inode. +func (i *Inode) Readlink(ctx context.Context) (string, error) { + if i.overlay != nil { + return overlayReadlink(ctx, i.overlay) + } + return i.InodeOperations.Readlink(ctx, i) +} + +// Getlink calls i.InodeOperations.Getlink. 
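
// ---------------------------------------------------------------------------
// Editorial note (illustrative sketch, not part of this change):
// CheckPermission above only enforces mount-level read-only semantics; the
// mode-bit decision is delegated to InodeOperations.Check. The classic
// owner/group/other selection such a check performs, as a hedged standalone
// sketch (permMask only mirrors the shape of fs.PermMask; real checks also
// consult capabilities such as CAP_DAC_OVERRIDE, omitted here):
package sketch

type permMask struct{ Read, Write, Execute bool }

func modeAllows(mode, fileUID, fileGID, uid uint32, gids []uint32, want permMask) bool {
	var bits uint32
	switch {
	case uid == fileUID:
		bits = (mode >> 6) & 7 // rwx bits for the owner.
	case containsGID(gids, fileGID):
		bits = (mode >> 3) & 7 // rwx bits for the group.
	default:
		bits = mode & 7 // rwx bits for everyone else.
	}
	if want.Read && bits&4 == 0 {
		return false
	}
	if want.Write && bits&2 == 0 {
		return false
	}
	if want.Execute && bits&1 == 0 {
		return false
	}
	return true
}

func containsGID(gids []uint32, gid uint32) bool {
	for _, g := range gids {
		if g == gid {
			return true
		}
	}
	return false
}
// --------------------------- end editorial note ----------------------------
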
+func (i *Inode) Getlink(ctx context.Context) (*Dirent, error) { + if i.overlay != nil { + return overlayGetlink(ctx, i.overlay) + } + return i.InodeOperations.Getlink(ctx, i) +} + +// AddLink calls i.InodeOperations.AddLink. +func (i *Inode) AddLink() { + if i.overlay != nil { + // FIXME: Remove this from InodeOperations altogether. + // + // This interface (including DropLink and NotifyStatusChange) + // is only used by ramfs to update metadata of children. These + // filesystems should _never_ have overlay Inodes cached as + // children. So explicitly disallow this scenario and avoid plumbing + // Dirents through to do copy up. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.AddLink() +} + +// DropLink calls i.InodeOperations.DropLink. +func (i *Inode) DropLink() { + if i.overlay != nil { + // Same as AddLink. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.DropLink() +} + +// NotifyStatusChange calls i.InodeOperations.NotifyStatusChange. +func (i *Inode) NotifyStatusChange(ctx context.Context) { + if i.overlay != nil { + // Same as AddLink. + panic("overlay Inodes cached in ramfs directories are not supported") + } + i.InodeOperations.NotifyStatusChange(ctx) +} + +// IsVirtual calls i.InodeOperations.IsVirtual. +func (i *Inode) IsVirtual() bool { + if i.overlay != nil { + // An overlay configuration does not support virtual files. + return false + } + return i.InodeOperations.IsVirtual() +} + +// StatFS calls i.InodeOperations.StatFS. +func (i *Inode) StatFS(ctx context.Context) (Info, error) { + if i.overlay != nil { + return overlayStatFS(ctx, i.overlay) + } + return i.InodeOperations.StatFS(ctx) +} + +// HandleOps extracts HandleOperations from i. +func (i *Inode) HandleOps() HandleOperations { + if i.overlay != nil { + return overlayHandleOps(i.overlay) + } + if h, ok := i.InodeOperations.(HandleOperations); ok { + return h + } + return nil +} + +// CheckOwnership checks whether `ctx` owns this Inode or may act as its owner. +// Compare Linux's fs/inode.c:inode_owner_or_capable(). +func (i *Inode) CheckOwnership(ctx context.Context) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if uattr.Owner.UID == creds.EffectiveKUID { + return true + } + if creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return true + } + return false +} + +// CheckCapability checks whether `ctx` has capability `cp` with respect to +// operations on this Inode. +// +// Compare Linux's kernel/capability.c:capable_wrt_inode_uidgid(). Note that +// this function didn't exist in Linux 3.11.10, but was added by upstream +// 23adbe12ef7d "fs,userns: Change inode_capable to capable_wrt_inode_uidgid" +// to fix local privilege escalation CVE-2014-4014. +func (i *Inode) CheckCapability(ctx context.Context, cp linux.Capability) bool { + uattr, err := i.UnstableAttr(ctx) + if err != nil { + return false + } + creds := auth.CredentialsFromContext(ctx) + if !creds.UserNamespace.MapFromKUID(uattr.Owner.UID).Ok() { + return false + } + if !creds.UserNamespace.MapFromKGID(uattr.Owner.GID).Ok() { + return false + } + return creds.HasCapability(cp) +} diff --git a/pkg/sentry/fs/inode_inotify.go b/pkg/sentry/fs/inode_inotify.go new file mode 100644 index 000000000..358bbecdf --- /dev/null +++ b/pkg/sentry/fs/inode_inotify.go @@ -0,0 +1,166 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// Watches is the collection of inotify watches on an inode. +type Watches struct { + // mu protects the fields below. + mu sync.RWMutex `state:"nosave"` + + // ws is the map of active watches in this collection, keyed by the inotify + // instance id of the owner. + ws map[uint64]*Watch + + // unlinked indicates whether the target inode was ever unlinked. This is a + // hack to figure out if we should queue a IN_DELETE_SELF event when this + // watches collection is being destroyed, since otherwise we have no way of + // knowing if the target inode is going down due to a deletion or + // revalidation. + unlinked bool +} + +func newWatches() *Watches { + return &Watches{ + ws: make(map[uint64]*Watch), + } +} + +// MarkUnlinked indicates the target for this set of watches to be unlinked. +// This has implications for the IN_EXCL_UNLINK flag. +func (w *Watches) MarkUnlinked() { + w.mu.Lock() + defer w.mu.Unlock() + w.unlinked = true +} + +// Lookup returns a matching watch with the given id. Returns nil if no such +// watch exists. Note that the result returned by this method only remains valid +// if the inotify instance owning the watch is locked, preventing modification +// of the returned watch and preventing the replacement of the watch by another +// one from the same instance (since there may be at most one watch per +// instance, per target). +func (w *Watches) Lookup(id uint64) *Watch { + w.mu.Lock() + defer w.mu.Unlock() + return w.ws[id] +} + +// Add adds watch into this set of watches. The watch being added must be unique +// - its ID() should not collide with any existing watches. +func (w *Watches) Add(watch *Watch) { + w.mu.Lock() + defer w.mu.Unlock() + + // Sanity check, the new watch shouldn't collide with an existing + // watch. Silently replacing an existing watch would result in a ref leak on + // this inode. We could handle this collision by calling Unpin() on the + // existing watch, but then we end up leaking watch descriptor ids at the + // inotify level. + if _, exists := w.ws[watch.ID()]; exists { + panic(fmt.Sprintf("Watch collision with ID %+v", watch.ID())) + } + w.ws[watch.ID()] = watch +} + +// Remove removes a watch with the given id from this set of watches. The caller +// is responsible for generating any watch removal event, as appropriate. The +// provided id must match an existing watch in this collection. +func (w *Watches) Remove(id uint64) { + w.mu.Lock() + defer w.mu.Unlock() + + if w.ws == nil { + // This watch set is being destroyed. The thread executing the + // destructor is already in the process of deleting all our watches. We + // got here with no refs on the inode because we raced with the + // destructor notifying all the watch owners of the inode's destruction. + // See the comment in Watches.TargetDestroyed for why this race exists. 
+ return + } + + watch, ok := w.ws[id] + if !ok { + // While there's technically no problem with silently ignoring a missing + // watch, this is almost certainly a bug. + panic(fmt.Sprintf("Attempt to remove a watch, but no watch found with provided id %+v.", id)) + } + delete(w.ws, watch.ID()) +} + +// Notify queues a new event with all watches in this set. +func (w *Watches) Notify(name string, events, cookie uint32) { + // N.B. We don't defer the unlocks because Notify is in the hot path of + // all IO operations, and the defer costs too much for small IO + // operations. + w.mu.RLock() + for _, watch := range w.ws { + if name != "" && w.unlinked && !watch.NotifyParentAfterUnlink() { + // IN_EXCL_UNLINK - By default, when watching events on the children + // of a directory, events are generated for children even after they + // have been unlinked from the directory. This can result in large + // numbers of uninteresting events for some applications (e.g., if + // watching /tmp, in which many applications create temporary files + // whose names are immediately unlinked). Specifying IN_EXCL_UNLINK + // changes the default behavior, so that events are not generated + // for children after they have been unlinked from the watched + // directory. -- inotify(7) + // + // We know we're dealing with events for a parent when the name + // isn't empty. + continue + } + watch.Notify(name, events, cookie) + } + w.mu.RUnlock() +} + +// Unpin unpins dirent from all watches in this set. +func (w *Watches) Unpin(d *Dirent) { + w.mu.RLock() + defer w.mu.RUnlock() + for _, watch := range w.ws { + watch.Unpin(d) + } +} + +// targetDestroyed is called by the inode destructor to notify the watch owners +// of the impending destruction of the watch target. +func (w *Watches) targetDestroyed() { + var ws map[uint64]*Watch + + // We can't hold w.mu while calling watch.TargetDestroyed to preserve lock + // ordering w.r.t to the owner inotify instances. Instead, atomically move + // the watches map into a local variable so we can iterate over it safely. + // + // Because of this however, it is possible for the watches' owners to reach + // this inode while the inode has no refs. This is still safe because the + // owners can only reach the inode until this function finishes calling + // watch.TargetDestroyed() below and the inode is guaranteed to exist in the + // meanwhile. But we still have to be very careful not to rely on inode + // state that may have been already destroyed. + w.mu.Lock() + ws = w.ws + w.ws = nil + w.mu.Unlock() + + for _, watch := range ws { + watch.TargetDestroyed() + } +} diff --git a/pkg/sentry/fs/inode_operations.go b/pkg/sentry/fs/inode_operations.go new file mode 100644 index 000000000..b33980178 --- /dev/null +++ b/pkg/sentry/fs/inode_operations.go @@ -0,0 +1,385 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
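The filtering rule in Watches.Notify above (parent-directory events for an unlinked child are dropped when the watch was added with IN_EXCL_UNLINK) can be distilled into a small predicate. The sketch below is illustrative only: shouldNotify is a hypothetical helper, not part of this package, and the constant mirrors IN_EXCL_UNLINK from <linux/inotify.h>.

```go
package main

import "fmt"

// inExclUnlink mirrors IN_EXCL_UNLINK from <linux/inotify.h> (assumed value).
const inExclUnlink = uint32(0x04000000)

// shouldNotify reports whether a watch with the given mask should receive an
// event. name is non-empty only for events generated on behalf of a child of
// the watched directory; unlinked records whether that child was unlinked.
func shouldNotify(mask uint32, name string, unlinked bool) bool {
	if name == "" {
		// Events on the watch target itself are always delivered.
		return true
	}
	if unlinked && mask&inExclUnlink != 0 {
		// IN_EXCL_UNLINK suppresses events for already-unlinked children.
		return false
	}
	return true
}

func main() {
	fmt.Println(shouldNotify(inExclUnlink, "tmpfile", true)) // false: suppressed
	fmt.Println(shouldNotify(0, "tmpfile", true))            // true: default behavior
	fmt.Println(shouldNotify(inExclUnlink, "", true))        // true: event on the target itself
}
```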
+ +package fs + +import ( + "errors" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +var ( + // ErrResolveViaReadlink is a special error value returned by + // InodeOperations.Getlink() to indicate that a link should be + // resolved automatically by walking to the path returned by + // InodeOperations.Readlink(). + ErrResolveViaReadlink = errors.New("link should be resolved via Readlink()") +) + +// TimeSpec contains access and modification timestamps. If either ATimeOmit or +// MTimeOmit is true, then the corresponding timestamp should not be updated. +// If either ATimeSetSystemTime or MTimeSetSystemTime are set then the +// corresponding timestamp should be ignored and the time will be set to the +// current system time. +type TimeSpec struct { + ATime ktime.Time + ATimeOmit bool + ATimeSetSystemTime bool + MTime ktime.Time + MTimeOmit bool + MTimeSetSystemTime bool +} + +// InodeOperations are operations on an Inode that diverge per file system. +// +// Objects that implement InodeOperations may cache file system "private" +// data that is useful for implementing these methods. In contrast, Inode +// contains state that is common to all Inodes; this state may be optionally +// used by InodeOperations. An object that implements InodeOperations may +// not take a reference on an Inode. +type InodeOperations interface { + // Release releases all private file system data held by this object. + // Once Release is called, this object is dead (no other methods will + // ever be called). + Release(context.Context) + + // Lookup loads an Inode at name under dir into a Dirent. The name + // is a valid component path: it contains no "/"s nor is the empty + // string. + // + // Lookup may return one of: + // + // * A nil Dirent and a non-nil error. If the reason that Lookup failed + // was because the name does not exist under Inode, then must return + // syserror.ENOENT. + // + // * If name does not exist under dir and the file system wishes this + // fact to be cached, a non-nil Dirent containing a nil Inode and a + // nil error. This is a negative Dirent and must have exactly one + // reference (at-construction reference). + // + // * If name does exist under this dir, a non-nil Dirent containing a + // non-nil Inode, and a nil error. File systems that take extra + // references on this Dirent should implement DirentOperations. + Lookup(ctx context.Context, dir *Inode, name string) (*Dirent, error) + + // Create creates an Inode at name under dir and returns a new File + // whose Dirent backs the new Inode. Implementations must ensure that + // name does not already exist. Create may return one of: + // + // * A nil File and a non-nil error. + // + // * A non-nil File and a nil error. File.Dirent will be a new Dirent, + // with a single reference held by File. File systems that take extra + // references on this Dirent should implement DirentOperations. + // + // The caller must ensure that this operation is permitted. + Create(ctx context.Context, dir *Inode, name string, flags FileFlags, perm FilePermissions) (*File, error) + + // CreateDirectory creates a new directory under this dir. + // CreateDirectory should otherwise do the same as Create. 
+ // + // The caller must ensure that this operation is permitted. + CreateDirectory(ctx context.Context, dir *Inode, name string, perm FilePermissions) error + + // CreateLink creates a symbolic link under dir between newname + // and oldname. CreateLink should otherwise do the same as Create. + // + // The caller must ensure that this operation is permitted. + CreateLink(ctx context.Context, dir *Inode, oldname string, newname string) error + + // CreateHardLink creates a hard link under dir between the target + // Inode and name. Implementations must ensure that name does not + // already exist. + // + // The caller must ensure this operation is permitted. + CreateHardLink(ctx context.Context, dir *Inode, target *Inode, name string) error + + // CreateFifo creates a new named pipe under dir at name. + // Implementations must ensure that an Inode at name does not + // already exist. + // + // The caller must ensure that this operation is permitted. + CreateFifo(ctx context.Context, dir *Inode, name string, perm FilePermissions) error + + // Remove removes the given named non-directory under dir. + // + // The caller must ensure that this operation is permitted. + // + // TODO: merge Remove and RemoveDirectory, Remove + // just needs a type flag. + Remove(ctx context.Context, dir *Inode, name string) error + + // RemoveDirectory removes the given named directory under dir. + // + // The caller must ensure that this operation is permitted. + // + // RemoveDirectory should check that the directory to be + // removed is empty. + RemoveDirectory(ctx context.Context, dir *Inode, name string) error + + // Rename atomically renames oldName under oldParent to newName + // under newParent where oldParent and newParent are directories. + // + // Implementations are responsible for rejecting renames that + // replace non-empty directories. + Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error + + // Bind binds a new socket under dir at the given name. + // Implementations must ensure that name does not already exist. + // + // The caller must ensure that this operation is permitted. + Bind(ctx context.Context, dir *Inode, name string, data unix.BoundEndpoint, perm FilePermissions) error + + // BoundEndpoint returns the socket endpoint at path stored in + // or generated by an Inode. + // + // The path is only relevant for generated endpoint because stored + // endpoints already know their path. It is ok for the endpoint to + // hold onto their path because the only way to change a bind + // address is to rebind the socket. + // + // This is valid iff the type of the Inode is a Socket, which + // generally implies that this Inode was created via CreateSocket. + // + // If there is no socket endpoint available, nil will be returned. + BoundEndpoint(inode *Inode, path string) unix.BoundEndpoint + + // GetFile returns a new open File backed by a Dirent and FileFlags. + // It may block as long as it is done with ctx. + // + // The returned File will uniquely back an application fd. + GetFile(ctx context.Context, d *Dirent, flags FileFlags) (*File, error) + + // UnstableAttr returns the most up-to-date "unstable" attributes of + // an Inode, where "unstable" means that they change in response to + // file system events. + UnstableAttr(ctx context.Context, inode *Inode) (UnstableAttr, error) + + // Getxattr retrieves the value of extended attribute name. Inodes that + // do not support extended attributes return EOPNOTSUPP. 
Inodes that + // support extended attributes but don't have a value at name return + // ENODATA. + Getxattr(inode *Inode, name string) ([]byte, error) + + // Setxattr sets the value of extended attribute name. Inodes that + // do not support extended attributes return EOPNOTSUPP. + Setxattr(inode *Inode, name string, value []byte) error + + // Listxattr returns the set of all extended attributes names that + // have values. Inodes that do not support extended attributes return + // EOPNOTSUPP. + Listxattr(inode *Inode) (map[string]struct{}, error) + + // Check determines whether an Inode can be accessed with the + // requested permission mask using the context (which gives access + // to Credentials and UserNamespace). + Check(ctx context.Context, inode *Inode, p PermMask) bool + + // SetPermissions sets new permissions for an Inode. Returns false + // if it was not possible to set the new permissions. + // + // The caller must ensure that this operation is permitted. + SetPermissions(ctx context.Context, inode *Inode, f FilePermissions) bool + + // SetOwner sets the ownership for this file. + // + // If either UID or GID are set to auth.NoID, its value will not be + // changed. + // + // The caller must ensure that this operation is permitted. + SetOwner(ctx context.Context, inode *Inode, owner FileOwner) error + + // SetTimestamps sets the access and modification timestamps of an + // Inode according to the access and modification times in the TimeSpec. + // + // If either ATimeOmit or MTimeOmit is set, then the corresponding + // timestamp is not updated. + // + // If either ATimeSetSystemTime or MTimeSetSystemTime is true, that + // timestamp is set to the current time instead. + // + // The caller must ensure that this operation is permitted. + SetTimestamps(ctx context.Context, inode *Inode, ts TimeSpec) error + + // Truncate changes the size of an Inode. Truncate should not check + // permissions internally, as it is used for both sys_truncate and + // sys_ftruncate. + // + // Implementations need not check that length >= 0. + Truncate(ctx context.Context, inode *Inode, size int64) error + + // WriteOut writes cached Inode state to a backing filesystem in a + // synchronous manner. + // + // File systems that do not cache metadata or data via an Inode + // implement WriteOut as a no-op. File systems that are entirely in + // memory also implement WriteOut as a no-op. Otherwise file systems + // call Inode.Sync to write back page cached data and cached metadata + // followed by syncing writeback handles. + // + // It derives from include/linux/fs.h:super_operations->write_inode. + WriteOut(ctx context.Context, inode *Inode) error + + // Readlink reads the symlink path of an Inode. + // + // Readlink is permitted to return a different path depending on ctx, + // the request originator. + // + // The caller must ensure that this operation is permitted. + // + // Readlink should check that Inode is a symlink and its content is + // at least readable. + Readlink(ctx context.Context, inode *Inode) (string, error) + + // Getlink resolves a symlink to a target *Dirent. + // + // Filesystems that can resolve the link by walking to the path returned + // by Readlink should return (nil, ErrResolveViaReadlink), which + // triggers link resolution via Realink and Lookup. + // + // Some links cannot be followed by Lookup. In this case, Getlink can + // return the Dirent of the link target. The caller holds a reference + // to the Dirent. 
Filesystems that return a non-nil *Dirent from Getlink + // cannot participate in an overlay because it is impossible for the + // overlay to ascertain whether or not the *Dirent should contain an + // overlayEntry. + // + // Any error returned from Getlink other than ErrResolveViaReadlink + // indicates the caller's inability to traverse this Inode as a link + // (e.g. syserror.ENOLINK indicates that the Inode is not a link, + // syscall.EPERM indicates that traversing the link is not allowed, etc). + Getlink(context.Context, *Inode) (*Dirent, error) + + // Mappable returns a memmap.Mappable that provides memory mappings of the + // Inode's data. Mappable may return nil if this is not supported. The + // returned Mappable must remain valid until InodeOperations.Release is + // called. + Mappable(*Inode) memmap.Mappable + + // The below methods require cleanup. + + // AddLink increments the hard link count of an Inode. + // + // Remove in favor of Inode.IncLink. + AddLink() + + // DropLink decrements the hard link count of an Inode. + // + // Remove in favor of Inode.DecLink. + DropLink() + + // NotifyStatusChange sets the status change time to the current time. + // + // Remove in favor of updating the Inode's cached status change time. + NotifyStatusChange(ctx context.Context) + + // IsVirtual indicates whether or not this corresponds to a virtual + // resource. + // + // If IsVirtual returns true, then caching will be disabled for this + // node, and fs.Dirent.Freeze() will not stop operations on the node. + // + // Remove in favor of freezing specific mounts. + IsVirtual() bool + + // StatFS returns a filesystem Info implementation or an error. If + // the filesystem does not support this operation (maybe in the future + // it will), then ENOSYS should be returned. + // + // Move to MountSourceOperations. + StatFS(context.Context) (Info, error) + + HandleOperations +} + +// HandleOperations are extended InodeOperations that are only implemented +// for file systems that use fs/handle.go:Handle to generate open Files. +// +// Handle is deprecated; these methods are deprecated as well. +// +// Filesystems are encouraged to implement the File interface directly +// instead of using Handle. To indicate that the below methods should never +// be called, embed DeprecatedFileOperations to satisfy this interface. +type HandleOperations interface { + waiter.Waitable + + // DeprecatedPreadv is deprecated in favor of filesystems + // implementing File.Preadv directly. + // + // DeprecatedPreadv reads up to dst.NumBytes() bytes into dst, starting at + // the given offset, and returns the number of bytes read. + // + // Preadv may return a partial read result before EOF is reached. + // + // If a symlink, Preadv reads the target value of the symlink. + // + // Preadv should not check for readable permissions. + DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) + + // DeprecatedPwritev is deprecated in favor of filesystems + // implementing File.Pwritev directly. + // + // DeprecatedPwritev writes up to src.NumBytes() bytes from src to the + // Inode, starting at the given offset and returns the number of bytes + // written. + // + // Pwritev should not check that the Inode has writable permissions. + DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) + + // DeprecatedReaddir is deprecated in favor of filesystems + // implementing File.Readdir directly. 
+ // + // DeprecatedReaddir emits directory entries by calling dirCtx.EmitDir, + // beginning with the entry at offset. + // + // Entries for "." and ".." must *not* be included. + // + // If the offset returned is the same as the argument offset, then + // nothing has been serialized. This is equivalent to reaching EOF. + // In this case serializer.Written() should return 0. + // + // The order of entries to emit must be consistent between Readdir + // calls, and must start with the given offset. + // + // The caller must ensure that this operation is permitted. + DeprecatedReaddir(ctx context.Context, dirCtx *DirCtx, offset int) (int, error) + + // DeprecatedFsync is deprecated in favor of filesystems implementing + // File.Fsync directly. + // + // DeprecatedFsync syncs a file. + DeprecatedFsync() error + + // DeprecatedMappable is deprecated in favor of filesystems implementing + // File.Mappable directly. + // + // DeprecatedMappable returns a Mappable if the Inode can be mapped. + DeprecatedMappable(ctx context.Context, inode *Inode) (memmap.Mappable, bool) + + // DeprecatedFlush is deprecated in favor of filesystems implementing + // File.Flush directly. + // + // DeprecatedFlush flushes a file. + // + // Implementations may choose to free up memory or complete pending I/O + // but also may implement Flush as a no-op. + DeprecatedFlush() error +} diff --git a/pkg/sentry/fs/inode_overlay.go b/pkg/sentry/fs/inode_overlay.go new file mode 100644 index 000000000..343150bb8 --- /dev/null +++ b/pkg/sentry/fs/inode_overlay.go @@ -0,0 +1,555 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "strings" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +func overlayHasWhiteout(parent *Inode, name string) bool { + buf, err := parent.Getxattr(XattrOverlayWhiteout(name)) + return err == nil && string(buf) == "y" +} + +func overlayCreateWhiteout(parent *Inode, name string) error { + return parent.InodeOperations.Setxattr(parent, XattrOverlayWhiteout(name), []byte("y")) +} + +func overlayWriteOut(ctx context.Context, o *overlayEntry) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper == nil { + return nil + } + return o.upper.InodeOperations.WriteOut(ctx, o.upper) +} + +func overlayLookup(ctx context.Context, parent *overlayEntry, inode *Inode, name string) (*Dirent, error) { + parent.copyMu.RLock() + defer parent.copyMu.RUnlock() + + // Assert that there is at least one upper or lower entry. + if parent.upper == nil && parent.lower == nil { + panic("invalid overlayEntry, needs at least one Inode") + } + + var upperInode *Inode + var lowerInode *Inode + + // Does the parent directory exist in the upper file system? + if parent.upper != nil { + // First check if a file object exists in the upper file system. 
+ // A file could have been created over a whiteout, so we need to + // check if something exists in the upper file system first. + child, err := parent.upper.Lookup(ctx, name) + if err != nil && err != syserror.ENOENT { + // We encountered an error that an overlay cannot handle, + // we must propagate it to the caller. + return nil, err + } + if child != nil { + defer child.DecRef() + + // Is the child non-negative? + if !child.IsNegative() { + upperInode = child.Inode + upperInode.IncRef() + } + } + + // Are we done? + if overlayHasWhiteout(parent.upper, name) { + if upperInode == nil { + return NewNegativeDirent(name), nil + } + entry, err := newOverlayEntry(ctx, upperInode, nil, false) + if err != nil { + // Don't leak resources. + upperInode.DecRef() + return nil, err + } + return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil + } + } + + // Check the lower file system. We do this unconditionally (even for + // non-directories) because we may need to use stable attributes from + // the lower filesystem (e.g. device number, inode number) that were + // visible before a copy up. + if parent.lower != nil { + // Check the lower file system. + child, err := parent.lower.Lookup(ctx, name) + // Same song and dance as above. + if err != nil && err != syserror.ENOENT { + // Don't leak resources. + if upperInode != nil { + upperInode.DecRef() + } + return nil, err + } + if child != nil { + defer child.DecRef() + + // Is the child negative? + if !child.IsNegative() { + // Did we find something in the upper filesystem? We can + // only use it if the types match. + if upperInode == nil || upperInode.StableAttr.Type == child.Inode.StableAttr.Type { + lowerInode = child.Inode + lowerInode.IncRef() + } + } + } + } + + // Was all of this for naught? + if upperInode == nil && lowerInode == nil { + // Return a negative Dirent indicating that nothing was found. + return NewNegativeDirent(name), nil + } + + // Did we find a lower Inode? Remember this because we may decide we don't + // actually need the lower Inode (see below). + lowerExists := lowerInode != nil + + // If we found something in the upper filesystem and the lower filesystem, + // use the stable attributes from the lower filesystem. If we don't do this, + // then it may appear that the file was magically recreated across copy up. + if upperInode != nil && lowerInode != nil { + // Steal attributes. + upperInode.StableAttr = lowerInode.StableAttr + + // For non-directories, the lower filesystem resource is strictly + // unnecessary because we don't need to copy-up and we will always + // operate (e.g. read/write) on the upper Inode. + if !IsDir(upperInode.StableAttr) { + lowerInode.DecRef() + lowerInode = nil + } + } + + // Phew, finally done. + entry, err := newOverlayEntry(ctx, upperInode, lowerInode, lowerExists) + if err != nil { + // Well, not quite, we failed at the last moment, how depressing. + // Be sure not to leak resources. + if upperInode != nil { + upperInode.DecRef() + } + if lowerInode != nil { + lowerInode.DecRef() + } + return nil, err + } + return NewDirent(newOverlayInode(ctx, entry, inode.MountSource), name), nil +} + +func overlayCreate(ctx context.Context, o *overlayEntry, parent *Dirent, name string, flags FileFlags, perm FilePermissions) (*File, error) { + // Dirent.Create takes renameMu if the Inode is an overlay Inode. 
+ if err := copyUpLockedForRename(ctx, parent); err != nil { + return nil, err + } + + upperFile, err := o.upper.InodeOperations.Create(ctx, o.upper, name, flags, perm) + if err != nil { + return nil, err + } + + // Take another reference on the upper file's inode, which will be + // owned by the overlay entry. + upperFile.Dirent.Inode.IncRef() + entry, err := newOverlayEntry(ctx, upperFile.Dirent.Inode, nil, false) + if err != nil { + cleanupUpper(ctx, o.upper, name) + return nil, err + } + + // NOTE: Replace the Dirent with a transient Dirent, since + // we are about to create the real Dirent: an overlay Dirent. + // + // This ensures the *fs.File returned from overlayCreate is in the same + // state as the *fs.File returned by overlayGetFile, where the upper + // file has a transient Dirent. + // + // This is necessary for Save/Restore, as otherwise the upper Dirent + // (which has no path as it is unparented and never reachable by the + // user) will clobber the real path for the underlying Inode. + upperFile.Dirent.Inode.IncRef() + upperDirent := NewTransientDirent(upperFile.Dirent.Inode) + upperFile.Dirent.DecRef() + upperFile.Dirent = upperDirent + + // Create the overlay inode and dirent. We need this to construct the + // overlay file. + overlayInode := newOverlayInode(ctx, entry, parent.Inode.MountSource) + // d will own the inode reference. + overlayDirent := NewDirent(overlayInode, name) + // The overlay file created below with NewFile will take a reference on + // the overlayDirent, and it should be the only thing holding a + // reference at the time of creation, so we must drop this reference. + defer overlayDirent.DecRef() + + // Create a new overlay file that wraps the upper file. + flags.Pread = upperFile.Flags().Pread + flags.Pwrite = upperFile.Flags().Pwrite + overlayFile := NewFile(ctx, overlayDirent, flags, &overlayFileOperations{upper: upperFile}) + + return overlayFile, nil +} + +func overlayCreateDirectory(ctx context.Context, o *overlayEntry, parent *Dirent, name string, perm FilePermissions) error { + // Dirent.CreateDirectory takes renameMu if the Inode is an overlay + // Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateDirectory(ctx, o.upper, name, perm) +} + +func overlayCreateLink(ctx context.Context, o *overlayEntry, parent *Dirent, oldname string, newname string) error { + // Dirent.CreateLink takes renameMu if the Inode is an overlay Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateLink(ctx, o.upper, oldname, newname) +} + +func overlayCreateHardLink(ctx context.Context, o *overlayEntry, parent *Dirent, target *Dirent, name string) error { + // Dirent.CreateHardLink takes renameMu if the Inode is an overlay + // Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + if err := copyUpLockedForRename(ctx, target); err != nil { + return err + } + return o.upper.InodeOperations.CreateHardLink(ctx, o.upper, target.Inode.overlay.upper, name) +} + +func overlayCreateFifo(ctx context.Context, o *overlayEntry, parent *Dirent, name string, perm FilePermissions) error { + // Dirent.CreateFifo takes renameMu if the Inode is an overlay Inode. 
+ if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + return o.upper.InodeOperations.CreateFifo(ctx, o.upper, name, perm) +} + +func overlayRemove(ctx context.Context, o *overlayEntry, parent *Dirent, child *Dirent) error { + // Dirent.Remove and Dirent.RemoveDirectory take renameMu if the Inode + // is an overlay Inode. + if err := copyUpLockedForRename(ctx, parent); err != nil { + return err + } + child.Inode.overlay.copyMu.RLock() + defer child.Inode.overlay.copyMu.RUnlock() + if child.Inode.overlay.upper != nil { + if child.Inode.StableAttr.Type == Directory { + if err := o.upper.InodeOperations.RemoveDirectory(ctx, o.upper, child.name); err != nil { + return err + } + } else { + if err := o.upper.InodeOperations.Remove(ctx, o.upper, child.name); err != nil { + return err + } + } + } + if child.Inode.overlay.lowerExists { + return overlayCreateWhiteout(o.upper, child.name) + } + return nil +} + +func overlayRename(ctx context.Context, o *overlayEntry, oldParent *Dirent, renamed *Dirent, newParent *Dirent, newName string) error { + // To be able to copy these up below, they have to be part of an + // overlay file system. + // + // Maybe some day we can allow the more complicated case of + // non-overlay X overlay renames, but that's not necessary right now. + if renamed.Inode.overlay == nil || newParent.Inode.overlay == nil || oldParent.Inode.overlay == nil { + return syserror.EXDEV + } + + // Check here if the file to be replaced exists and is a non-empty + // directory. If we copy up first, we may end up copying the directory + // but none of its children, so the directory will appear empty in the + // upper fs, which will then allow the rename to proceed when it should + // return ENOTEMPTY. + replaced, err := newParent.Inode.Lookup(ctx, newName) + if err != nil && err != syserror.ENOENT { + return err + } + if err == nil && !replaced.IsNegative() && IsDir(replaced.Inode.StableAttr) { + children, err := readdirOne(ctx, replaced) + if err != nil { + return err + } + + // readdirOne ensures that "." and ".." are not + // included among the returned children, so we don't + // need to bother checking for them. + if len(children) > 0 { + return syserror.ENOTEMPTY + } + } + if err := copyUpLockedForRename(ctx, renamed); err != nil { + return err + } + if err := copyUpLockedForRename(ctx, newParent); err != nil { + return err + } + oldName := renamed.name + if err := o.upper.InodeOperations.Rename(ctx, oldParent.Inode.overlay.upper, oldName, newParent.Inode.overlay.upper, newName); err != nil { + return err + } + if renamed.Inode.overlay.lowerExists { + return overlayCreateWhiteout(oldParent.Inode.overlay.upper, oldName) + } + return nil +} + +func overlayBind(ctx context.Context, o *overlayEntry, name string, data unix.BoundEndpoint, perm FilePermissions) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + // We do not support doing anything exciting with sockets unless there + // is already a directory in the upper filesystem. + if o.upper == nil { + return syserror.EOPNOTSUPP + } + return o.upper.InodeOperations.Bind(ctx, o.upper, name, data, perm) +} + +func overlayBoundEndpoint(o *overlayEntry, path string) unix.BoundEndpoint { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + return o.upper.InodeOperations.BoundEndpoint(o.upper, path) + } + // If a socket is already in the lower file system, allow connections + // to it. 
+ return o.lower.InodeOperations.BoundEndpoint(o.lower, path) +} + +func overlayGetFile(ctx context.Context, o *overlayEntry, d *Dirent, flags FileFlags) (*File, error) { + if flags.Write { + if err := copyUp(ctx, d); err != nil { + return nil, err + } + } + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + upper, err := overlayFile(ctx, o.upper, flags) + if err != nil { + return nil, err + } + flags.Pread = upper.Flags().Pread + flags.Pwrite = upper.Flags().Pwrite + return NewFile(ctx, d, flags, &overlayFileOperations{upper: upper}), nil + } + + lower, err := overlayFile(ctx, o.lower, flags) + if err != nil { + return nil, err + } + flags.Pread = lower.Flags().Pread + flags.Pwrite = lower.Flags().Pwrite + return NewFile(ctx, d, flags, &overlayFileOperations{lower: lower}), nil +} + +func overlayUnstableAttr(ctx context.Context, o *overlayEntry) (UnstableAttr, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.UnstableAttr(ctx) + } + return o.lower.UnstableAttr(ctx) +} + +func overlayGetxattr(o *overlayEntry, name string) ([]byte, error) { + // Don't forward the value of the extended attribute if it would + // unexpectedly change the behavior of a wrapping overlay layer. + if strings.HasPrefix(XattrOverlayPrefix, name) { + return nil, syserror.ENODATA + } + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.Getxattr(name) + } + return o.lower.Getxattr(name) +} + +func overlayListxattr(o *overlayEntry) (map[string]struct{}, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + var names map[string]struct{} + var err error + if o.upper != nil { + names, err = o.upper.Listxattr() + } else { + names, err = o.lower.Listxattr() + } + for name := range names { + // Same as overlayGetxattr, we shouldn't forward along + // overlay attributes. + if strings.HasPrefix(XattrOverlayPrefix, name) { + delete(names, name) + } + } + return names, err +} + +func overlayCheck(ctx context.Context, o *overlayEntry, p PermMask) error { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.check(ctx, p) + } + if p.Write { + // Since writes will be redirected to the upper filesystem, the lower + // filesystem need not be writable, but must be readable for copy-up. 
+ p.Write = false + p.Read = true + } + return o.lower.check(ctx, p) +} + +func overlaySetPermissions(ctx context.Context, o *overlayEntry, d *Dirent, f FilePermissions) bool { + if err := copyUp(ctx, d); err != nil { + return false + } + return o.upper.InodeOperations.SetPermissions(ctx, o.upper, f) +} + +func overlaySetOwner(ctx context.Context, o *overlayEntry, d *Dirent, owner FileOwner) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.SetOwner(ctx, o.upper, owner) +} + +func overlaySetTimestamps(ctx context.Context, o *overlayEntry, d *Dirent, ts TimeSpec) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.SetTimestamps(ctx, o.upper, ts) +} + +func overlayTruncate(ctx context.Context, o *overlayEntry, d *Dirent, size int64) error { + if err := copyUp(ctx, d); err != nil { + return err + } + return o.upper.InodeOperations.Truncate(ctx, o.upper, size) +} + +func overlayReadlink(ctx context.Context, o *overlayEntry) (string, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.Readlink(ctx) + } + return o.lower.Readlink(ctx) +} + +func overlayGetlink(ctx context.Context, o *overlayEntry) (*Dirent, error) { + var dirent *Dirent + var err error + + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + if o.upper != nil { + dirent, err = o.upper.Getlink(ctx) + } else { + dirent, err = o.lower.Getlink(ctx) + } + if dirent != nil { + // This dirent is likely bogus (its Inode likely doesn't contain + // the right overlayEntry). So we're forced to drop it on the + // ground and claim that jumping around the filesystem like this + // is not supported. + name, _ := dirent.FullName(nil) + dirent.DecRef() + + // Claim that the path is not accessible. + err = syserror.EACCES + log.Warningf("Getlink not supported in overlay for %q", name) + } + return nil, err +} + +func overlayStatFS(ctx context.Context, o *overlayEntry) (Info, error) { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + + var i Info + var err error + if o.upper != nil { + i, err = o.upper.StatFS(ctx) + } else { + i, err = o.lower.StatFS(ctx) + } + if err != nil { + return Info{}, err + } + + i.Type = linux.OVERLAYFS_SUPER_MAGIC + + return i, nil +} + +func overlayHandleOps(o *overlayEntry) HandleOperations { + o.copyMu.RLock() + defer o.copyMu.RUnlock() + if o.upper != nil { + return o.upper.HandleOps() + } + return o.lower.HandleOps() +} + +// NewTestOverlayDir returns an overlay Inode for tests. +func NewTestOverlayDir(ctx context.Context, upper *Inode, lower *Inode) *Inode { + fs := &overlayFilesystem{} + msrc := NewMountSource(&overlayMountSourceOperations{ + upper: NewNonCachingMountSource(fs, MountSourceFlags{}), + lower: NewNonCachingMountSource(fs, MountSourceFlags{}), + }, fs, MountSourceFlags{}) + overlay := &overlayEntry{ + upper: upper, + lower: lower, + } + return newOverlayInode(ctx, overlay, msrc) +} + +// TestHasUpperFS returns true if i is an overlay Inode and it has a pointer +// to an Inode on an upper filesystem. +func (i *Inode) TestHasUpperFS() bool { + return i.overlay != nil && i.overlay.upper != nil +} + +// TestHasLowerFS returns true if i is an overlay Inode and it has a pointer +// to an Inode on a lower filesystem. 
+func (i *Inode) TestHasLowerFS() bool { + return i.overlay != nil && i.overlay.lower != nil +} diff --git a/pkg/sentry/fs/inode_overlay_test.go b/pkg/sentry/fs/inode_overlay_test.go new file mode 100644 index 000000000..684d54bd2 --- /dev/null +++ b/pkg/sentry/fs/inode_overlay_test.go @@ -0,0 +1,251 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +func TestLookup(t *testing.T) { + ctx := contexttest.Context(t) + for _, test := range []struct { + // Test description. + desc string + + // Lookup parameters. + dir *fs.Inode + name string + + // Want from lookup. + err error + found bool + hasUpper bool + hasLower bool + }{ + { + desc: "no upper, lower has name", + dir: fs.NewTestOverlayDir(ctx, + nil, /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + { + desc: "no lower, upper has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + nil, /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, only lower has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "b", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + { + desc: "upper and lower, only upper has name", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "b", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, both have file", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: false, + }, + { + desc: "upper and lower, both have directory", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: true, + }, + }, nil), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: true, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: true, + hasLower: true, + }, + { + desc: "upper and lower, upper negative masks lower file", + dir: fs.NewTestOverlayDir(ctx, + 
newTestRamfsDir(ctx, nil, []string{"a"}), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: false, + hasUpper: false, + hasLower: false, + }, + { + desc: "upper and lower, upper negative does not mask lower file", + dir: fs.NewTestOverlayDir(ctx, + newTestRamfsDir(ctx, nil, []string{"b"}), /* upper */ + newTestRamfsDir(ctx, []dirContent{ + { + name: "a", + dir: false, + }, + }, nil), /* lower */ + ), + name: "a", + found: true, + hasUpper: false, + hasLower: true, + }, + } { + t.Run(test.desc, func(t *testing.T) { + dirent, err := test.dir.Lookup(ctx, test.name) + if err != test.err { + t.Fatalf("lookup got error %v, want %v", err, test.err) + } + if test.found && dirent.IsNegative() { + t.Fatalf("lookup expected to find %q, got negative dirent", test.name) + } + if !test.found { + return + } + if hasUpper := dirent.Inode.TestHasUpperFS(); hasUpper != test.hasUpper { + t.Fatalf("lookup got upper filesystem %v, want %v", hasUpper, test.hasUpper) + } + if hasLower := dirent.Inode.TestHasLowerFS(); hasLower != test.hasLower { + t.Errorf("lookup got lower filesystem %v, want %v", hasLower, test.hasLower) + } + }) + } +} + +type dir struct { + fs.InodeOperations + + // list of negative child names. + negative []string +} + +func (d *dir) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + for _, n := range d.negative { + if name == fs.XattrOverlayWhiteout(n) { + return []byte("y"), nil + } + } + return nil, syserror.ENOATTR +} + +type dirContent struct { + name string + dir bool +} + +func newTestRamfsDir(ctx context.Context, contains []dirContent, negative []string) *fs.Inode { + msrc := fs.NewCachingMountSource(nil, fs.MountSourceFlags{}) + contents := make(map[string]*fs.Inode) + for _, c := range contains { + if c.dir { + contents[c.name] = newTestRamfsDir(ctx, nil, nil) + } else { + contents[c.name] = fs.NewInode(ramfstest.NewFile(ctx, fs.FilePermissions{}), msrc, fs.StableAttr{Type: fs.RegularFile}) + } + } + dops := ramfstest.NewDir(ctx, contents, fs.FilePermissions{ + User: fs.PermMask{Read: true, Execute: true}, + }) + return fs.NewInode(&dir{ + InodeOperations: dops, + negative: negative, + }, msrc, fs.StableAttr{Type: fs.Directory}) +} diff --git a/pkg/sentry/fs/inotify.go b/pkg/sentry/fs/inotify.go new file mode 100644 index 000000000..9f50cb800 --- /dev/null +++ b/pkg/sentry/fs/inotify.go @@ -0,0 +1,329 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
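The precedence rules exercised by TestLookup above — an upper entry masks a lower entry, a whiteout hides the lower layer, and the lower inode is kept only when both layers hold a directory — can be summarized in a small decision function. This is a standalone sketch; layerEntry and resolve are hypothetical names used only for illustration, not part of the fs package.

```go
package main

import "fmt"

// layerEntry describes what one overlay layer knows about a name.
type layerEntry struct {
	exists bool
	isDir  bool
}

// resolve mimics the overlay lookup precedence: a whiteout in the upper layer
// hides the lower entry, an upper entry masks a lower entry, and the lower
// inode is retained only when both layers contain a directory (a merged dir).
func resolve(upper, lower layerEntry, whiteout bool) (useUpper, useLower bool) {
	if whiteout {
		return upper.exists, false
	}
	if upper.exists && lower.exists {
		return true, upper.isDir && lower.isDir
	}
	return upper.exists, lower.exists
}

func main() {
	// Whiteout masks a lower file: nothing is found.
	fmt.Println(resolve(layerEntry{}, layerEntry{exists: true}, true))
	// Both layers have a file: only the upper is used.
	fmt.Println(resolve(layerEntry{exists: true}, layerEntry{exists: true}, false))
	// Both layers have a directory: the layers are merged.
	fmt.Println(resolve(layerEntry{exists: true, isDir: true}, layerEntry{exists: true, isDir: true}, false))
}
```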
+ +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// Inotify represents an inotify instance created by inotify_init(2) or +// inotify_init1(2). Inotify implements the FileOperations interface. +// +// Lock ordering: +// Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu +type Inotify struct { + // Unique identifier for this inotify instance. We don't just reuse the + // inotify fd because fds can be duped. These should not be exposed to the + // user, since we may aggressively reuse an id on S/R. + id uint64 + + // evMu *only* protects the event queue. We need a separate lock because + // while queuing events, a watch needs to lock the event queue, and using mu + // for that would violate lock ordering since at that point the calling + // goroutine already holds Watch.target.Watches.mu. + evMu sync.Mutex `state:"nosave"` + + waiter.Queue `state:"nosave"` + + // A list of pending events for this inotify instance. Protected by evMu. + events ilist.List + + // A scratch buffer, use to serialize inotify events. Use allocate this + // ahead of time and reuse performance. Protected by evMu. + scratch []byte + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // The next watch descriptor number to use for this inotify instance. Note + // that Linux starts numbering watch descriptors from 1. + nextWatch int32 + + // Map from watch descriptors to watch objects. + watches map[int32]*Watch +} + +// NewInotify constructs a new Inotify instance. +func NewInotify(ctx context.Context) *Inotify { + return &Inotify{ + id: uniqueid.GlobalFromContext(ctx), + scratch: make([]byte, inotifyEventBaseSize), + nextWatch: 1, // Linux starts numbering watch descriptors from 1. + watches: make(map[int32]*Watch), + } +} + +// Release implements FileOperations.Release. Release removes all watches and +// frees all resources for an inotify instance. +func (i *Inotify) Release() { + // We need to hold i.mu to avoid a race with concurrent calls to + // Inotify.targetDestroyed from Watches. There's no risk of Watches + // accessing this Inotify after the destructor ends, because we remove all + // references to it below. + i.mu.Lock() + defer i.mu.Unlock() + for _, w := range i.watches { + // Remove references to the watch from the watch target. We don't need + // to worry about the references from the owner instance, since we're in + // the owner's destructor. + w.target.Watches.Remove(w.ID()) + // Don't leak any references to the target, held by pins in the watch. + w.destroy() + } +} + +// Readiness implements waiter.Waitable.Readiness. +// +// Readiness indicates whether there are pending events for an inotify instance. +func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { + ready := waiter.EventMask(0) + + i.evMu.Lock() + defer i.evMu.Unlock() + + if !i.events.Empty() { + ready |= waiter.EventIn + } + + return mask & ready +} + +// Seek implements FileOperations.Seek. +func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) { + return 0, syserror.ESPIPE +} + +// Readdir implements FileOperatons.Readdir. 
+func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) { + return 0, syserror.ENOTDIR +} + +// Write implements FileOperations.Write. +func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EBADF +} + +// Read implements FileOperations.Read. +func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) { + if dst.NumBytes() < inotifyEventBaseSize { + return 0, syserror.EINVAL + } + + i.evMu.Lock() + defer i.evMu.Unlock() + + if i.events.Empty() { + // Nothing to read yet, tell caller to block. + return 0, syserror.ErrWouldBlock + } + + var writeLen int64 + for e := i.events.Front(); e != nil; e = e.Next() { + event := e.(*Event) + + // Does the buffer have enough remaining space to hold the event we're + // about to write out? + if dst.NumBytes() < int64(event.sizeOf()) { + if writeLen > 0 { + // Buffer wasn't big enough for all pending events, but we did + // write some events out. + return writeLen, nil + } + return 0, syserror.EINVAL + } + + // Linux always dequeues an available event as long as there's enough + // buffer space to copy it out, even if the copy below fails. Emulate + // this behaviour. + i.events.Remove(e) + + // Buffer has enough space, copy event to the read buffer. + n, err := event.CopyTo(ctx, i.scratch, dst) + if err != nil { + return 0, err + } + + writeLen += n + dst = dst.DropFirst64(n) + } + return writeLen, nil +} + +// Fsync implements FileOperations.Fsync. +func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error { + return syserror.EINVAL +} + +// Flush implements FileOperations.Flush. +func (*Inotify) Flush(context.Context, *File) error { + return nil +} + +// ConfigureMMap implements FileOperations.ConfigureMMap. +func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { + return syserror.ENODEV +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Int() { + case linux.FIONREAD: + i.evMu.Lock() + defer i.evMu.Unlock() + var n uint32 + for e := i.events.Front(); e != nil; e = e.Next() { + event := e.(*Event) + n += uint32(event.sizeOf()) + } + var buf [4]byte + usermem.ByteOrder.PutUint32(buf[:], n) + _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) + return 0, err + + default: + return 0, syserror.ENOTTY + } +} + +func (i *Inotify) queueEvent(ev *Event) { + i.evMu.Lock() + defer i.evMu.Unlock() + + // Check if we should coalesce the event we're about to queue with the last + // one currently in the queue. Events are coalesced if they are identical. + if last := i.events.Back(); last != nil { + if ev.equals(last.(*Event)) { + // "Coalesce" the two events by simply not queuing the new one. We + // don't need to raise a waiter.EventIn notification because no new + // data is available for reading. + return + } + } + + i.events.PushBack(ev) + i.Queue.Notify(waiter.EventIn) +} + +// newWatchLocked creates and adds a new watch to target. +func (i *Inotify) newWatchLocked(target *Dirent, mask uint32) *Watch { + wd := i.nextWatch + i.nextWatch++ + + watch := &Watch{ + owner: i, + wd: wd, + mask: mask, + target: target.Inode, + pins: make(map[*Dirent]bool), + } + + i.watches[wd] = watch + + // Grab an extra reference to target to prevent it from being evicted from + // memory. 
This ref is dropped during either watch removal, target + // destruction, or inotify instance destruction. See callers of Watch.Unpin. + watch.Pin(target) + target.Inode.Watches.Add(watch) + + return watch +} + +// targetDestroyed is called by w to notify i that w's target is gone. This +// automatically generates a watch removal event. +func (i *Inotify) targetDestroyed(w *Watch) { + i.mu.Lock() + _, found := i.watches[w.wd] + delete(i.watches, w.wd) + i.mu.Unlock() + + if found { + i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) + } +} + +// AddWatch constructs a new inotify watch and adds it to the target dirent. It +// returns the watch descriptor returned by inotify_add_watch(2). +func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { + // Note: Locking this inotify instance protects the result returned by + // Lookup() below. With the lock held, we know for sure the lookup result + // won't become stale because it's impossible for *this* instance to + // add/remove watches on target. + i.mu.Lock() + defer i.mu.Unlock() + + // Does the target already have a watch from this inotify instance? + if existing := target.Inode.Watches.Lookup(i.id); existing != nil { + // This may be a watch on a different dirent pointing to the + // same inode. Obtain an extra reference if necessary. + existing.Pin(target) + + if mergeMask := mask&linux.IN_MASK_ADD != 0; mergeMask { + // "Add (OR) events to watch mask for this pathname if it already + // exists (instead of replacing mask)." -- inotify(7) + existing.mask |= mask + } else { + existing.mask = mask + } + return existing.wd + } + + // No existing watch, create a new watch. + watch := i.newWatchLocked(target, mask) + return watch.wd +} + +// RmWatch implements watcher.Watchable.RmWatch. +// +// RmWatch looks up an inotify watch for the given 'wd' and configures the +// target dirent to stop sending events to this inotify instance. +func (i *Inotify) RmWatch(wd int32) error { + i.mu.Lock() + + // Find the watch we were asked to removed. + watch, ok := i.watches[wd] + if !ok { + i.mu.Unlock() + return syserror.EINVAL + } + + // Remove the watch from this instance. + delete(i.watches, wd) + + // Remove the watch from the watch target. + watch.target.Watches.Remove(watch.ID()) + + // The watch is now isolated and we can safely drop the instance lock. We + // need to do so because watch.destroy() acquires Watch.mu, which cannot be + // aquired with Inotify.mu held. + i.mu.Unlock() + + // Generate the event for the removal. + i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) + + // Remove all pins. + watch.destroy() + + return nil +} diff --git a/pkg/sentry/fs/inotify_event.go b/pkg/sentry/fs/inotify_event.go new file mode 100644 index 000000000..217915ba4 --- /dev/null +++ b/pkg/sentry/fs/inotify_event.go @@ -0,0 +1,138 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
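AddWatch above either merges the requested events into an existing watch's mask or replaces the mask outright, depending on IN_MASK_ADD. Below is a minimal sketch of that rule, assuming the usual constant values from <linux/inotify.h>; updateMask is a hypothetical helper, not an API of this package.

```go
package main

import "fmt"

// Assumed constants from <linux/inotify.h>.
const (
	inMaskAdd = uint32(0x20000000) // IN_MASK_ADD
	inCreate  = uint32(0x00000100) // IN_CREATE
	inDelete  = uint32(0x00000200) // IN_DELETE
)

// updateMask applies inotify_add_watch semantics to an existing watch: with
// IN_MASK_ADD the requested events are OR-ed into the current mask, otherwise
// they replace it.
func updateMask(existing, requested uint32) uint32 {
	if requested&inMaskAdd != 0 {
		return existing | requested
	}
	return requested
}

func main() {
	fmt.Printf("%#x\n", updateMask(inCreate, inDelete))           // 0x200: mask replaced
	fmt.Printf("%#x\n", updateMask(inCreate, inDelete|inMaskAdd)) // 0x20000300: masks merged
}
```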
+ +package fs + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/ilist" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// inotifyEventBaseSize is the base size of linux's struct inotify_event. This +// must be a power 2 for rounding below. +const inotifyEventBaseSize = 16 + +// Event represents a struct inotify_event from linux. +type Event struct { + ilist.Entry + + wd int32 + mask uint32 + cookie uint32 + + // len is computed based on the name field is set automatically by + // Event.setName. It should be 0 when no name is set; otherwise it is the + // length of the name slice. + len uint32 + + // The name field has special padding requirements and should only be set by + // calling Event.setName. + name []byte +} + +func newEvent(wd int32, name string, events, cookie uint32) *Event { + e := &Event{ + wd: wd, + mask: events, + cookie: cookie, + } + if name != "" { + e.setName(name) + } + return e +} + +// paddedBytes converts a go string to a null-terminated c-string, padded with +// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes +// in the 's' plus at least one null byte. +func paddedBytes(s string, l uint32) []byte { + if l < uint32(len(s)+1) { + panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!") + } + b := make([]byte, l) + copy(b, s) + + // b was zero-value initialized during make(), so the rest of the slice is + // already filled with null bytes. + + return b +} + +// setName sets the optional name for this event. +func (e *Event) setName(name string) { + // We need to pad the name such that the entire event length ends up a + // multiple of inotifyEventBaseSize. + unpaddedLen := len(name) + 1 + // Round up to nearest multiple of inotifyEventBaseSize. + e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1)) + // Make sure we haven't overflowed and wrapped around when rounding. + if unpaddedLen > int(e.len) { + panic("Overflow when rounding inotify event size, the 'name' field was too big.") + } + e.name = paddedBytes(name, e.len) +} + +func (e *Event) sizeOf() int { + s := inotifyEventBaseSize + int(e.len) + if s < inotifyEventBaseSize { + panic("overflow") + } + return s +} + +// CopyTo serializes this event to dst. buf is used as a scratch buffer to +// construct the output. We use a buffer allocated ahead of time for +// performance. buf must be at least inotifyEventBaseSize bytes. +func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) { + usermem.ByteOrder.PutUint32(buf[0:], uint32(e.wd)) + usermem.ByteOrder.PutUint32(buf[4:], e.mask) + usermem.ByteOrder.PutUint32(buf[8:], e.cookie) + usermem.ByteOrder.PutUint32(buf[12:], e.len) + + writeLen := 0 + + n, err := dst.CopyOut(ctx, buf) + if err != nil { + return 0, err + } + writeLen += n + dst = dst.DropFirst(n) + + if e.len > 0 { + n, err = dst.CopyOut(ctx, e.name) + if err != nil { + return 0, err + } + writeLen += n + } + + // Santiy check. 
+ if writeLen != e.sizeOf() { + panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %v, wrote %v.", e.sizeOf(), writeLen)) + } + + return int64(writeLen), nil +} + +func (e *Event) equals(other *Event) bool { + return e.wd == other.wd && + e.mask == other.mask && + e.cookie == other.cookie && + e.len == other.len && + bytes.Equal(e.name, other.name) +} diff --git a/pkg/sentry/fs/inotify_watch.go b/pkg/sentry/fs/inotify_watch.go new file mode 100644 index 000000000..ff6ec6e3e --- /dev/null +++ b/pkg/sentry/fs/inotify_watch.go @@ -0,0 +1,129 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" +) + +// Watch represent a particular inotify watch created by inotify_add_watch. +// +// While a watch is active, it ensures the target inode is pinned in memory by +// holding an extra ref on each dirent known (by inotify) to point to the +// inode. These are known as pins. For a full discussion, see +// fs/g3doc/inotify.md. +type Watch struct { + // Inotify instance which owns this watch. + owner *Inotify + + // Descriptor for this watch. This is unique across an inotify instance. + wd int32 + + // Events being monitored via this watch. + mask uint32 + + // The inode being watched. Note that we don't directly hold a reference on + // this inode. Instead we hold a reference on the dirent(s) containing the + // inode, which we record in pins. + target *Inode + + // unpinned indicates whether we have a hard reference on target. This field + // may only be modified through atomic ops. + unpinned uint32 + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // pins is the set of dirents this watch is currently pinning in memory by + // holding a reference to them. See Pin()/Unpin(). + pins map[*Dirent]bool +} + +// ID returns the id of the inotify instance that owns this watch. +func (w *Watch) ID() uint64 { + return w.owner.id +} + +// NotifyParentAfterUnlink indicates whether the parent of the watched object +// should continue to be be notified of events after the target has been +// unlinked. +func (w *Watch) NotifyParentAfterUnlink() bool { + return w.mask&linux.IN_EXCL_UNLINK == 0 +} + +// isRenameEvent returns true if eventMask describes a rename event. +func isRenameEvent(eventMask uint32) bool { + return eventMask&(linux.IN_MOVED_FROM|linux.IN_MOVED_TO|linux.IN_MOVE_SELF) != 0 +} + +// Notify queues a new event on this watch. +func (w *Watch) Notify(name string, events uint32, cookie uint32) { + unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS + effectiveMask := unmaskableBits | w.mask + matchedEvents := effectiveMask & events + + if matchedEvents == 0 { + // We weren't watching for this event. + return + } + + w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie)) +} + +// Pin acquires a new ref on dirent, which pins the dirent in memory while +// the watch is active. 
Calling Pin for a second time on the same dirent for +// the same watch is a no-op. +func (w *Watch) Pin(d *Dirent) { + w.mu.Lock() + defer w.mu.Unlock() + if !w.pins[d] { + w.pins[d] = true + d.IncRef() + } +} + +// Unpin drops any extra refs held on dirent due to a previous Pin +// call. Calling Unpin multiple times for the same dirent, or on a dirent +// without a corresponding Pin call is a no-op. +func (w *Watch) Unpin(d *Dirent) { + w.mu.Lock() + defer w.mu.Unlock() + if w.pins[d] { + delete(w.pins, d) + d.DecRef() + } +} + +// TargetDestroyed notifies the owner of the watch that the watch target is +// gone. The owner should release its own references to the watcher upon +// receiving this notification. +func (w *Watch) TargetDestroyed() { + w.owner.targetDestroyed(w) +} + +// destroy prepares the watch for destruction. It unpins all dirents pinned by +// this watch. Destroy does not cause any new events to be generated. The caller +// is responsible for ensuring there are no outstanding references to this +// watch. +func (w *Watch) destroy() { + w.mu.Lock() + defer w.mu.Unlock() + for d := range w.pins { + d.DecRef() + } + w.pins = nil +} diff --git a/pkg/sentry/fs/lock/BUILD b/pkg/sentry/fs/lock/BUILD new file mode 100644 index 000000000..c15dde800 --- /dev/null +++ b/pkg/sentry/fs/lock/BUILD @@ -0,0 +1,72 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_generics:defs.bzl", "go_template_instance") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "lock_state", + srcs = [ + "lock.go", + "lock_range.go", + "lock_set.go", + ], + out = "lock_state.go", + package = "lock", +) + +go_template_instance( + name = "lock_range", + out = "lock_range.go", + package = "lock", + prefix = "Lock", + template = "//pkg/segment:generic_range", + types = { + "T": "uint64", + }, +) + +go_template_instance( + name = "lock_set", + out = "lock_set.go", + consts = { + "minDegree": "3", + }, + package = "lock", + prefix = "Lock", + template = "//pkg/segment:generic_set", + types = { + "Key": "uint64", + "Range": "LockRange", + "Value": "Lock", + "Functions": "lockSetFunctions", + }, +) + +go_library( + name = "lock", + srcs = [ + "lock.go", + "lock_range.go", + "lock_set.go", + "lock_set_functions.go", + "lock_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/lock", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/log", + "//pkg/state", + "//pkg/waiter", + ], +) + +go_test( + name = "lock_test", + size = "small", + srcs = [ + "lock_range_test.go", + "lock_test.go", + ], + embed = [":lock"], +) diff --git a/pkg/sentry/fs/lock/lock.go b/pkg/sentry/fs/lock/lock.go new file mode 100644 index 000000000..24d54c989 --- /dev/null +++ b/pkg/sentry/fs/lock/lock.go @@ -0,0 +1,457 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Package lock is the API for POSIX-style advisory regional file locks and +// BSD-style full file locks. +// +// Callers needing to enforce these types of locks, like sys_fcntl, can call +// LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are +// specific to a unique file (unique device/inode pair) and for this reason +// should not be shared between files. +// +// A Lock has a set of holders identified by UniqueID. Normally this is the +// pid of the thread attempting to acquire the lock. +// +// Since these are advisory locks, they do not need to be integrated into +// Reads/Writes and for this reason there is no way to *check* if a lock is +// held. One can only attempt to take a lock or unlock an existing lock. +// +// A Lock in a set of Locks is typed: it is either a read lock with any number +// of readers and no writer, or a write lock with no readers. +// +// As expected from POSIX, any attempt to acquire a write lock on a file region +// when there already exits a write lock held by a different uid will fail. Any +// attempt to acquire a write lock on a file region when there is more than one +// reader will fail. Any attempt to acquire a read lock on a file region when +// there is already a writer will fail. +// +// In special cases, a read lock may be upgraded to a write lock and a write lock +// can be downgraded to a read lock. This can only happen if: +// +// * read lock upgrade to write lock: There can be only one reader and the reader +// must be the same as the requested write lock holder. +// +// * write lock downgrade to read lock: The writer must be the same as the requested +// read lock holder. +// +// UnlockRegion always succeeds. If LockRegion fails the caller should normally +// interpret this as "try again later". +package lock + +import ( + "fmt" + "math" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// LockType is a type of regional file lock. +type LockType int + +// UniqueID is a unique identifier of the holder of a regional file lock. +type UniqueID uint64 + +const ( + // ReadLock describes a POSIX regional file lock to be taken + // read only. There may be multiple of these locks on a single + // file region as long as there is no writer lock on the same + // region. + ReadLock LockType = iota + + // WriteLock describes a POSIX regional file lock to be taken + // write only. There may be only a single holder of this lock + // and no read locks. + WriteLock +) + +// LockEOF is the maximal possible end of a regional file lock. +const LockEOF = math.MaxUint64 + +// Lock is a regional file lock. It consists of either a single writer +// or a set of readers. +// +// A Lock may be upgraded from a read lock to a write lock only if there +// is a single reader and that reader has the same uid as the write lock. +// +// A Lock may be downgraded from a write lock to a read lock only if +// the write lock's uid is the same as the read lock. +type Lock struct { + // Readers are the set of read lock holders identified by UniqueID. + // If len(Readers) > 0 then HasWriter must be false. + Readers map[UniqueID]bool + + // HasWriter indicates that this is a write lock held by a single + // UniqueID. + HasWriter bool + + // Writer is only valid if HasWriter is true. It identifies a + // single write lock holder. + Writer UniqueID +} + +// Locks is a thread-safe wrapper around a LockSet. +type Locks struct { + // mu protects locks below. 
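+	// mu is also held while registering waiters in LockRegion and while
+	// notifying them in UnlockRegion, so a blocked locker cannot miss the
+	// wakeup for an unlock that races with its registration.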
+ mu sync.Mutex `state:"nosave"` + + // locks is the set of region locks currently held on an Inode. + locks LockSet + + // blockedQueue is the queue of waiters that are waiting on a lock. + blockedQueue waiter.Queue +} + +// Blocker is the interface used for blocking locks. Passing a nil Blocker +// will be treated as non-blocking. +type Blocker interface { + Block(C chan struct{}) error +} + +const ( + // EventMaskAll is the mask we will always use for locks, by using the + // same mask all the time we can wake up everyone anytime the lock + // changes state. + EventMaskAll waiter.EventMask = 0xFFFF +) + +// LockRegion attempts to acquire a typed lock for the uid on a region +// of a file. Returns true if successful in locking the region. If false +// is returned, the caller should normally interpret this as "try again later" if +// accquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. +// Blocker is the interface used to provide blocking behavior, passing a nil Blocker +// will result in non-blocking behavior. +func (l *Locks) LockRegion(uid UniqueID, t LockType, r LockRange, block Blocker) bool { + for { + l.mu.Lock() + + // Blocking locks must run in a loop because we'll be woken up whenever an unlock event + // happens for this lock. We will then attempt to take the lock again and if it fails + // continue blocking. + res := l.locks.lock(uid, t, r) + if !res && block != nil { + e, ch := waiter.NewChannelEntry(nil) + l.blockedQueue.EventRegister(&e, EventMaskAll) + l.mu.Unlock() + if err := block.Block(ch); err != nil { + // We were interrupted, the caller can translate this to EINTR if applicable. + l.blockedQueue.EventUnregister(&e) + return false + } + l.blockedQueue.EventUnregister(&e) + continue // Try again now that someone has unlocked. + } + + l.mu.Unlock() + return res + } +} + +// UnlockRegion attempts to release a lock for the uid on a region of a file. +// This operation is always successful, even if there did not exist a lock on +// the requested region held by uid in the first place. +func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { + l.mu.Lock() + defer l.mu.Unlock() + l.locks.unlock(uid, r) + + // Now that we've released the lock, we need to wake up any waiters. + l.blockedQueue.Notify(EventMaskAll) +} + +// makeLock returns a new typed Lock that has either uid as its only reader +// or uid as its only writer. +func makeLock(uid UniqueID, t LockType) Lock { + value := Lock{Readers: make(map[UniqueID]bool)} + switch t { + case ReadLock: + value.Readers[uid] = true + case WriteLock: + value.HasWriter = true + value.Writer = uid + default: + panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) + } + return value +} + +// isHeld returns true if uid is a holder of Lock. +func (l Lock) isHeld(uid UniqueID) bool { + if l.HasWriter && l.Writer == uid { + return true + } + return l.Readers[uid] +} + +// lock sets uid as a holder of a typed lock on Lock. +// +// Preconditions: canLock is true for the range containing this Lock. +func (l *Lock) lock(uid UniqueID, t LockType) { + switch t { + case ReadLock: + // If we are already a reader, then this is a no-op. + if l.Readers[uid] { + return + } + // We cannot downgrade a write lock to a read lock unless the + // uid is the same. + if l.HasWriter { + if l.Writer != uid { + panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) + } + // Ensure that there is only one reader if upgrading. 
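+			// Because HasWriter implies there are no existing readers, this simply
+			// (re)creates an empty map before uid is added as the sole reader below.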
+ l.Readers = make(map[UniqueID]bool) + // Ensure that there is no longer a writer. + l.HasWriter = false + } + l.Readers[uid] = true + return + case WriteLock: + // If we are already the writer, then this is a no-op. + if l.HasWriter && l.Writer == uid { + return + } + // We can only upgrade a read lock to a write lock if there + // is only one reader and that reader has the same uid as + // the write lock. + if readers := len(l.Readers); readers > 0 { + if readers != 1 { + panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) + } + if !l.Readers[uid] { + panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) + } + } + // Ensure that there is only a writer. + l.Readers = make(map[UniqueID]bool) + l.HasWriter = true + l.Writer = uid + default: + panic(fmt.Sprintf("lock: invalid lock type %d", t)) + } +} + +// lockable returns true if check returns true for every Lock in LockRange. +// Further, check should return true if Lock meets the callers requirements +// for locking Lock. +func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { + // Get our starting point. + seg := l.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + // Note that we don't care about overruning the end of the + // last segment because if everything checks out we'll just + // split the last segment. + if !check(seg.Value()) { + return false + } + // Jump to the next segment, ignoring gaps, for the same + // reason we ignored the first gap. + seg = seg.NextSegment() + } + // No conflict, we can get a lock for uid over the entire range. + return true +} + +// canLock returns true if uid will be able to take a Lock of type t on the +// entire range specified by LockRange. +func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { + switch t { + case ReadLock: + return l.lockable(r, func(value Lock) bool { + // If there is no writer, there's no problem adding + // another reader. + if !value.HasWriter { + return true + } + // If there is a writer, then it must be the same uid + // in order to downgrade the lock to a read lock. + return value.Writer == uid + }) + case WriteLock: + return l.lockable(r, func(value Lock) bool { + // If there are only readers. + if !value.HasWriter { + // Then this uid can only take a write lock if + // this is a private upgrade, meaning that the + // only reader is uid. + return len(value.Readers) == 1 && value.Readers[uid] + } + // If the uid is already a writer on this region, then + // adding a write lock would be a no-op. + return value.Writer == uid + }) + default: + panic(fmt.Sprintf("canLock: invalid lock type %d", t)) + } +} + +// lock returns true if uid took a lock of type t on the entire range of LockRange. +// +// Preconditions: r.Start <= r.End (will panic otherwise). +func (l *LockSet) lock(uid UniqueID, t LockType, r LockRange) bool { + if r.Start > r.End { + panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) + } + + // Don't attempt to insert anything with a range of 0 and treat this + // as a successful no-op. + if r.Length() == 0 { + return true + } + + // Do a first-pass check. We *could* hold onto the segments we + // checked if canLock would return true, but traversing the segment + // set should be fast and this keeps things simple. + if !l.canLock(uid, t, r) { + return false + } + // Get our starting point. 
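+	// Find returns either the segment containing r.Start or the gap in which
+	// r.Start falls. The loop below then walks forward, splitting segments at
+	// r.Start/r.End where needed, applying the lock to existing segments, and
+	// filling any gaps with new segments held only by uid.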
+ seg, gap := l.Find(r.Start) + if gap.Ok() { + // Fill in the gap and get the next segment to modify. + seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, t)).NextSegment() + } else if seg.Start() < r.Start { + // Get our first segment to modify. + _, seg = l.Split(seg, r.Start) + } + for seg.Ok() && seg.Start() < r.End { + // Split the last one if necessary. + if seg.End() > r.End { + seg, _ = l.SplitUnchecked(seg, r.End) + } + + // Set the lock on the segment. This is guaranteed to + // always be safe, given canLock above. + value := seg.ValuePtr() + value.lock(uid, t) + + // Fill subsequent gaps. + gap = seg.NextGap() + if gr := gap.Range().Intersect(r); gr.Length() > 0 { + seg = l.Insert(gap, gr, makeLock(uid, t)).NextSegment() + } else { + seg = gap.NextSegment() + } + } + return true +} + +// unlock is always successful. If uid has no locks held for the range LockRange, +// unlock is a no-op. +// +// Preconditions: same as lock. +func (l *LockSet) unlock(uid UniqueID, r LockRange) { + if r.Start > r.End { + panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) + } + + // Same as setlock. + if r.Length() == 0 { + return + } + + // Get our starting point. + seg := l.LowerBoundSegment(r.Start) + for seg.Ok() && seg.Start() < r.End { + // If this segment doesn't have a lock from uid then + // there is no need to fragment the set with Isolate (below). + // In this case just move on to the next segment. + if !seg.Value().isHeld(uid) { + seg = seg.NextSegment() + continue + } + + // Ensure that if we need to unlock a sub-segment that + // we don't unlock/remove that entire segment. + seg = l.Isolate(seg, r) + + value := seg.Value() + var remove bool + if value.HasWriter && value.Writer == uid { + // If we are unlocking a writer, then since there can + // only ever be one writer and no readers, then this + // lock should always be removed from the set. + remove = true + } else if value.Readers[uid] { + // If uid is the last reader, then just remove the entire + // segment. + if len(value.Readers) == 1 { + remove = true + } else { + // Otherwise we need to remove this reader without + // affecting any other segment's readers. To do + // this, we need to make a copy of the Readers map + // and not add this uid. + newValue := Lock{Readers: make(map[UniqueID]bool)} + for k, v := range value.Readers { + if k != uid { + newValue.Readers[k] = v + } + } + seg.SetValue(newValue) + } + } + if remove { + seg = l.Remove(seg).NextSegment() + } else { + seg = seg.NextSegment() + } + } +} + +// ComputeRange takes a positive file offset and computes the start of a LockRange +// using start (relative to offset) and the end of the LockRange using length. The +// values of start and length may be negative but the resulting LockRange must +// preserve that LockRange.Start < LockRange.End and LockRange.Start > 0. +func ComputeRange(start, length, offset int64) (LockRange, error) { + offset += start + // fcntl(2): "l_start can be a negative number provided the offset + // does not lie before the start of the file" + if offset < 0 { + return LockRange{}, syscall.EINVAL + } + + // fcntl(2): Specifying 0 for l_len has the special meaning: lock all + // bytes starting at the location specified by l_whence and l_start + // through to the end of file, no matter how large the file grows. + end := uint64(LockEOF) + if length > 0 { + // fcntl(2): If l_len is positive, then the range to be locked + // covers bytes l_start up to and including l_start+l_len-1. 
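+		//
+		// For example (illustrative values): with offset 0, l_start=100 and
+		// l_len=10 this yields LockRange{Start: 100, End: 110}, i.e. bytes
+		// 100 through 109.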
+ // + // Since LockRange.End is exclusive we need not -1 from length.. + end = uint64(offset + length) + } else if length < 0 { + // fcntl(2): If l_len is negative, the interval described by + // lock covers bytes l_start+l_len up to and including l_start-1. + // + // Since LockRange.End is exclusive we need not -1 from offset. + signedEnd := offset + // Add to offset using a negative length (subtract). + offset += length + if offset < 0 { + return LockRange{}, syscall.EINVAL + } + if signedEnd < offset { + return LockRange{}, syscall.EOVERFLOW + } + // At this point signedEnd cannot be negative, + // since we asserted that offset is not negative + // and it is not less than offset. + end = uint64(signedEnd) + } + // Offset is guaranteed to be positive at this point. + return LockRange{Start: uint64(offset), End: end}, nil +} diff --git a/pkg/sentry/fs/lock/lock_range_test.go b/pkg/sentry/fs/lock/lock_range_test.go new file mode 100644 index 000000000..06a37c701 --- /dev/null +++ b/pkg/sentry/fs/lock/lock_range_test.go @@ -0,0 +1,136 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "syscall" + "testing" +) + +func TestComputeRange(t *testing.T) { + tests := []struct { + // Description of test. + name string + + // Requested start of the lock range. + start int64 + + // Requested length of the lock range, + // can be negative :( + length int64 + + // Pre-computed file offset based on whence. + // Will be added to start. + offset int64 + + // Expected error. + err error + + // If error is nil, the expected LockRange. 
+ LockRange + }{ + { + name: "offset, start, and length all zero", + LockRange: LockRange{Start: 0, End: LockEOF}, + }, + { + name: "zero offset, zero start, positive length", + start: 0, + length: 4096, + offset: 0, + LockRange: LockRange{Start: 0, End: 4096}, + }, + { + name: "zero offset, negative start", + start: -4096, + offset: 0, + err: syscall.EINVAL, + }, + { + name: "large offset, negative start, positive length", + start: -2048, + length: 2048, + offset: 4096, + LockRange: LockRange{Start: 2048, End: 4096}, + }, + { + name: "large offset, negative start, zero length", + start: -2048, + length: 0, + offset: 4096, + LockRange: LockRange{Start: 2048, End: LockEOF}, + }, + { + name: "zero offset, zero start, negative length", + start: 0, + length: -4096, + offset: 0, + err: syscall.EINVAL, + }, + { + name: "large offset, zero start, negative length", + start: 0, + length: -4096, + offset: 4096, + LockRange: LockRange{Start: 0, End: 4096}, + }, + { + name: "offset, start, and length equal, length is negative", + start: 1024, + length: -1024, + offset: 1024, + LockRange: LockRange{Start: 1024, End: 2048}, + }, + { + name: "offset, start, and length equal, start is negative", + start: -1024, + length: 1024, + offset: 1024, + LockRange: LockRange{Start: 0, End: 1024}, + }, + { + name: "offset, start, and length equal, offset is negative", + start: 1024, + length: 1024, + offset: -1024, + LockRange: LockRange{Start: 0, End: 1024}, + }, + { + name: "offset, start, and length equal, all negative", + start: -1024, + length: -1024, + offset: -1024, + err: syscall.EINVAL, + }, + { + name: "offset, start, and length equal, all positive", + start: 1024, + length: 1024, + offset: 1024, + LockRange: LockRange{Start: 2048, End: 3072}, + }, + } + + for _, test := range tests { + rng, err := ComputeRange(test.start, test.length, test.offset) + if err != test.err { + t.Errorf("%s: lockRange(%d, %d, %d) got error %v, want %v", test.name, test.start, test.length, test.offset, err, test.err) + continue + } + if err == nil && rng != test.LockRange { + t.Errorf("%s: lockRange(%d, %d, %d) got LockRange %v, want %v", test.name, test.start, test.length, test.offset, rng, test.LockRange) + } + } +} diff --git a/pkg/sentry/fs/lock/lock_set_functions.go b/pkg/sentry/fs/lock/lock_set_functions.go new file mode 100644 index 000000000..e16f485be --- /dev/null +++ b/pkg/sentry/fs/lock/lock_set_functions.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "math" +) + +// LockSet maps a set of Locks into a file. The key is the file offset. + +type lockSetFunctions struct{} + +func (lockSetFunctions) MinKey() uint64 { + return 0 +} + +func (lockSetFunctions) MaxKey() uint64 { + return math.MaxUint64 +} + +func (lockSetFunctions) ClearValue(l *Lock) { + *l = Lock{} +} + +func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) (Lock, bool) { + // Merge only if the Readers/Writers are identical. 
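+	// Otherwise the adjacent ranges are held differently and must remain
+	// separate segments; returning false tells the segment set not to
+	// coalesce them.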
+ if len(val1.Readers) != len(val2.Readers) { + return Lock{}, false + } + for k := range val1.Readers { + if !val2.Readers[k] { + return Lock{}, false + } + } + if val1.HasWriter != val2.HasWriter { + return Lock{}, false + } + if val1.HasWriter { + if val1.Writer != val2.Writer { + return Lock{}, false + } + } + return val1, true +} + +func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) { + // Copy the segment so that split segments don't contain map references + // to other segments. + val0 := Lock{Readers: make(map[UniqueID]bool)} + for k, v := range val.Readers { + val0.Readers[k] = v + } + val0.HasWriter = val.HasWriter + val0.Writer = val.Writer + + return val, val0 +} diff --git a/pkg/sentry/fs/lock/lock_test.go b/pkg/sentry/fs/lock/lock_test.go new file mode 100644 index 000000000..c60f5f7a2 --- /dev/null +++ b/pkg/sentry/fs/lock/lock_test.go @@ -0,0 +1,1059 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "reflect" + "testing" +) + +type entry struct { + Lock + LockRange +} + +func equals(e0, e1 []entry) bool { + if len(e0) != len(e1) { + return false + } + for i := range e0 { + for k := range e0[i].Lock.Readers { + if !e1[i].Lock.Readers[k] { + return false + } + } + for k := range e1[i].Lock.Readers { + if !e0[i].Lock.Readers[k] { + return false + } + } + if !reflect.DeepEqual(e0[i].LockRange, e1[i].LockRange) { + return false + } + if e0[i].Lock.HasWriter != e1[i].Lock.HasWriter { + return false + } + if e0[i].Lock.Writer != e1[i].Lock.Writer { + return false + } + } + return true +} + +// fill a LockSet with consecutive region locks. Will panic if +// LockRanges are not consecutive. +func fill(entries []entry) LockSet { + l := LockSet{} + for _, e := range entries { + gap := l.FindGap(e.LockRange.Start) + if !gap.Ok() { + panic("cannot insert into existing segment") + } + l.Insert(gap, e.LockRange, e.Lock) + } + return l +} + +func TestCanLockEmpty(t *testing.T) { + l := LockSet{} + + // Expect to be able to take any locks given that the set is empty. 
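+	// On an empty set FirstGap() spans the whole key space, so eof is LockEOF
+	// and r covers every possible byte.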
+ eof := l.FirstGap().End() + r := LockRange{0, eof} + if !l.canLock(1, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 1) + } + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + if !l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 2) + } +} + +func TestCanLock(t *testing.T) { + // + -------------- + ---------- + -------------- + --------- + + // | Readers 1 & 2 | Readers 1 | Readers 1 & 3 | Writer 1 | + // + ------------- + ---------- + -------------- + --------- + + // 0 1024 2048 3072 4096 + l := fill([]entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 3: true}}, + LockRange: LockRange{2048, 3072}, + }, + { + Lock: Lock{HasWriter: true, Writer: 1}, + LockRange: LockRange{3072, 4096}, + }, + }) + + // Now that we have a mildly interesting layout, try some checks on different + // ranges, uids, and lock types. + // + // Expect to be able to extend the read lock, despite the writer lock, because + // the writer has the same uid as the requested read lock. + r := LockRange{0, 8192} + if !l.canLock(1, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 1) + } + // Expect to *not* be able to extend the read lock since there is an overlapping + // writer region locked by someone other than the uid. + if l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", ReadLock, r, 2) + } + // Expect to be able to extend the read lock if there are only other readers in + // the way. + r = LockRange{64, 3072} + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + // Expect to be able to set a read lock beyond the range of any existing locks. + r = LockRange{4096, 10240} + if !l.canLock(2, ReadLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", ReadLock, r, 2) + } + + // Expect to not be able to take a write lock with other readers in the way. + r = LockRange{0, 8192} + if l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", WriteLock, r, 1) + } + // Expect to be able to extend the write lock for the same uid. + r = LockRange{3072, 8192} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + // Expect to not be able to overlap a write lock for two different uids. + if l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got true, want false", WriteLock, r, 2) + } + // Expect to be able to set a write lock that is beyond the range of any + // existing locks. + r = LockRange{8192, 10240} + if !l.canLock(2, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 2) + } + // Expect to be able to upgrade a read lock (any portion of it). 
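+	// The region [1024, 2048) is held only by reader 1, so uid 1 may upgrade
+	// both the full region and the narrower sub-range checked below.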
+ r = LockRange{1024, 2048} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } + r = LockRange{1080, 2000} + if !l.canLock(1, WriteLock, r) { + t.Fatalf("canLock type %d for range %v and uid %d got false, want true", WriteLock, r, 1) + } +} + +func TestSetLock(t *testing.T) { + tests := []struct { + // description of test. + name string + + // LockSet entries to pre-fill. + before []entry + + // Description of region to lock: + // + // start is the file offset of the lock. + start uint64 + // end is the end file offset of the lock. + end uint64 + // uid of lock attempter. + uid UniqueID + // lock type requested. + lockType LockType + + // success is true if taking the above + // lock should succeed. + success bool + + // Expected layout of the set after locking + // if success is true. + after []entry + }{ + { + name: "set zero length ReadLock on empty set", + start: 0, + end: 0, + uid: 0, + lockType: ReadLock, + success: true, + }, + { + name: "set zero length WriteLock on empty set", + start: 0, + end: 0, + uid: 0, + lockType: WriteLock, + success: true, + }, + { + name: "set ReadLock on empty set", + start: 0, + end: LockEOF, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "set WriteLock on empty set", + start: 0, + end: LockEOF, + uid: 0, + lockType: WriteLock, + success: true, + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "set ReadLock on WriteLock same uid", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------- + --------------------------- + + // | Readers 0 | Writer 0 | + // + ----------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "set WriteLock on ReadLock same uid", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + lockType: WriteLock, + success: true, + // + ----------- + --------------------------- + + // | Writer 0 | Readers 0 | + // + ----------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "set ReadLock on WriteLock different uid", + // + ----------------------------------------- + + // | Writer 0 | + // + 
----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: false, + }, + { + name: "set WriteLock on ReadLock different uid", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: WriteLock, + success: false, + }, + { + name: "split ReadLock for overlapping lock at start 0", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: true, + // + -------------- + --------------------------- + + // | Readers 0 & 1 | Readers 0 | + // + -------------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "split ReadLock for overlapping lock at non-zero start", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: 8192, + uid: 1, + lockType: ReadLock, + success: true, + // + ---------- + -------------- + ----------- + + // | Readers 0 | Readers 0 & 1 | Readers 0 | + // + ---------- + -------------- + ----------- + + // 0 4096 8192 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, 8192}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{8192, LockEOF}, + }, + }, + }, + { + name: "fill front gap with ReadLock", + // + --------- + ---------------------------- + + // | gap | Readers 0 | + // + --------- + ---------------------------- + + // 0 1024 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + start: 0, + end: 8192, + uid: 0, + lockType: ReadLock, + success: true, + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "fill end gap with ReadLock", + // + ---------------------------- + + // | Readers 0 | + // + ---------------------------- + + // 0 4096 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + }, + start: 1024, + end: LockEOF, + uid: 0, + lockType: ReadLock, + success: true, + // Note that this is not merged after lock does a Split. This is + // fine because the two locks will still *behave* as one. 
In other + // words we can fragment any lock all we want and semantically it + // makes no difference. + // + // + ----------- + --------------------------- + + // | Readers 0 | Readers 0 | + // + ----------- + --------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + }, + { + name: "fill gap with ReadLock and split", + // + --------- + ---------------------------- + + // | gap | Readers 0 | + // + --------- + ---------------------------- + + // 0 1024 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 1, + lockType: ReadLock, + success: true, + // + --------- + ------------- + ------------- + + // | Reader 1 | Readers 0 & 1 | Reader 0 | + // + ----------+ ------------- + ------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "upgrade ReadLock to WriteLock for single uid fill gap", + // + ------------- + --------- + --- + ------------- + + // | Readers 0 & 1 | Readers 0 | gap | Readers 0 & 2 | + // + ------------- + --------- + --- + ------------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + lockType: WriteLock, + success: true, + // + ------------- + -------- + ------------- + + // | Readers 0 & 1 | Writer 0 | Readers 0 & 2 | + // + ------------- + -------- + ------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "upgrade ReadLock to WriteLock for single uid keep gap", + // + ------------- + --------- + --- + ------------- + + // | Readers 0 & 1 | Readers 0 | gap | Readers 0 & 2 | + // + ------------- + --------- + --- + ------------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 1024, + end: 3072, + uid: 0, + lockType: WriteLock, + success: true, + // + ------------- + -------- + --- + ------------- + + // | Readers 0 & 1 | Writer 0 | gap | Readers 0 & 2 | + // + ------------- + -------- + --- + ------------- + + // 0 1024 3072 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + 
}, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 3072}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "fail to upgrade ReadLock to WriteLock with conflicting Reader", + // + ------------- + --------- + + // | Readers 0 & 1 | Readers 0 | + // + ------------- + --------- + + // 0 1024 2048 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + }, + start: 0, + end: 2048, + uid: 0, + lockType: WriteLock, + success: false, + }, + { + name: "take WriteLock on whole file if all uids are the same", + // + ------------- + --------- + --------- + ---------- + + // | Writer 0 | Readers 0 | Readers 0 | Readers 0 | + // + ------------- + --------- + --------- + ---------- + + // 0 1024 2048 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{1024, 2048}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{2048, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + lockType: WriteLock, + success: true, + // We do not manually merge locks. Semantically a fragmented lock + // held by the same uid will behave as one lock so it makes no difference. + // + // + ------------- + ---------------------------- + + // | Writer 0 | Writer 0 | + // + ------------- + ---------------------------- + + // 0 1024 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, LockEOF}, + }, + }, + }, + } + + for _, test := range tests { + l := fill(test.before) + + r := LockRange{Start: test.start, End: test.end} + success := l.lock(test.uid, test.lockType, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + + if success != test.success { + t.Errorf("%s: setlock(%v, %+v, %d, %d) got success %v, want %v", test.name, test.before, r, test.uid, test.lockType, success, test.success) + continue + } + + if success { + if !equals(got, test.after) { + t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + } + } + } +} + +func TestUnlock(t *testing.T) { + tests := []struct { + // description of test. + name string + + // LockSet entries to pre-fill. + before []entry + + // Description of region to unlock: + // + // start is the file start of the lock. + start uint64 + // end is the end file start of the lock. + end uint64 + // uid of lock holder. + uid UniqueID + + // Expected layout of the set after unlocking. 
+ after []entry + }{ + { + name: "unlock zero length on empty set", + start: 0, + end: 0, + uid: 0, + }, + { + name: "unlock on empty set (no-op)", + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock uid not locked (no-op)", + // + --------------------------- + + // | Readers 1 & 2 | + // + --------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + // + --------------------------- + + // | Readers 1 & 2 | + // + --------------------------- + + // 0 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + }, + { + name: "unlock ReadLock over entire file", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock WriteLock over entire file", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + }, + { + name: "unlock partial ReadLock (start)", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + // + ------ + --------------------------- + + // | gap | Readers 0 | + // +------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock partial WriteLock (start)", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 0, + end: 4096, + uid: 0, + // + ------ + --------------------------- + + // | gap | Writer 0 | + // +------- + --------------------------- + + // 0 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock partial ReadLock (end)", + // + ----------------------------------------- + + // | Readers 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: LockEOF, + uid: 0, + // + --------------------------- + + // | Readers 0 | + // +---------------------------- + + // 0 4096 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true}}, + LockRange: LockRange{0, 4096}, + }, + }, + }, + { + name: "unlock partial WriteLock (end)", + // + ----------------------------------------- + + // | Writer 0 | + // + ----------------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 4096, + end: LockEOF, + 
uid: 0, + // + --------------------------- + + // | Writer 0 | + // +---------------------------- + + // 0 4096 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 4096}, + }, + }, + }, + { + name: "unlock for single uid", + // + ------------- + --------- + ------------------- + + // | Readers 0 & 1 | Writer 0 | Readers 0 & 1 & 2 | + // + ------------- + --------- + ------------------- + + // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 0, + end: LockEOF, + uid: 0, + // + --------- + --- + --------------- + + // | Readers 1 | gap | Readers 1 & 2 | + // + --------- + --- + --------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock subsection locked", + // + ------------------------------- + + // | Readers 0 & 1 & 2 | + // + ------------------------------- + + // 0 max uint64 + before: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{0, LockEOF}, + }, + }, + start: 1024, + end: 4096, + uid: 0, + // + ----------------- + ------------- + ----------------- + + // | Readers 0 & 1 & 2 | Readers 1 & 2 | Readers 0 & 1 & 2 | + // + ----------------- + ------------- + ----------------- + + // 0 1024 4096 max uint64 + after: []entry{ + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true, 2: true}}, + LockRange: LockRange{1024, 4096}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true, 2: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock mid-gap to increase gap", + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 8, + end: 2048, + uid: 0, + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 8 4096 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 8}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + }, + { + name: "unlock split region on uid mid-gap", + // + --------- + ----- + ------------------- + + // | Writer 0 | gap | Readers 0 & 1 | + // + --------- + ----- + ------------------- + + // 0 1024 4096 max uint64 + before: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{4096, LockEOF}, + }, + }, + start: 2048, + end: 8192, + uid: 0, + // + --------- + ----- + --------- + ------------- + + // | Writer 0 | gap | Readers 1 | Readers 0 & 1 | + // 
+ --------- + ----- + --------- + ------------- + + // 0 1024 4096 8192 max uint64 + after: []entry{ + { + Lock: Lock{HasWriter: true, Writer: 0}, + LockRange: LockRange{0, 1024}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{1: true}}, + LockRange: LockRange{4096, 8192}, + }, + { + Lock: Lock{Readers: map[UniqueID]bool{0: true, 1: true}}, + LockRange: LockRange{8192, LockEOF}, + }, + }, + }, + } + + for _, test := range tests { + l := fill(test.before) + + r := LockRange{Start: test.start, End: test.end} + l.unlock(test.uid, r) + var got []entry + for seg := l.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { + got = append(got, entry{ + Lock: seg.Value(), + LockRange: seg.Range(), + }) + } + if !equals(got, test.after) { + t.Errorf("%s: got set %+v, want %+v", test.name, got, test.after) + } + } +} diff --git a/pkg/sentry/fs/mock.go b/pkg/sentry/fs/mock.go new file mode 100644 index 000000000..b3bfa5268 --- /dev/null +++ b/pkg/sentry/fs/mock.go @@ -0,0 +1,177 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// MockInodeOperations implements InodeOperations for testing Inodes. +type MockInodeOperations struct { + InodeOperations + + UAttr UnstableAttr + + createCalled bool + createDirectoryCalled bool + createLinkCalled bool + renameCalled bool + walkCalled bool +} + +// NewMockInode returns a mock *Inode using MockInodeOperations. +func NewMockInode(ctx context.Context, msrc *MountSource, sattr StableAttr) *Inode { + return NewInode(NewMockInodeOperations(ctx), msrc, sattr) +} + +// NewMockInodeOperations returns a *MockInodeOperations. +func NewMockInodeOperations(ctx context.Context) *MockInodeOperations { + return &MockInodeOperations{ + UAttr: WithCurrentTime(ctx, UnstableAttr{ + Perms: FilePermsFromMode(0777), + }), + } +} + +// MockMountSourceOps implements fs.MountSourceOperations. +type MockMountSourceOps struct { + MountSourceOperations + keep bool + revalidate bool +} + +// NewMockMountSource returns a new *MountSource using MockMountSourceOps. +func NewMockMountSource(cache *DirentCache) *MountSource { + var keep bool + if cache != nil { + keep = cache.maxSize > 0 + } + return &MountSource{ + MountSourceOperations: &MockMountSourceOps{keep: keep}, + fscache: cache, + children: make(map[*MountSource]struct{}), + } +} + +// Revalidate implements fs.MountSourceOperations.Revalidate. +func (n *MockMountSourceOps) Revalidate(*Dirent) bool { + return n.revalidate +} + +// Keep implements fs.MountSourceOperations.Keep. +func (n *MockMountSourceOps) Keep(dirent *Dirent) bool { + return n.keep +} + +// WriteOut implements fs.InodeOperations.WriteOut. +func (n *MockInodeOperations) WriteOut(context.Context, *Inode) error { + return nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. 
+func (n *MockInodeOperations) UnstableAttr(context.Context, *Inode) (UnstableAttr, error) { + return n.UAttr, nil +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (n *MockInodeOperations) IsVirtual() bool { + return false +} + +// Lookup implements fs.InodeOperations.Lookup. +func (n *MockInodeOperations) Lookup(ctx context.Context, dir *Inode, p string) (*Dirent, error) { + n.walkCalled = true + return NewDirent(NewInode(&MockInodeOperations{}, dir.MountSource, StableAttr{}), p), nil +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (n *MockInodeOperations) SetPermissions(context.Context, *Inode, FilePermissions) bool { + return false +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (*MockInodeOperations) SetOwner(context.Context, *Inode, FileOwner) error { + return syserror.EINVAL +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (n *MockInodeOperations) SetTimestamps(context.Context, *Inode, TimeSpec) error { + return nil +} + +// Create implements fs.InodeOperations.Create. +func (n *MockInodeOperations) Create(ctx context.Context, dir *Inode, p string, flags FileFlags, perms FilePermissions) (*File, error) { + n.createCalled = true + d := NewDirent(NewInode(&MockInodeOperations{}, dir.MountSource, StableAttr{}), p) + return &File{Dirent: d}, nil +} + +// CreateLink implements fs.InodeOperations.CreateLink. +func (n *MockInodeOperations) CreateLink(_ context.Context, dir *Inode, oldname string, newname string) error { + n.createLinkCalled = true + return nil +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +func (n *MockInodeOperations) CreateDirectory(context.Context, *Inode, string, FilePermissions) error { + n.createDirectoryCalled = true + return nil +} + +// Rename implements fs.InodeOperations.Rename. +func (n *MockInodeOperations) Rename(ctx context.Context, oldParent *Inode, oldName string, newParent *Inode, newName string) error { + n.renameCalled = true + return nil +} + +// Check implements fs.InodeOperations.Check. +func (n *MockInodeOperations) Check(ctx context.Context, inode *Inode, p PermMask) bool { + return ContextCanAccessFile(ctx, inode, p) +} + +// Release implements fs.InodeOperations.Release. +func (n *MockInodeOperations) Release(context.Context) {} + +// Truncate implements fs.InodeOperations.Truncate. +func (n *MockInodeOperations) Truncate(ctx context.Context, inode *Inode, size int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (n *MockInodeOperations) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, nil +} + +// DeprecatedReaddir implements fs.InodeOperations.DeprecatedReaddir. +func (n *MockInodeOperations) DeprecatedReaddir(context.Context, *DirCtx, int) (int, error) { + return 0, nil +} + +// Remove implements fs.InodeOperations.Remove. +func (n *MockInodeOperations) Remove(context.Context, *Inode, string) error { + return nil +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +func (n *MockInodeOperations) RemoveDirectory(context.Context, *Inode, string) error { + return nil +} + +// Getlink implements fs.InodeOperations.Getlink. 
+func (n *MockInodeOperations) Getlink(context.Context, *Inode) (*Dirent, error) { + return nil, syserror.ENOLINK +} diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go new file mode 100644 index 000000000..a2943b097 --- /dev/null +++ b/pkg/sentry/fs/mount.go @@ -0,0 +1,298 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "bytes" + "fmt" + "sync" + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/refs" +) + +// DirentOperations provide file systems greater control over how long a Dirent stays pinned +// in core. Implementations must not take Dirent.mu. +type DirentOperations interface { + // Revalidate returns true if the Dirent is stale and its InodeOperations needs to be reloaded. Revalidate + // will never be called on a Dirent that is mounted. + Revalidate(dirent *Dirent) bool + + // Keep returns true if the Dirent should be kept in memory for as long as possible + // beyond any active references. + Keep(dirent *Dirent) bool +} + +// MountSourceOperations contains filesystem specific operations. +type MountSourceOperations interface { + // TODO: Add: + // + // StatFS() (Info, error) + // BlockSize() int64 + // FS() Filesystem + + // DirentOperations provide optional extra management of Dirents. + DirentOperations + + // Destroy destroys the MountSource. + Destroy() + + // Below are MountSourceOperations that do not conform to Linux. + + // ResetInodeMappings clears all mappings of Inodes before SaveInodeMapping + // is called. + ResetInodeMappings() + + // SaveInodeMappings is called during saving to store, for each reachable + // Inode in the mounted filesystem, a mapping of Inode.StableAttr.InodeID + // to the Inode's path relative to its mount point. If an Inode is + // reachable at more than one path due to hard links, it is unspecified + // which path is mapped. Filesystems that do not use this information to + // restore inodes can make SaveInodeMappings a no-op. + SaveInodeMapping(inode *Inode, path string) +} + +// InodeMappings defines a fmt.Stringer MountSource Inode mappings. +type InodeMappings map[uint64]string + +// String implements fmt.Stringer.String. +func (i InodeMappings) String() string { + var mappingsBuf bytes.Buffer + mappingsBuf.WriteString("\n") + for ino, name := range i { + mappingsBuf.WriteString(fmt.Sprintf("\t%q\t\tinode number %d\n", name, ino)) + } + return mappingsBuf.String() +} + +// MountSource represents a source of file objects. +// +// MountSource corresponds to struct super_block in Linux. +// +// A mount source may represent a physical device (or a partition of a physical +// device) or a virtual source of files such as procfs for a specific PID +// namespace. There should be only one mount source per logical device. E.g. +// there should be only procfs mount source for a given PID namespace. +// +// A mount source represents files as inodes. Every inode belongs to exactly +// one mount source. 
Each file object may only be represented using one inode +// object in a sentry instance. +// +// This is an amalgamation of structs super_block, vfsmount, and mount, while +// MountSourceOperations is akin to struct super_operations. +// +// Hence, mount source also contains common mounted file system state, such as +// mount flags, the root Dirent, and children mounts. For now, this +// amalgamation implies that a mount source cannot be shared by multiple mounts +// (e.g. cannot be mounted at different locations). +// +// TODO: Move mount-specific information out of MountSource. +type MountSource struct { + refs.AtomicRefCount + + // MountSourceOperations defines filesystem specific behavior. + MountSourceOperations + + // Filesystem is the filesystem backing the mount. Can be nil if there + // is no filesystem backing the mount. + Filesystem Filesystem + + // Flags are the flags that this filesystem was mounted with. + Flags MountSourceFlags + + // fscache keeps Dirents pinned beyond application references to them. + // It must be flushed before kernel.SaveTo. + fscache *DirentCache `state:"nosave"` + + // direntRefs is the sum of references on all Dirents in this MountSource. + // + // direntRefs is increased when a Dirent in MountSource is IncRef'd, and + // decreased when a Dirent in MountSource is DecRef'd. + // + // To cleanly unmount a MountSource, one must check that no direntRefs are + // held anymore. To check, one must hold root.parent.dirMu of the + // MountSource's root Dirent before reading direntRefs to prevent further + // walks to Dirents in this MountSource. + // + // direntRefs must be atomically changed. + direntRefs uint64 + + // mu protects the fields below, which are set by the MountNamespace + // during MountSource/Unmount. + mu sync.Mutex `state:"nosave"` + + // id is a unique id for this mount. + id uint64 + + // root is the root Dirent of this mount. + root *Dirent + + // parent is the parent MountSource, or nil if this MountSource is the root. + parent *MountSource + + // children are the child MountSources of this MountSource. + children map[*MountSource]struct{} +} + +// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra +// reference on. +const defaultDirentCacheSize uint64 = 1000 + +// NewMountSource returns a new MountSource. Filesystem may be nil if there is no +// filesystem backing the mount. +func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags MountSourceFlags) *MountSource { + return &MountSource{ + MountSourceOperations: mops, + Flags: flags, + Filesystem: filesystem, + fscache: NewDirentCache(defaultDirentCacheSize), + children: make(map[*MountSource]struct{}), + } +} + +// Parent returns the parent mount, or nil if this mount is the root. +func (msrc *MountSource) Parent() *MountSource { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.parent +} + +// ID returns the ID of this mount. +func (msrc *MountSource) ID() uint64 { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.id +} + +// Children returns the (immediate) children of this MountSource. +func (msrc *MountSource) Children() []*MountSource { + msrc.mu.Lock() + defer msrc.mu.Unlock() + + ms := make([]*MountSource, 0, len(msrc.children)) + for c := range msrc.children { + ms = append(ms, c) + } + return ms +} + +// Submounts returns all mounts that are descendants of this mount. 
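+//
+// The receiver itself is not included in the result. As an illustration (a
+// minimal sketch, assuming a root Dirent obtained from MountNamespace.Root),
+// all submounts of a namespace can be enumerated with:
+//
+//	for _, m := range rootDirent.Inode.MountSource.Submounts() {
+//		log.Infof("mount %d rooted at %v", m.ID(), m.Root())
+//	}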
+func (msrc *MountSource) Submounts() []*MountSource { + var ms []*MountSource + for _, c := range msrc.Children() { + ms = append(ms, c) + ms = append(ms, c.Submounts()...) + } + return ms +} + +// Root returns the root dirent of this mount. +func (msrc *MountSource) Root() *Dirent { + msrc.mu.Lock() + defer msrc.mu.Unlock() + return msrc.root +} + +// DirentRefs returns the current mount direntRefs. +func (msrc *MountSource) DirentRefs() uint64 { + return atomic.LoadUint64(&msrc.direntRefs) +} + +// IncDirentRefs increases direntRefs. +func (msrc *MountSource) IncDirentRefs() { + atomic.AddUint64(&msrc.direntRefs, 1) +} + +// DecDirentRefs decrements direntRefs. +func (msrc *MountSource) DecDirentRefs() { + if atomic.AddUint64(&msrc.direntRefs, ^uint64(0)) == ^uint64(0) { + panic("Decremented zero mount reference direntRefs") + } +} + +func (msrc *MountSource) destroy() { + if c := msrc.DirentRefs(); c != 0 { + panic(fmt.Sprintf("MountSource with non-zero direntRefs is being destroyed: %d", c)) + } + msrc.MountSourceOperations.Destroy() +} + +// DecRef drops a reference on the MountSource. +func (msrc *MountSource) DecRef() { + msrc.DecRefWithDestructor(msrc.destroy) +} + +// FlushDirentRefs drops all references held by the MountSource on Dirents. +func (msrc *MountSource) FlushDirentRefs() { + msrc.fscache.Invalidate() +} + +// NewCachingMountSource returns a generic mount that will cache dirents +// aggressively. Filesystem may be nil if there is no backing filesystem. +func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: true, + }, filesystem, flags) +} + +// NewNonCachingMountSource returns a generic mount that will never cache dirents. +// Filesystem may be nil if there is no backing filesystem. +func NewNonCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { + return NewMountSource(&SimpleMountSourceOperations{ + keep: false, + }, filesystem, flags) +} + +// SimpleMountSourceOperations implements MountSourceOperations. +type SimpleMountSourceOperations struct { + keep bool +} + +// Revalidate implements MountSourceOperations.Revalidate. +func (*SimpleMountSourceOperations) Revalidate(*Dirent) bool { + return false +} + +// Keep implements MountSourceOperations.Keep. +func (smo *SimpleMountSourceOperations) Keep(*Dirent) bool { + return smo.keep +} + +// ResetInodeMappings implements MountSourceOperations.ResetInodeMappings. +func (*SimpleMountSourceOperations) ResetInodeMappings() {} + +// SaveInodeMapping implements MountSourceOperations.SaveInodeMapping. +func (*SimpleMountSourceOperations) SaveInodeMapping(*Inode, string) {} + +// Destroy implements MountSourceOperations.Destroy. +func (*SimpleMountSourceOperations) Destroy() {} + +// Info defines attributes of a filesystem. +type Info struct { + // Type is the filesystem type magic value. + Type uint64 + + // TotalBlocks is the total data blocks in the filesystem. + TotalBlocks uint64 + + // FreeBlocks is the number of free blocks available. + FreeBlocks uint64 + + // TotalFiles is the total file nodes in the filesystem. + TotalFiles uint64 + + // FreeFiles is the number of free file nodes. + FreeFiles uint64 +} diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go new file mode 100644 index 000000000..16c25e46c --- /dev/null +++ b/pkg/sentry/fs/mount_overlay.go @@ -0,0 +1,95 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import "gvisor.googlesource.com/gvisor/pkg/sentry/context" + +// overlayMountSourceOperations implements MountSourceOperations for an overlay +// mount point. +type overlayMountSourceOperations struct { + upper *MountSource + lower *MountSource +} + +func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource { + upper.IncRef() + lower.IncRef() + return NewMountSource(&overlayMountSourceOperations{ + upper: upper, + lower: lower, + }, &overlayFilesystem{}, flags) +} + +// Revalidate panics if the upper or lower MountSource require that dirent be +// revalidated. Otherwise always returns false. +func (o *overlayMountSourceOperations) Revalidate(dirent *Dirent) bool { + if o.upper.Revalidate(dirent) || o.lower.Revalidate(dirent) { + panic("an overlay cannot revalidate file objects") + } + return false +} + +// Keep returns true if either upper or lower MountSource require that the +// dirent be kept in memory. +func (o *overlayMountSourceOperations) Keep(dirent *Dirent) bool { + return o.upper.Keep(dirent) || o.lower.Keep(dirent) +} + +// ResetInodeMappings propagates the call to both upper and lower MountSource. +func (o *overlayMountSourceOperations) ResetInodeMappings() { + o.upper.ResetInodeMappings() + o.lower.ResetInodeMappings() +} + +// SaveInodeMapping propagates the call to both upper and lower MountSource. +func (o *overlayMountSourceOperations) SaveInodeMapping(inode *Inode, path string) { + inode.overlay.copyMu.RLock() + defer inode.overlay.copyMu.RUnlock() + if inode.overlay.upper != nil { + o.upper.SaveInodeMapping(inode.overlay.upper, path) + } + if inode.overlay.lower != nil { + o.lower.SaveInodeMapping(inode.overlay.lower, path) + } +} + +// Destroy drops references on the upper and lower MountSource. +func (o *overlayMountSourceOperations) Destroy() { + o.upper.DecRef() + o.lower.DecRef() +} + +// type overlayFilesystem is the filesystem for overlay mounts. +type overlayFilesystem struct{} + +// Name implements Filesystem.Name. +func (ofs *overlayFilesystem) Name() string { + return "overlayfs" +} + +// Flags implements Filesystem.Flags. +func (ofs *overlayFilesystem) Flags() FilesystemFlags { + return 0 +} + +// AllowUserMount implements Filesystem.AllowUserMount. +func (ofs *overlayFilesystem) AllowUserMount() bool { + return false +} + +// Mount implements Filesystem.Mount. +func (ofs *overlayFilesystem) Mount(ctx context.Context, device string, flags MountSourceFlags, data string) (*Inode, error) { + panic("overlayFilesystem.Mount should not be called!") +} diff --git a/pkg/sentry/fs/mount_state.go b/pkg/sentry/fs/mount_state.go new file mode 100644 index 000000000..f5ed1dd8d --- /dev/null +++ b/pkg/sentry/fs/mount_state.go @@ -0,0 +1,25 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// afterLoad is invoked by stateify. +// +// Beyond the cache, this method's existence is required to ensure that this +// object is not marked "complete" until all dependent objects are also marked +// "complete". Implementations (e.g. see gofer_state.go) reach into the +// MountSourceOperations through this object, this is necessary on restore. +func (msrc *MountSource) afterLoad() { + msrc.fscache = NewDirentCache(defaultDirentCacheSize) +} diff --git a/pkg/sentry/fs/mount_test.go b/pkg/sentry/fs/mount_test.go new file mode 100644 index 000000000..3a053c154 --- /dev/null +++ b/pkg/sentry/fs/mount_test.go @@ -0,0 +1,216 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" +) + +// cacheReallyContains iterates through the dirent cache to determine whether +// it contains the given dirent. +func cacheReallyContains(cache *DirentCache, d *Dirent) bool { + for i := cache.list.Front(); i != nil; i = i.Next() { + if i == d { + return true + } + } + return false +} + +// TestMountSourceOnlyCachedOnce tests that a Dirent that is mounted over only ends +// up in a single Dirent Cache. NOTE: Having a dirent in multiple +// caches causes major consistency issues. +func TestMountSourceOnlyCachedOnce(t *testing.T) { + ctx := contexttest.Context(t) + + rootCache := NewDirentCache(100) + rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{ + Type: Directory, + }) + mm, err := NewMountNamespace(ctx, rootInode) + if err != nil { + t.Fatalf("NewMountNamespace failed: %v", err) + } + rootDirent := mm.Root() + defer rootDirent.DecRef() + + // Get a child of the root which we will mount over. Note that the + // MockInodeOperations causes Walk to always succeed. + child, err := rootDirent.Walk(ctx, rootDirent, "child") + if err != nil { + t.Fatalf("failed to walk to child dirent: %v", err) + } + child.maybeExtendReference() // Cache. + + // Ensure that the root cache contains the child. + if !cacheReallyContains(rootCache, child) { + t.Errorf("wanted rootCache to contain child dirent, but it did not") + } + + // Create a new cache and inode, and mount it over child. + submountCache := NewDirentCache(100) + submountInode := NewMockInode(ctx, NewMockMountSource(submountCache), StableAttr{ + Type: Directory, + }) + if err := mm.Mount(ctx, child, submountInode); err != nil { + t.Fatalf("failed to mount over child: %v", err) + } + + // Walk to the child again. 
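+	// Since "child" is now covered by the submount, the walk crosses the
+	// mount point and returns the submount's root Dirent rather than the
+	// original child Dirent.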
+	child2, err := rootDirent.Walk(ctx, rootDirent, "child")
+	if err != nil {
+		t.Fatalf("failed to walk to child dirent: %v", err)
+	}
+
+	// Should have a different Dirent than before.
+	if child == child2 {
+		t.Fatalf("expected %v not equal to %v, but they are the same", child, child2)
+	}
+
+	// Neither of the caches should contain the child.
+	if cacheReallyContains(rootCache, child) {
+		t.Errorf("wanted rootCache not to contain child dirent, but it did")
+	}
+	if cacheReallyContains(submountCache, child) {
+		t.Errorf("wanted submountCache not to contain child dirent, but it did")
+	}
+}
+
+// Test that mounts have proper parent/child relationships.
+func TestMountSourceParentChildRelationship(t *testing.T) {
+	ctx := contexttest.Context(t)
+
+	rootCache := NewDirentCache(100)
+	rootInode := NewMockInode(ctx, NewMockMountSource(rootCache), StableAttr{
+		Type: Directory,
+	})
+	mm, err := NewMountNamespace(ctx, rootInode)
+	if err != nil {
+		t.Fatalf("NewMountNamespace failed: %v", err)
+	}
+	rootDirent := mm.Root()
+	defer rootDirent.DecRef()
+
+	// Add mounts at the following paths:
+	paths := []string{
+		"/foo",
+		"/foo/bar",
+		"/foo/bar/baz",
+		"/foo/qux",
+		"/waldo",
+	}
+
+	for _, p := range paths {
+		d, err := mm.FindLink(ctx, rootDirent, nil, p, 0)
+		if err != nil {
+			t.Fatalf("could not find path %q in mount manager: %v", p, err)
+		}
+		submountInode := NewMockInode(ctx, NewMockMountSource(nil), StableAttr{
+			Type: Directory,
+		})
+		if err := mm.Mount(ctx, d, submountInode); err != nil {
+			t.Fatalf("could not mount at %q: %v", p, err)
+		}
+	}
+
+	// mm root should contain all submounts (and does not include the root
+	// mount).
+	allMountSources := rootDirent.Inode.MountSource.Submounts()
+	if err := mountPathsAre(rootDirent, allMountSources, paths...); err != nil {
+		t.Error(err)
+	}
+
+	// Each mount should have a unique ID.
+	foundIDs := make(map[uint64]struct{})
+	for _, m := range allMountSources {
+		id := m.ID()
+		if _, ok := foundIDs[id]; ok {
+			t.Errorf("got multiple mounts with id %d", id)
+		}
+		foundIDs[id] = struct{}{}
+	}
+
+	// Root mount should have no parent.
+	rootMountSource := mm.root.Inode.MountSource
+	if p := rootMountSource.Parent(); p != nil {
+		t.Errorf("root.Parent got %v wanted nil", p)
+	}
+
+	// Root mount should have 2 children: foo and waldo.
+	rootChildren := rootMountSource.Children()
+	if err := mountPathsAre(rootDirent, rootChildren, "/foo", "/waldo"); err != nil {
+		t.Error(err)
+	}
+	// All root mount children should have root as parent.
+	for _, c := range rootChildren {
+		if p := c.Parent(); p != rootMountSource {
+			t.Errorf("root mount child got parent %+v, wanted root mount", p)
+		}
+	}
+
+	// "foo" mount should have two children: /foo/bar, and /foo/qux.
+	d, err := mm.FindLink(ctx, rootDirent, nil, "/foo", 0)
+	if err != nil {
+		t.Fatalf("could not find path %q in mount manager: %v", "/foo", err)
+	}
+	fooMountSource := d.Inode.MountSource
+	fooMountSourceChildren := fooMountSource.Children()
+	if err := mountPathsAre(rootDirent, fooMountSourceChildren, "/foo/bar", "/foo/qux"); err != nil {
+		t.Error(err)
+	}
+	// Each child should have fooMountSource as parent.
+	for _, c := range fooMountSourceChildren {
+		if p := c.Parent(); p != fooMountSource {
+			t.Errorf("foo mount child got parent %+v, wanted foo mount", p)
+		}
+	}
+	// Submounts of foo are /foo/bar, /foo/qux, and /foo/bar/baz.
+	if err := mountPathsAre(rootDirent, fooMountSource.Submounts(), "/foo/bar", "/foo/qux", "/foo/bar/baz"); err != nil {
+		t.Error(err)
+	}
+
+	// "waldo" mount should have no submounts or children.
+	waldo, err := mm.FindLink(ctx, rootDirent, nil, "/waldo", 0)
+	if err != nil {
+		t.Fatalf("could not find path %q in mount manager: %v", "/waldo", err)
+	}
+	waldoMountSource := waldo.Inode.MountSource
+	if got := len(waldoMountSource.Children()); got != 0 {
+		t.Errorf("waldo got %d children, wanted 0", got)
+	}
+	if got := len(waldoMountSource.Submounts()); got != 0 {
+		t.Errorf("waldo got %d submounts, wanted 0", got)
+	}
+}
+
+func mountPathsAre(root *Dirent, got []*MountSource, want ...string) error {
+	if len(got) != len(want) {
+		return fmt.Errorf("mount paths have different lengths: got %d want %d", len(got), len(want))
+	}
+	gotPaths := make(map[string]struct{}, len(got))
+	for _, g := range got {
+		n, _ := g.Root().FullName(root)
+		gotPaths[n] = struct{}{}
+	}
+	for _, w := range want {
+		if _, ok := gotPaths[w]; !ok {
+			return fmt.Errorf("no mount with path %q found", w)
+		}
+	}
+	return nil
+} diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go new file mode 100644 index 000000000..1e6b5b70e --- /dev/null +++ b/pkg/sentry/fs/mounts.go @@ -0,0 +1,511 @@
+// Copyright 2018 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fs
+
+import (
+	"fmt"
+	"sync"
+	"syscall"
+
+	"gvisor.googlesource.com/gvisor/pkg/refs"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
+	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.googlesource.com/gvisor/pkg/syserror"
+)
+
+// DefaultTraversalLimit provides a sensible default traversal limit that may
+// be passed to FindInode and FindLink. You may want to provide other options in
+// individual syscall implementations, but for internal functions this will be
+// sane.
+const DefaultTraversalLimit = 10
+
+// MountNamespace defines a collection of mounts.
+type MountNamespace struct {
+	refs.AtomicRefCount
+
+	// userns is the user namespace associated with this mount namespace.
+	//
+	// All privileged operations on this mount namespace must have
+	// appropriate capabilities in this userns.
+	//
+	// userns is immutable.
+	userns *auth.UserNamespace
+
+	// root is the root directory.
+	root *Dirent
+
+	// mu protects mounts and mountID counter.
+	mu sync.Mutex `state:"nosave"`
+
+	// mounts is a map of the last mounted Dirent -> stack of old Dirents
+	// that were mounted over, with the oldest mounted Dirent first and
+	// more recent mounted Dirents at the end of the slice.
+	//
+	// A reference to all Dirents in mounts (keys and values) must be held
+	// to ensure the Dirents are recoverable when unmounting.
+	mounts map[*Dirent][]*Dirent
+
+	// mountID is the next mount id to assign.
+	mountID uint64
+}
+
+// NewMountNamespace returns a new MountNamespace, with the provided node at the
+// root. A root must always be provided.
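+//
+// Illustrative usage (a minimal sketch; rootInode stands for an Inode built
+// by some concrete filesystem, e.g. tmpfs or ramfs):
+//
+//	mns, err := NewMountNamespace(ctx, rootInode)
+//	if err != nil {
+//		return err
+//	}
+//	root := mns.Root()
+//	defer root.DecRef()
+//	d, err := mns.FindInode(ctx, root, nil, "/etc/hosts", DefaultTraversalLimit)
+//	if err == nil {
+//		defer d.DecRef() // FindInode returns a Dirent that the caller must DecRef.
+//	}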
+func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) { + creds := auth.CredentialsFromContext(ctx) + + root.MountSource.mu.Lock() + defer root.MountSource.mu.Unlock() + + // Set the root dirent and id on the root mount. + d := NewDirent(root, "/") + root.MountSource.root = d + root.MountSource.id = 1 + + return &MountNamespace{ + userns: creds.UserNamespace, + root: d, + mounts: make(map[*Dirent][]*Dirent), + mountID: 2, + }, nil +} + +// UserNamespace returns the user namespace associated with this mount manager. +func (mns *MountNamespace) UserNamespace() *auth.UserNamespace { + return mns.userns +} + +// Root returns the MountNamespace's root Dirent and increments its reference +// count. The caller must call DecRef when finished. +func (mns *MountNamespace) Root() *Dirent { + mns.root.IncRef() + return mns.root +} + +// FlushMountSourceRefs flushes extra references held by MountSources for all active mount points; +// see fs/mount.go:MountSource.FlushDirentRefs. +func (mns *MountNamespace) FlushMountSourceRefs() { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.flushMountSourceRefsLocked() +} + +func (mns *MountNamespace) flushMountSourceRefsLocked() { + // Flush mounts' MountSource references. + for current, stack := range mns.mounts { + current.Inode.MountSource.FlushDirentRefs() + for _, prev := range stack { + prev.Inode.MountSource.FlushDirentRefs() + } + } + + // Flush root's MountSource references. + mns.root.Inode.MountSource.FlushDirentRefs() +} + +// destroy drops root and mounts dirent references and closes any original nodes. +// +// After destroy is called, the MountNamespace may continue to be referenced (for +// example via /proc/mounts), but should free all resources and shouldn't have +// Find* methods called. +func (mns *MountNamespace) destroy() { + mns.mu.Lock() + defer mns.mu.Unlock() + + // Flush all mounts' MountSource references to Dirents. This allows for mount + // points to be torn down since there should be no remaining references after + // this and DecRef below. + mns.flushMountSourceRefsLocked() + + // Teardown mounts. + for current, mp := range mns.mounts { + // Drop the mount reference on all mounted dirents. + for _, d := range mp { + d.DecRef() + } + current.DecRef() + } + mns.mounts = nil + + // Drop reference on the root. + mns.root.DecRef() + + // Wait for asynchronous work (queued by dropping Dirent references + // above) to complete before destroying this MountNamespace. + AsyncBarrier() +} + +// DecRef implements RefCounter.DecRef with destructor mns.destroy. +func (mns *MountNamespace) DecRef() { + mns.DecRefWithDestructor(mns.destroy) +} + +// Freeze freezes the entire mount tree. +func (mns *MountNamespace) Freeze() { + mns.mu.Lock() + defer mns.mu.Unlock() + + // We only want to freeze Dirents with active references, not Dirents referenced + // by a mount's MountSource. + mns.flushMountSourceRefsLocked() + + // Freeze the entire shebang. + mns.root.Freeze() +} + +// withMountLocked prevents further walks to `node`, because `node` is about to +// be a mount point. +func (mns *MountNamespace) withMountLocked(node *Dirent, fn func() error) error { + mns.mu.Lock() + defer mns.mu.Unlock() + + renameMu.Lock() + defer renameMu.Unlock() + + // Linux allows mounting over the root (?). It comes with a strange set + // of semantics. We'll just not do this for now. 
+	if node.parent == nil {
+		return syserror.EBUSY
+	}
+
+	// For both mount and unmount, we take this lock so we can swap out the
+	// appropriate child in parent.children.
+	//
+	// For unmount, this also ensures that if `node` is a mount point, the
+	// underlying mount's MountSource.direntRefs cannot increase by preventing
+	// walks to node.
+	node.parent.dirMu.Lock()
+	defer node.parent.dirMu.Unlock()
+
+	node.parent.mu.Lock()
+	defer node.parent.mu.Unlock()
+
+	// We need not take node.dirMu since we have parent.dirMu.
+
+	// We need to take node.mu, so that we can check for deletion.
+	node.mu.Lock()
+	defer node.mu.Unlock()
+
+	return fn()
+}
+
+// Mount mounts an `inode` over the subtree at `node`.
+func (mns *MountNamespace) Mount(ctx context.Context, node *Dirent, inode *Inode) error {
+	return mns.withMountLocked(node, func() error {
+		// replacement already has one reference taken; this is the mount
+		// reference.
+		replacement, err := node.mount(ctx, inode)
+		if err != nil {
+			return err
+		}
+
+		// Set child/parent dirent relationship.
+		parentMountSource := node.Inode.MountSource
+		childMountSource := inode.MountSource
+		parentMountSource.mu.Lock()
+		defer parentMountSource.mu.Unlock()
+		childMountSource.mu.Lock()
+		defer childMountSource.mu.Unlock()
+
+		parentMountSource.children[childMountSource] = struct{}{}
+		childMountSource.parent = parentMountSource
+
+		// Set the mount's root dirent and id.
+		childMountSource.root = replacement
+		childMountSource.id = mns.mountID
+		mns.mountID++
+
+		// Drop node from its dirent cache.
+		node.dropExtendedReference()
+
+		// If node is already a mount point, push node on the stack so it can
+		// be recovered on unmount.
+		if stack, ok := mns.mounts[node]; ok {
+			mns.mounts[replacement] = append(stack, node)
+			delete(mns.mounts, node)
+			return nil
+		}
+
+		// Was not already mounted, just add another mount point.
+		// Take a reference on node so it can be recovered on unmount.
+		node.IncRef()
+		mns.mounts[replacement] = []*Dirent{node}
+		return nil
+	})
+}
+
+// Unmount ensures no references to the MountSource remain and removes `node` from
+// this subtree. The subtree formerly mounted in `node`'s place will be
+// restored. node's MountSource will be destroyed as soon as the last reference to
+// `node` is dropped, as no references to Dirents within will remain.
+//
+// If detachOnly is set, Unmount merely removes `node` from the subtree, but
+// allows existing references to the MountSource to remain. E.g. if an open file still
+// refers to Dirents in MountSource, the Unmount will succeed anyway and MountSource will
+// be destroyed at a later time when all references to Dirents within are
+// dropped.
+//
+// The caller must hold a reference to node from walking to it.
+func (mns *MountNamespace) Unmount(ctx context.Context, node *Dirent, detachOnly bool) error {
+	// This takes locks to prevent further walks to Dirents in this mount
+	// under the assumption that `node` is the root of the mount.
+	return mns.withMountLocked(node, func() error {
+		origs, ok := mns.mounts[node]
+		if !ok {
+			// node is not a mount point.
+			return syserror.EINVAL
+		}
+
+		if len(origs) == 0 {
+			panic("cannot unmount initial dirent")
+		}
+
+		if !detachOnly {
+			m := node.Inode.MountSource
+
+			// Lock the parent MountSource first, if it exists. We are
+			// holding mns.Lock, so the parent can not change out
+			// from under us.
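+			//
+			// Locking the parent's mu before the mount's own mu matches the
+			// order used in Mount above (parentMountSource.mu before
+			// childMountSource.mu), keeping the lock ordering consistent.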
+ parent := m.Parent() + if parent != nil { + parent.mu.Lock() + defer parent.mu.Unlock() + } + + // Lock the mount that is being unmounted. + m.mu.Lock() + defer m.mu.Unlock() + + if m.parent != nil { + // Sanity check. + if _, ok := m.parent.children[m]; !ok { + panic(fmt.Sprintf("mount %+v is not a child of parent %+v", m, m.parent)) + } + delete(m.parent.children, m) + m.parent = nil + } + + // Flush all references on the mounted node. + m.FlushDirentRefs() + + // At this point, exactly two references must be held + // to mount: one mount reference on node, and one due + // to walking to node. + // + // We must also be guaranteed that no more references + // can be taken on mount. This is why withMountLocked + // must be held at this point to prevent any walks to + // and from node. + if refs := m.DirentRefs(); refs < 2 { + panic(fmt.Sprintf("have %d refs on unmount, expect 2 or more", refs)) + } else if refs != 2 { + return syserror.EBUSY + } + } + + original := origs[len(origs)-1] + if err := node.unmount(ctx, original); err != nil { + return err + } + + switch { + case len(origs) > 1: + mns.mounts[original] = origs[:len(origs)-1] + case len(origs) == 1: + // Drop mount reference taken at the end of + // MountNamespace.Mount. + original.DecRef() + } + + delete(mns.mounts, node) + return nil + }) +} + +// FindLink returns an Dirent from a given node, which may be a symlink. +// +// The root argument is treated as the root directory, and FindLink will not +// return anything above that. The wd dirent provides the starting directory, +// and may be nil which indicates the root should be used. You must call DecRef +// on the resulting Dirent when you are no longer using the object. +// +// If wd is nil, then the root will be used as the working directory. If the +// path is absolute, this has no functional impact. +// +// Precondition: root must be non-nil. +// Precondition: the path must be non-empty. +func (mns *MountNamespace) FindLink(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + if root == nil { + panic("MountNamespace.FindInode: root must not be nil") + } + if len(path) == 0 { + panic("MountNamespace.FindInode: path is empty") + } + + // Split the path. + first, remainder := SplitFirst(path) + + // Where does this walk originate? + current := wd + if current == nil { + current = root + } + for first == "/" { + // Special case: it's possible that we have nothing to walk at + // all. This is necessary since we're resplitting the path. + if remainder == "" { + root.IncRef() + return root, nil + } + + // Start at the root and advance the path component so that the + // walk below can proceed. Note at this point, it handles the + // no-op walk case perfectly fine. + current = root + first, remainder = SplitFirst(remainder) + } + + current.IncRef() // Transferred during walk. + + for { + // Check that the file is a directory and that we have + // permissions to walk. + // + // Note that we elide this check for the root directory as an + // optimization; a non-executable root may still be walked. A + // non-directory root is hopeless. + if current != root { + if !IsDir(current.Inode.StableAttr) { + current.DecRef() // Drop reference from above. + return nil, syserror.ENOTDIR + } + if err := current.Inode.CheckPermission(ctx, PermMask{Execute: true}); err != nil { + current.DecRef() // Drop reference from above. + return nil, err + } + } + + // Move to the next level. 
+ next, err := current.Walk(ctx, root, first) + if err != nil { + // Allow failed walks to cache the dirent, because no + // children will acquire a reference at the end. + current.maybeExtendReference() + current.DecRef() + return nil, err + } + + // Drop old reference. + current.DecRef() + + if remainder != "" { + // Ensure it's resolved, unless it's the last level. + // + // See resolve for reference semantics; on err next + // will have one dropped. + current, err = mns.resolve(ctx, root, next, maxTraversals) + if err != nil { + return nil, err + } + } else { + // Allow the file system to take an extra reference on the + // found child. This will hold a reference on the containing + // directory, so the whole tree will be implicitly cached. + next.maybeExtendReference() + return next, nil + } + + // Move to the next element. + first, remainder = SplitFirst(remainder) + } +} + +// FindInode is identical to FindLink except the return value is resolved. +// +//go:nosplit +func (mns *MountNamespace) FindInode(ctx context.Context, root, wd *Dirent, path string, maxTraversals uint) (*Dirent, error) { + d, err := mns.FindLink(ctx, root, wd, path, maxTraversals) + if err != nil { + return nil, err + } + + // See resolve for reference semantics; on err d will have the + // reference dropped. + return mns.resolve(ctx, root, d, maxTraversals) +} + +// resolve resolves the given link. +// +// If successful, a reference is dropped on node and one is acquired on the +// caller's behalf for the returned dirent. +// +// If not successful, a reference is _also_ dropped on the node and an error +// returned. This is for convenience in using resolve directly as a return +// value. +func (mns *MountNamespace) resolve(ctx context.Context, root, node *Dirent, maxTraversals uint) (*Dirent, error) { + // Resolve the path. + target, err := node.Inode.Getlink(ctx) + + switch err { + case nil: + // Make sure we didn't exhaust the traversal budget. + if maxTraversals == 0 { + target.DecRef() + return nil, syscall.ELOOP + } + + node.DecRef() // Drop the original reference. + return target, nil + + case syscall.ENOLINK: + // Not a symlink. + return node, nil + + case ErrResolveViaReadlink: + defer node.DecRef() // See above. + + // First, check if we should traverse. + if maxTraversals == 0 { + return nil, syscall.ELOOP + } + + // Read the target path. + targetPath, err := node.Inode.Readlink(ctx) + if err != nil { + return nil, err + } + + // Find the node; we resolve relative to the current symlink's parent. + d, err := mns.FindInode(ctx, root, node.parent, targetPath, maxTraversals-1) + if err != nil { + return nil, err + } + + return d, err + + default: + node.DecRef() // Drop for err; see above. + + // Propagate the error. + return nil, err + } +} + +// SyncAll calls Dirent.SyncAll on the root. +func (mns *MountNamespace) SyncAll(ctx context.Context) { + mns.mu.Lock() + defer mns.mu.Unlock() + mns.root.SyncAll(ctx) +} diff --git a/pkg/sentry/fs/mounts_test.go b/pkg/sentry/fs/mounts_test.go new file mode 100644 index 000000000..8669f3a38 --- /dev/null +++ b/pkg/sentry/fs/mounts_test.go @@ -0,0 +1,102 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" +) + +// Creates a new MountNamespace with filesystem: +// / (root dir) +// |-foo (dir) +// |-bar (file) +func createMountNamespace(ctx context.Context) (*fs.MountNamespace, error) { + perms := fs.FilePermsFromMode(0777) + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + + barFile := ramfstest.NewFile(ctx, perms) + fooDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + "bar": fs.NewInode(barFile, m, fs.StableAttr{Type: fs.RegularFile}), + }, perms) + rootDir := ramfstest.NewDir(ctx, map[string]*fs.Inode{ + "foo": fs.NewInode(fooDir, m, fs.StableAttr{Type: fs.Directory}), + }, perms) + + return fs.NewMountNamespace(ctx, fs.NewInode(rootDir, m, fs.StableAttr{Type: fs.Directory})) +} + +func TestFindLink(t *testing.T) { + ctx := contexttest.Context(t) + mm, err := createMountNamespace(ctx) + if err != nil { + t.Fatalf("createMountNamespace failed: %v", err) + } + + root := mm.Root() + defer root.DecRef() + foo, err := root.Walk(ctx, root, "foo") + if err != nil { + t.Fatalf("Error walking to foo: %v", err) + } + + // Positive cases. + for _, tc := range []struct { + findPath string + wd *fs.Dirent + wantPath string + }{ + {".", root, "/"}, + {".", foo, "/foo"}, + {"..", foo, "/"}, + {"../../..", foo, "/"}, + {"///foo", foo, "/foo"}, + {"/foo", foo, "/foo"}, + {"/foo/bar", foo, "/foo/bar"}, + {"/foo/.///./bar", foo, "/foo/bar"}, + {"/foo///bar", foo, "/foo/bar"}, + {"/foo/../foo/bar", foo, "/foo/bar"}, + {"foo/bar", root, "/foo/bar"}, + {"foo////bar", root, "/foo/bar"}, + {"bar", foo, "/foo/bar"}, + } { + wdPath, _ := tc.wd.FullName(root) + if d, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err != nil { + t.Errorf("FindLink(%q, wd=%q) failed: %v", tc.findPath, wdPath, err) + } else if got, _ := d.FullName(root); got != tc.wantPath { + t.Errorf("FindLink(%q, wd=%q) got dirent %q, want %q", tc.findPath, wdPath, got, tc.wantPath) + } + } + + // Negative cases. + for _, tc := range []struct { + findPath string + wd *fs.Dirent + }{ + {"bar", root}, + {"/bar", root}, + {"/foo/../../bar", root}, + {"foo", foo}, + } { + wdPath, _ := tc.wd.FullName(root) + if _, err := mm.FindLink(ctx, root, tc.wd, tc.findPath, 0); err == nil { + t.Errorf("FindLink(%q, wd=%q) did not return error", tc.findPath, wdPath) + } + } +} diff --git a/pkg/sentry/fs/offset.go b/pkg/sentry/fs/offset.go new file mode 100644 index 000000000..7cc8398e6 --- /dev/null +++ b/pkg/sentry/fs/offset.go @@ -0,0 +1,65 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "math" + + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// OffsetPageEnd returns the file offset rounded up to the nearest +// page boundary. OffsetPageEnd panics if rounding up causes overflow, +// which shouldn't be possible given that offset is an int64. +func OffsetPageEnd(offset int64) uint64 { + end, ok := usermem.Addr(offset).RoundUp() + if !ok { + panic("impossible overflow") + } + return uint64(end) +} + +// ReadEndOffset returns an exclusive end offset for a read operation +// so that the read does not overflow an int64 nor size. +// +// Parameters: +// - offset: the starting offset of the read. +// - length: the number of bytes to read. +// - size: the size of the file. +// +// Postconditions: The returned offset is >= offset. +func ReadEndOffset(offset int64, length int64, size int64) int64 { + if offset >= size { + return offset + } + end := offset + length + // Don't overflow. + if end < offset || end > size { + end = size + } + return end +} + +// WriteEndOffset returns an exclusive end offset for a write operation +// so that the write does not overflow an int64. +// +// Parameters: +// - offset: the starting offset of the write. +// - length: the number of bytes to write. +// +// Postconditions: The returned offset is >= offset. +func WriteEndOffset(offset int64, length int64) int64 { + return ReadEndOffset(offset, length, math.MaxInt64) +} diff --git a/pkg/sentry/fs/overlay.go b/pkg/sentry/fs/overlay.go new file mode 100644 index 000000000..40eed3feb --- /dev/null +++ b/pkg/sentry/fs/overlay.go @@ -0,0 +1,268 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/log" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// The virtual filesystem implements an overlay configuration. For a high-level +// description, see README.md. +// +// Note on whiteouts: +// +// This implementation does not use the "Docker-style" whiteouts (symlinks with +// ".wh." prefix). Instead upper filesystem directories support a set of extended +// attributes to encode whiteouts: "trusted.overlay.whiteout.<filename>". This +// gives flexibility to persist whiteouts independently of the filesystem layout +// while additionally preventing name conflicts with files prefixed with ".wh.". 
+// +// Known deficiencies: +// +// - The device number of two files under the same overlay mount point may be +// different. This can happen if a file is found in the lower filesystem (takes +// the lower filesystem device) and another file is created in the upper +// filesystem (takes the upper filesystem device). This may appear odd but +// should not break applications. +// +// - Registered events on files (i.e. for notification of read/write readiness) +// are not copied across copy up. This is fine in the common case of files that +// do not block. For files that do block, like pipes and sockets, copy up is not +// supported. +// +// - Hardlinks in a lower filesystem are broken by copy up. For this reason, no +// attempt is made to preserve link count across copy up. +// +// - The maximum length of an extended attribute name is the same as the maximum +// length of a file path in Linux (XATTR_NAME_MAX == NAME_MAX). This means that +// whiteout attributes, if set directly on the host, are limited additionally by +// the extra whiteout prefix length (file paths must be strictly shorter than +// NAME_MAX). This is not a problem for in-memory filesystems which don't enforce +// XATTR_NAME_MAX. + +const ( + // XattrOverlayPrefix is the prefix for extended attributes that affect + // the behavior of an overlay. + XattrOverlayPrefix = "trusted.overlay." + + // XattrOverlayWhiteoutPrefix is the prefix for extended attributes + // that indicate that a whiteout exists. + XattrOverlayWhiteoutPrefix = XattrOverlayPrefix + "whiteout." +) + +// XattrOverlayWhiteout returns an extended attribute that indicates a +// whiteout exists for name. It is supported by directories that wish to +// mask the existence of name. +func XattrOverlayWhiteout(name string) string { + return XattrOverlayWhiteoutPrefix + name +} + +// NewOverlayRoot produces the root of an overlay. +// +// Preconditions: +// +// - upper and lower must be non-nil. +// - lower should not expose character devices, pipes, or sockets, because +// copying up these types of files is not supported. +// - upper and lower must not require that file objects be revalidated. +// - upper and lower must not have dynamic file/directory content. +func NewOverlayRoot(ctx context.Context, upper *Inode, lower *Inode, flags MountSourceFlags) (*Inode, error) { + if !IsDir(upper.StableAttr) { + return nil, fmt.Errorf("upper Inode is not a directory") + } + if !IsDir(lower.StableAttr) { + return nil, fmt.Errorf("lower Inode is not a directory") + } + + msrc := newOverlayMountSource(upper.MountSource, lower.MountSource, flags) + overlay, err := newOverlayEntry(ctx, upper, lower, true) + if err != nil { + msrc.DecRef() + return nil, err + } + + return newOverlayInode(ctx, overlay, msrc), nil +} + +// newOverlayInode creates a new Inode for an overlay. +func newOverlayInode(ctx context.Context, o *overlayEntry, msrc *MountSource) *Inode { + var inode *Inode + if o.upper != nil { + inode = NewInode(nil, msrc, o.upper.StableAttr) + } else { + inode = NewInode(nil, msrc, o.lower.StableAttr) + } + inode.overlay = o + return inode +} + +// overlayEntry is the overlay metadata of an Inode. It implements Mappable. +type overlayEntry struct { + // lowerExists is true if an Inode exists for this file in the lower + // filesystem. If lowerExists is true, then the overlay must create + // a whiteout entry when renaming and removing this entry to mask the + // lower Inode. + // + // Note that this is distinct from actually holding onto a non-nil + // lower Inode (below). 
The overlay does not need to keep a lower Inode
+	// around unless it needs to operate on it, but it always needs to know
+	// whether the lower Inode exists to correctly execute a rename or
+	// remove operation.
+	lowerExists bool
+
+	// lower is an Inode from a lower filesystem. Modifications are
+	// never made on this Inode.
+	lower *Inode
+
+	// copyMu serializes copy-up for operations above
+	// mm.MemoryManager.mappingMu in the lock order.
+	copyMu sync.RWMutex `state:"nosave"`
+
+	// mapsMu serializes copy-up for operations between
+	// mm.MemoryManager.mappingMu and mm.MemoryManager.activeMu in the lock
+	// order.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// mappings tracks memory mappings of this Mappable so they can be removed
+	// from the lower filesystem Mappable and added to the upper filesystem
+	// Mappable when copy up occurs. It is strictly unnecessary after copy-up.
+	//
+	// mappings is protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// dataMu serializes copy-up for operations below mm.MemoryManager.activeMu
+	// in the lock order.
+	dataMu sync.RWMutex `state:"nosave"`
+
+	// upper is an Inode from an upper filesystem. It is non-nil if
+	// the file exists in the upper filesystem. It becomes non-nil
+	// when the Inode that owns this overlayEntry is modified.
+	//
+	// upper is protected by all of copyMu, mapsMu, and dataMu. Holding any of
+	// these locks is sufficient to read upper; holding all three for writing
+	// is required to mutate it.
+	upper *Inode
+}
+
+// newOverlayEntry returns a new overlayEntry.
+func newOverlayEntry(ctx context.Context, upper *Inode, lower *Inode, lowerExists bool) (*overlayEntry, error) {
+	if upper == nil && lower == nil {
+		panic("invalid overlayEntry, needs at least one Inode")
+	}
+	if upper != nil && upper.overlay != nil {
+		panic("nested writable layers are not supported")
+	}
+	// Check for supported lower filesystem types.
+	if lower != nil {
+		switch lower.StableAttr.Type {
+		case RegularFile, Directory, Symlink, Socket:
+		default:
+			// We don't support copying up from character devices,
+			// named pipes, or anything weird (like proc files).
+			log.Warningf("%s not supported in lower filesystem", lower.StableAttr.Type)
+			return nil, syserror.EINVAL
+		}
+	}
+	return &overlayEntry{
+		lowerExists: lowerExists,
+		lower:       lower,
+		upper:       upper,
+	}, nil
+}
+
+func (o *overlayEntry) release() {
+	// We drop a reference on upper and lower file system Inodes
+	// rather than releasing them, because in-memory filesystems
+	// may hold an extra reference to these Inodes so that they
+	// stay in memory.
+	if o.upper != nil {
+		o.upper.DecRef()
+	}
+	if o.lower != nil {
+		o.lower.DecRef()
+	}
+}
+
+// overlayUpperMountSource gives the upper mount of an overlay mount.
+//
+// The caller may not use this MountSource past the lifetime of overlayMountSource and may
+// not call DecRef on it.
+func overlayUpperMountSource(overlayMountSource *MountSource) *MountSource {
+	return overlayMountSource.MountSourceOperations.(*overlayMountSourceOperations).upper
+}
+
+// Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked.
+func (o *overlayEntry) inodeLocked() *Inode {
+	if o.upper != nil {
+		return o.upper
+	}
+	return o.lower
+}
+
+// Preconditions: At least one of o.copyMu, o.mapsMu, or o.dataMu must be locked.
+func (o *overlayEntry) isMappableLocked() bool {
+	return o.inodeLocked().Mappable() != nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
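+//
+// This and the other memmap.Mappable methods below delegate to the Mappable
+// of whichever Inode currently backs the overlayEntry (upper if present,
+// otherwise lower); AddMapping and CopyMapping also record the mapping in
+// o.mappings so it can be moved to the upper Inode's Mappable when copy up
+// occurs.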
+func (o *overlayEntry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + if err := o.inodeLocked().Mappable().AddMapping(ctx, ms, ar, offset); err != nil { + return err + } + o.mappings.AddMapping(ms, ar, offset) + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (o *overlayEntry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + o.inodeLocked().Mappable().RemoveMapping(ctx, ms, ar, offset) + o.mappings.RemoveMapping(ms, ar, offset) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (o *overlayEntry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + if err := o.inodeLocked().Mappable().CopyMapping(ctx, ms, srcAR, dstAR, offset); err != nil { + return err + } + o.mappings.AddMapping(ms, dstAR, offset) + return nil +} + +// Translate implements memmap.Mappable.Translate. +func (o *overlayEntry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + o.dataMu.RLock() + defer o.dataMu.RUnlock() + return o.inodeLocked().Mappable().Translate(ctx, required, optional, at) +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. +func (o *overlayEntry) InvalidateUnsavable(ctx context.Context) error { + o.mapsMu.Lock() + defer o.mapsMu.Unlock() + return o.inodeLocked().Mappable().InvalidateUnsavable(ctx) +} diff --git a/pkg/sentry/fs/path.go b/pkg/sentry/fs/path.go new file mode 100644 index 000000000..b74f6ed8c --- /dev/null +++ b/pkg/sentry/fs/path.go @@ -0,0 +1,92 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// TrimTrailingSlashes trims any trailing slashes. +// +// The returned boolean indicates whether any changes were made. +// +//go:nosplit +func TrimTrailingSlashes(dir string) (trimmed string, changed bool) { + // Trim the trailing slash, except for root. + for len(dir) > 1 && dir[len(dir)-1] == '/' { + dir = dir[:len(dir)-1] + changed = true + } + return dir, changed +} + +// SplitLast splits the given path into a directory and a file. +// +// The "absoluteness" of the path is preserved, but dir is always stripped of +// trailing slashes. +// +//go:nosplit +func SplitLast(path string) (dir, file string) { + path, _ = TrimTrailingSlashes(path) + if path == "" { + return ".", "." + } else if path == "/" { + return "/", "." + } + + var slash int // Last location of slash in path. + for slash = len(path) - 1; slash >= 0 && path[slash] != '/'; slash-- { + } + switch { + case slash < 0: + return ".", path + case slash == 0: + // Directory of the form "/foo", or just "/". We need to + // preserve the first slash here, since it indicates an + // absolute path. 
+ return "/", path[1:] + default: + // Drop the trailing slash. + dir, _ = TrimTrailingSlashes(path[:slash]) + return dir, path[slash+1:] + } +} + +// SplitFirst splits the given path into a first directory and the remainder. +// +// If remainder is empty, then the path is a single element. +// +//go:nosplit +func SplitFirst(path string) (current, remainder string) { + path, _ = TrimTrailingSlashes(path) + if path == "" { + return ".", "" + } + + var slash int // First location of slash in path. + for slash = 0; slash < len(path) && path[slash] != '/'; slash++ { + } + switch { + case slash >= len(path): + return path, "" + case slash == 0: + // See above. + return "/", path[1:] + default: + current = path[:slash] + remainder = path[slash+1:] + // Strip redundant slashes. + for len(remainder) > 0 && remainder[0] == '/' { + remainder = remainder[1:] + } + return current, remainder + } +} diff --git a/pkg/sentry/fs/path_test.go b/pkg/sentry/fs/path_test.go new file mode 100644 index 000000000..7ab070855 --- /dev/null +++ b/pkg/sentry/fs/path_test.go @@ -0,0 +1,211 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "testing" +) + +// TestSplitLast tests variants of path splitting. +func TestSplitLast(t *testing.T) { + cases := []struct { + path string + dir string + file string + }{ + {path: "/", dir: "/", file: "."}, + {path: "/.", dir: "/", file: "."}, + {path: "/./", dir: "/", file: "."}, + {path: "/./.", dir: "/.", file: "."}, + {path: "/././", dir: "/.", file: "."}, + {path: "/./..", dir: "/.", file: ".."}, + {path: "/./../", dir: "/.", file: ".."}, + {path: "/..", dir: "/", file: ".."}, + {path: "/../", dir: "/", file: ".."}, + {path: "/../.", dir: "/..", file: "."}, + {path: "/.././", dir: "/..", file: "."}, + {path: "/../..", dir: "/..", file: ".."}, + {path: "/../../", dir: "/..", file: ".."}, + + {path: "", dir: ".", file: "."}, + {path: ".", dir: ".", file: "."}, + {path: "./", dir: ".", file: "."}, + {path: "./.", dir: ".", file: "."}, + {path: "././", dir: ".", file: "."}, + {path: "./..", dir: ".", file: ".."}, + {path: "./../", dir: ".", file: ".."}, + {path: "..", dir: ".", file: ".."}, + {path: "../", dir: ".", file: ".."}, + {path: "../.", dir: "..", file: "."}, + {path: ".././", dir: "..", file: "."}, + {path: "../..", dir: "..", file: ".."}, + {path: "../../", dir: "..", file: ".."}, + + {path: "/foo", dir: "/", file: "foo"}, + {path: "/foo/", dir: "/", file: "foo"}, + {path: "/foo/.", dir: "/foo", file: "."}, + {path: "/foo/./", dir: "/foo", file: "."}, + {path: "/foo/./.", dir: "/foo/.", file: "."}, + {path: "/foo/./..", dir: "/foo/.", file: ".."}, + {path: "/foo/..", dir: "/foo", file: ".."}, + {path: "/foo/../", dir: "/foo", file: ".."}, + {path: "/foo/../.", dir: "/foo/..", file: "."}, + {path: "/foo/../..", dir: "/foo/..", file: ".."}, + + {path: "/foo/bar", dir: "/foo", file: "bar"}, + {path: "/foo/bar/", dir: "/foo", file: "bar"}, + {path: "/foo/bar/.", dir: "/foo/bar", file: "."}, + {path: 
"/foo/bar/./", dir: "/foo/bar", file: "."}, + {path: "/foo/bar/./.", dir: "/foo/bar/.", file: "."}, + {path: "/foo/bar/./..", dir: "/foo/bar/.", file: ".."}, + {path: "/foo/bar/..", dir: "/foo/bar", file: ".."}, + {path: "/foo/bar/../", dir: "/foo/bar", file: ".."}, + {path: "/foo/bar/../.", dir: "/foo/bar/..", file: "."}, + {path: "/foo/bar/../..", dir: "/foo/bar/..", file: ".."}, + + {path: "foo", dir: ".", file: "foo"}, + {path: "foo", dir: ".", file: "foo"}, + {path: "foo/", dir: ".", file: "foo"}, + {path: "foo/.", dir: "foo", file: "."}, + {path: "foo/./", dir: "foo", file: "."}, + {path: "foo/./.", dir: "foo/.", file: "."}, + {path: "foo/./..", dir: "foo/.", file: ".."}, + {path: "foo/..", dir: "foo", file: ".."}, + {path: "foo/../", dir: "foo", file: ".."}, + {path: "foo/../.", dir: "foo/..", file: "."}, + {path: "foo/../..", dir: "foo/..", file: ".."}, + {path: "foo/", dir: ".", file: "foo"}, + {path: "foo/.", dir: "foo", file: "."}, + + {path: "foo/bar", dir: "foo", file: "bar"}, + {path: "foo/bar/", dir: "foo", file: "bar"}, + {path: "foo/bar/.", dir: "foo/bar", file: "."}, + {path: "foo/bar/./", dir: "foo/bar", file: "."}, + {path: "foo/bar/./.", dir: "foo/bar/.", file: "."}, + {path: "foo/bar/./..", dir: "foo/bar/.", file: ".."}, + {path: "foo/bar/..", dir: "foo/bar", file: ".."}, + {path: "foo/bar/../", dir: "foo/bar", file: ".."}, + {path: "foo/bar/../.", dir: "foo/bar/..", file: "."}, + {path: "foo/bar/../..", dir: "foo/bar/..", file: ".."}, + {path: "foo/bar/", dir: "foo", file: "bar"}, + {path: "foo/bar/.", dir: "foo/bar", file: "."}, + } + + for _, c := range cases { + dir, file := SplitLast(c.path) + if dir != c.dir || file != c.file { + t.Errorf("SplitLast(%q) got (%q, %q), expected (%q, %q)", c.path, dir, file, c.dir, c.file) + } + } +} + +// TestSplitFirst tests variants of path splitting. 
+func TestSplitFirst(t *testing.T) { + cases := []struct { + path string + first string + remainder string + }{ + {path: "/", first: "/", remainder: ""}, + {path: "/.", first: "/", remainder: "."}, + {path: "///.", first: "/", remainder: "//."}, + {path: "/.///", first: "/", remainder: "."}, + {path: "/./.", first: "/", remainder: "./."}, + {path: "/././", first: "/", remainder: "./."}, + {path: "/./..", first: "/", remainder: "./.."}, + {path: "/./../", first: "/", remainder: "./.."}, + {path: "/..", first: "/", remainder: ".."}, + {path: "/../", first: "/", remainder: ".."}, + {path: "/../.", first: "/", remainder: "../."}, + {path: "/.././", first: "/", remainder: "../."}, + {path: "/../..", first: "/", remainder: "../.."}, + {path: "/../../", first: "/", remainder: "../.."}, + + {path: "", first: ".", remainder: ""}, + {path: ".", first: ".", remainder: ""}, + {path: "./", first: ".", remainder: ""}, + {path: ".///", first: ".", remainder: ""}, + {path: "./.", first: ".", remainder: "."}, + {path: "././", first: ".", remainder: "."}, + {path: "./..", first: ".", remainder: ".."}, + {path: "./../", first: ".", remainder: ".."}, + {path: "..", first: "..", remainder: ""}, + {path: "../", first: "..", remainder: ""}, + {path: "../.", first: "..", remainder: "."}, + {path: ".././", first: "..", remainder: "."}, + {path: "../..", first: "..", remainder: ".."}, + {path: "../../", first: "..", remainder: ".."}, + + {path: "/foo", first: "/", remainder: "foo"}, + {path: "/foo/", first: "/", remainder: "foo"}, + {path: "/foo///", first: "/", remainder: "foo"}, + {path: "/foo/.", first: "/", remainder: "foo/."}, + {path: "/foo/./", first: "/", remainder: "foo/."}, + {path: "/foo/./.", first: "/", remainder: "foo/./."}, + {path: "/foo/./..", first: "/", remainder: "foo/./.."}, + {path: "/foo/..", first: "/", remainder: "foo/.."}, + {path: "/foo/../", first: "/", remainder: "foo/.."}, + {path: "/foo/../.", first: "/", remainder: "foo/../."}, + {path: "/foo/../..", first: "/", remainder: "foo/../.."}, + + {path: "/foo/bar", first: "/", remainder: "foo/bar"}, + {path: "///foo/bar", first: "/", remainder: "//foo/bar"}, + {path: "/foo///bar", first: "/", remainder: "foo///bar"}, + {path: "/foo/bar/.", first: "/", remainder: "foo/bar/."}, + {path: "/foo/bar/./", first: "/", remainder: "foo/bar/."}, + {path: "/foo/bar/./.", first: "/", remainder: "foo/bar/./."}, + {path: "/foo/bar/./..", first: "/", remainder: "foo/bar/./.."}, + {path: "/foo/bar/..", first: "/", remainder: "foo/bar/.."}, + {path: "/foo/bar/../", first: "/", remainder: "foo/bar/.."}, + {path: "/foo/bar/../.", first: "/", remainder: "foo/bar/../."}, + {path: "/foo/bar/../..", first: "/", remainder: "foo/bar/../.."}, + + {path: "foo", first: "foo", remainder: ""}, + {path: "foo", first: "foo", remainder: ""}, + {path: "foo/", first: "foo", remainder: ""}, + {path: "foo///", first: "foo", remainder: ""}, + {path: "foo/.", first: "foo", remainder: "."}, + {path: "foo/./", first: "foo", remainder: "."}, + {path: "foo/./.", first: "foo", remainder: "./."}, + {path: "foo/./..", first: "foo", remainder: "./.."}, + {path: "foo/..", first: "foo", remainder: ".."}, + {path: "foo/../", first: "foo", remainder: ".."}, + {path: "foo/../.", first: "foo", remainder: "../."}, + {path: "foo/../..", first: "foo", remainder: "../.."}, + {path: "foo/", first: "foo", remainder: ""}, + {path: "foo/.", first: "foo", remainder: "."}, + + {path: "foo/bar", first: "foo", remainder: "bar"}, + {path: "foo///bar", first: "foo", remainder: "bar"}, + {path: "foo/bar/", 
first: "foo", remainder: "bar"}, + {path: "foo/bar/.", first: "foo", remainder: "bar/."}, + {path: "foo/bar/./", first: "foo", remainder: "bar/."}, + {path: "foo/bar/./.", first: "foo", remainder: "bar/./."}, + {path: "foo/bar/./..", first: "foo", remainder: "bar/./.."}, + {path: "foo/bar/..", first: "foo", remainder: "bar/.."}, + {path: "foo/bar/../", first: "foo", remainder: "bar/.."}, + {path: "foo/bar/../.", first: "foo", remainder: "bar/../."}, + {path: "foo/bar/../..", first: "foo", remainder: "bar/../.."}, + {path: "foo/bar/", first: "foo", remainder: "bar"}, + {path: "foo/bar/.", first: "foo", remainder: "bar/."}, + } + + for _, c := range cases { + first, remainder := SplitFirst(c.path) + if first != c.first || remainder != c.remainder { + t.Errorf("SplitFirst(%q) got (%q, %q), expected (%q, %q)", c.path, first, remainder, c.first, c.remainder) + } + } +} diff --git a/pkg/sentry/fs/proc/BUILD b/pkg/sentry/fs/proc/BUILD new file mode 100644 index 000000000..18372cfbf --- /dev/null +++ b/pkg/sentry/fs/proc/BUILD @@ -0,0 +1,95 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "proc_state", + srcs = [ + "cpuinfo.go", + "exec_args.go", + "fds.go", + "file.go", + "filesystems.go", + "fs.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "stat.go", + "sys.go", + "sys_net.go", + "task.go", + "uid_gid_map.go", + "uptime.go", + "version.go", + ], + out = "proc_state.go", + package = "proc", +) + +go_library( + name = "proc", + srcs = [ + "cpuinfo.go", + "exec_args.go", + "fds.go", + "file.go", + "filesystems.go", + "fs.go", + "loadavg.go", + "meminfo.go", + "mounts.go", + "net.go", + "proc.go", + "proc_state.go", + "stat.go", + "sys.go", + "sys_net.go", + "task.go", + "uid_gid_map.go", + "uptime.go", + "version.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/amutex", + "//pkg/log", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/device", + "//pkg/sentry/fs/proc/seqfile", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/inet", + "//pkg/sentry/kernel", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/kdefs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/mm", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserr", + "//pkg/syserror", + ], +) + +go_test( + name = "proc_test", + size = "small", + srcs = [ + "net_test.go", + "sys_net_test.go", + ], + embed = [":proc"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context", + "//pkg/sentry/inet", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/proc/README.md b/pkg/sentry/fs/proc/README.md new file mode 100644 index 000000000..c510ee63a --- /dev/null +++ b/pkg/sentry/fs/proc/README.md @@ -0,0 +1,317 @@ +This document tracks what is implemented in procfs. Refer to +Documentation/filesystems/proc.txt in the Linux project for information about +procfs generally. + +**NOTE**: This document is not guaranteed to be up to date. If you find an +inconsistency, please file a bug. 
+ +[TOC] +## Kernel data + +The following files are implemented: + +| File /proc/ | Content | +| :------------------------ | :----------------------------------------------- | +| [cpuinfo](#cpuinfo) | Info about the CPU | +| [filesystem](#filesystem) | Supported filesystems | +| [loadavg](#loadavg) | Load average of last 1, 5 & 15 minutes | +| [meminfo](#meminfo) | Overall memory info | +| [stat](#stat) | Overall kernel statistics | +| [sys](#sys) | Change parameters within the kernel | +| [uptime](#uptime) | Wall clock since boot, combined idle time of all | +: : cpus : +| [version](#version) | Kernel version | + +### cpuinfo + +```bash +$ cat /proc/cpuinfo +processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 45 +model name : unknown +stepping : unknown +cpu MHz : 1234.588 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx xsaveopt +bogomips : 1234.59 +clflush size : 64 +cache_alignment : 64 +address sizes : 46 bits physical, 48 bits virtual +power management: + +... +``` + +Notable divergences: + +Field name | Notes +:--------------- | :--------------------------------------- +model name | Always unknown +stepping | Always unknown +fpu | Always yes +fpu_exception | Always yes +wp | Always yes +bogomips | Bogus value (matches cpu MHz) +clflush size | Always 64 +cache_alignment | Always 64 +address sizes | Always 46 bits physical, 48 bits virtual +power management | Always blank + +Otherwise fields are derived from the SentryCPUIDSpec proto config. + +### filesystem + +```bash +$ cat /proc/filesystem +nodev 9p +nodev devtmpfs +nodev proc +nodev ramdiskfs +nodev sysfs +nodev tmpfs +``` + +Notable divergences: + +Filesystem | Notes +:--------- | :-------------------------------------------------------- +ramdiskfs | No Linux equivalent, see the SentryRamdiskFS proto config + +### loadavg + +```bash +$ cat /proc/loadavg +0.00 0.00 0.00 0/0 0 +``` + +Column | Notes +:------------------------------------ | :---------- +CPU.IO utilization in last 1 minute | Always zero +CPU.IO utilization in last 5 minutes | Always zero +CPU.IO utilization in last 10 minutes | Always zero +Num currently running processes | Always zero +Total num processes | Always zero + +TODO: Populate the columns with accurate statistics. +### meminfo + +```bash +$ cat /proc/meminfo +MemTotal: 2097152 kB +MemFree: 2083540 kB +MemAvailable: 2083540 kB +Buffers: 0 kB +Cached: 4428 kB +SwapCache: 0 kB +Active: 10812 kB +Inactive: 2216 kB +Active(anon): 8600 kB +Inactive(anon): 0 kB +Active(file): 2212 kB +Inactive(file): 2216 kB +Unevictable: 0 kB +Mlocked: 0 kB +SwapTotal: 0 kB +SwapFree: 0 kB +Dirty: 0 kB +Writeback: 0 kB +AnonPages: 8600 kB +Mapped: 4428 kB +Shmem: 0 kB + +``` + +Notable divergences: + +Field name | Notes +:---------------- | :-------------------------------------------------------- +Buffers | Always zero, no block devices +SwapCache | Always zero, no swap +Inactive(anon) | Always zero, see SwapCache +Unevictable | Always zero TODO +Mlocked | Always zero TODO +SwapTotal | Always zero, no swap +SwapFree | Always zero, no swap +Dirty | Always zero TODO +Writeback | Always zero TODO +MemAvailable | Uses the same value as MemFree since there is no swap. 
+Slab | Missing +SReclaimable | Missing +SUnreclaim | Missing +KernelStack | Missing +PageTables | Missing +NFS_Unstable | Missing +Bounce | Missing +WritebackTmp | Missing +CommitLimit | Missing +Committed_AS | Missing +VmallocTotal | Missing +VmallocUsed | Missing +VmallocChunk | Missing +HardwareCorrupted | Missing +AnonHugePages | Missing +ShmemHugePages | Missing +ShmemPmdMapped | Missing +HugePages_Total | Missing +HugePages_Free | Missing +HugePages_Rsvd | Missing +HugePages_Surp | Missing +Hugepagesize | Missing +DirectMap4k | Missing +DirectMap2M | Missing +DirectMap1G | Missing + +See [Memory +Accounting](pkg/sentry/usage/g3doc/memory-accounting.md) +for general caveats. + +### stat + +```bash +$ cat /proc/stat +cpu 0 0 0 0 0 0 0 0 0 0 +cpu0 0 0 0 0 0 0 0 0 0 0 +cpu1 0 0 0 0 0 0 0 0 0 0 +cpu2 0 0 0 0 0 0 0 0 0 0 +cpu3 0 0 0 0 0 0 0 0 0 0 +cpu4 0 0 0 0 0 0 0 0 0 0 +cpu5 0 0 0 0 0 0 0 0 0 0 +cpu6 0 0 0 0 0 0 0 0 0 0 +cpu7 0 0 0 0 0 0 0 0 0 0 +intr 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +ctxt 0 +btime 1504040968 +processes 0 +procs_running 0 +procs_blokkcked 0 +softirq 0 0 0 0 0 0 0 0 0 0 0 +``` + +All fields except for `btime` are always zero. +TODO: Populate with accurate fields. + +### sys + +```bash +$ ls /proc/sys +kernel vm +``` + +Directory | Notes +:-------- | :---------------------------- +abi | Missing +debug | Missing +dev | Missing +fs | Missing +kernel | Contains hostname (only) +net | Missing +user | Missing +vm | Contains mmap_min_addr (only) + +### uptime + +```bash +$ cat /proc/uptime +3204.62 0.00 +``` + +Column | Notes +:------------------------------- | :---------------------------- +Total num seconds system running | Time since procfs was mounted +Number of seconds idle | Always zero + +### version + +```bash +$ cat /proc/version +Linux version 3.11.10 #1 SMP Fri Nov 29 10:47:50 PST 2013 +``` + +## Process-specific data + +The following files are implemented: + +File /proc/PID | Content +:------------------ | :--------------------------------------------------- +[auxv](#auxv) | Copy of auxiliary vector for the process +[cmdline](#cmdline) | Command line arguments +[comm](#comm) | Command name associated with the process +[exe](#exe) | Symlink to the process's executable +[fd](#fd) | Directory containing links to open file descriptors +[fdinfo](#fdinfo) | Information associated with open file descriptors +[gid_map](#gid_map) | Mappings for group IDs inside the user namespace +[io](#io) | IO statistics +[maps](#maps) | Memory mappings (anon, executables, library files) +[ns](#ns) | Directory containing info about supported namespaces +[stat](#stat) | Process statistics +[status](#status) | Process status in human readable format +[task](#task) | Directory containing info about running threads +[uid_map](#uid_map) | Mappings for user IDs inside the user namespace + +### auxv + +TODO + +### cmdline + +TODO + +### comm + +TODO + +### exe + +TODO + +### fd + +TODO + +### fdinfo + +TODO + +### gid_map + +TODO + +### io + +Only has data for rchar, wchar, syscr, and syscw. + +TODO: add more detail. 
+ +### maps + +TODO + +### ns + +TODO + +### stat + +Only has data for pid, comm, state, ppid, utime, stime, cutime, cstime, +num_threads, and exit_signal. + +TODO: add more detail. + +### status + +Statically created, most of the fields have no data. + +TODO: add more detail. + +### task + +TODO + +### uid_map + +TODO diff --git a/pkg/sentry/fs/proc/cpuinfo.go b/pkg/sentry/fs/proc/cpuinfo.go new file mode 100644 index 000000000..f80aaa5b1 --- /dev/null +++ b/pkg/sentry/fs/proc/cpuinfo.go @@ -0,0 +1,64 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// cpuinfo is a file describing the CPU capabilities. +// +// Presently cpuinfo never changes, so it doesn't need to be a SeqFile. +type cpuinfo struct { + ramfs.Entry + + // k is the system kernel. + k *kernel.Kernel +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (c *cpuinfo) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + features := c.k.FeatureSet() + if features == nil { + // Kernel is always initialized with a FeatureSet. + panic("cpuinfo read with nil FeatureSet") + } + + contents := make([]byte, 0, 1024) + for i, max := uint(0), c.k.ApplicationCores(); i < max; i++ { + contents = append(contents, []byte(features.CPUInfo(i))...) + } + if offset >= int64(len(contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err +} + +func (p *proc) newCPUInfo(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + f := &cpuinfo{ + k: p.k, + } + f.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + + return newFile(f, msrc, fs.SpecialFile, nil) +} diff --git a/pkg/sentry/fs/proc/device/BUILD b/pkg/sentry/fs/proc/device/BUILD new file mode 100644 index 000000000..b62062bd7 --- /dev/null +++ b/pkg/sentry/fs/proc/device/BUILD @@ -0,0 +1,11 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "device", + srcs = ["device.go"], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device", + visibility = ["//pkg/sentry:internal"], + deps = ["//pkg/sentry/device"], +) diff --git a/pkg/sentry/fs/proc/device/device.go b/pkg/sentry/fs/proc/device/device.go new file mode 100644 index 000000000..6194afe88 --- /dev/null +++ b/pkg/sentry/fs/proc/device/device.go @@ -0,0 +1,23 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package device contains the proc device to avoid dependency loops. +package device + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/device" +) + +// ProcDevice is the kernel proc device. +var ProcDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/proc/exec_args.go b/pkg/sentry/fs/proc/exec_args.go new file mode 100644 index 000000000..0e1523bf1 --- /dev/null +++ b/pkg/sentry/fs/proc/exec_args.go @@ -0,0 +1,129 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// execArgType enumerates the types of exec arguments that are exposed through +// proc. +type execArgType int + +const ( + cmdlineExecArg execArgType = iota + environExecArg +) + +// execArgFile is a file containing the exec args (either cmdline or environ) +// for a given task. +type execArgFile struct { + ramfs.Entry + + // arg is the type of exec argument this file contains. + arg execArgType + + // t is the Task to read the exec arg line from. + t *kernel.Task +} + +// newExecArgFile creates a file containing the exec args of the given type. +func newExecArgFile(t *kernel.Task, msrc *fs.MountSource, arg execArgType) *fs.Inode { + if arg != cmdlineExecArg && arg != environExecArg { + panic(fmt.Sprintf("unknown exec arg type %v", arg)) + } + f := &execArgFile{ + arg: arg, + t: t, + } + f.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(f, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the exec arg from the process's address space.. +func (f *execArgFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + // N.B. Linux 4.2 eliminates the arbitrary one page limit. + if offset > usermem.PageSize { + return 0, io.EOF + } + dst = dst.TakeFirst64(usermem.PageSize - offset) + + m, err := getTaskMM(f.t) + if err != nil { + return 0, err + } + defer m.DecUsers(ctx) + + // Figure out the bounds of the exec arg we are trying to read. 
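+	// The MemoryManager records the argv and envv bounds when the binary is
+	// loaded at exec time; the bytes themselves are then copied out of the
+	// task's live address space, so the result reflects any modifications the
+	// application has since made to that memory.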
+ var execArgStart, execArgEnd usermem.Addr + switch f.arg { + case cmdlineExecArg: + execArgStart, execArgEnd = m.ArgvStart(), m.ArgvEnd() + case environExecArg: + execArgStart, execArgEnd = m.EnvvStart(), m.EnvvEnd() + default: + panic(fmt.Sprintf("unknown exec arg type %v", f.arg)) + } + if execArgStart == 0 || execArgEnd == 0 { + // Don't attempt to read before the start/end are set up. + return 0, io.EOF + } + + start, ok := execArgStart.AddLength(uint64(offset)) + if !ok { + return 0, io.EOF + } + if start >= execArgEnd { + return 0, io.EOF + } + + length := int(execArgEnd - start) + if dstlen := dst.NumBytes(); int64(length) > dstlen { + length = int(dstlen) + } + + buf := make([]byte, length) + // N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true + // until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading + // cmdline and environment"). + copyN, copyErr := m.CopyIn(ctx, start, buf, usermem.IOOpts{}) + if copyN == 0 { + // Nothing to copy. + return 0, copyErr + } + buf = buf[:copyN] + + // TODO: On Linux, if the NUL byte at the end of the + // argument vector has been overwritten, it continues reading the + // environment vector as part of the argument vector. + + n, dstErr := dst.CopyOut(ctx, buf) + if dstErr != nil { + return int64(n), dstErr + } + return int64(n), copyErr +} diff --git a/pkg/sentry/fs/proc/fds.go b/pkg/sentry/fs/proc/fds.go new file mode 100644 index 000000000..2eca9ac31 --- /dev/null +++ b/pkg/sentry/fs/proc/fds.go @@ -0,0 +1,258 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "sort" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// walkDescriptors finds the descriptor (file-flag pair) for the fd identified +// by p, and calls the toInodeOperations callback with that descriptor. This is a helper +// method for implementing fs.InodeOperations.Lookup. +func walkDescriptors(t *kernel.Task, p string, toInode func(*fs.File, kernel.FDFlags) *fs.Inode) (*fs.Inode, error) { + n, err := strconv.ParseUint(p, 10, 64) + if err != nil { + // Not found. + return nil, syserror.ENOENT + } + + var file *fs.File + var flags kernel.FDFlags + t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + file, flags = fdm.GetDescriptor(kdefs.FD(n)) + } + }) + if file == nil { + return nil, syserror.ENOENT + } + return toInode(file, flags), nil +} + +// readDescriptors reads fds in the task starting at offset, and calls the +// toDentAttr callback for each to get a DentAttr, which it then emits. 
This is +// a helper for implementing fs.InodeOperations.Readdir. +func readDescriptors(t *kernel.Task, c *fs.DirCtx, offset int, toDentAttr func(int) fs.DentAttr) (int, error) { + var fds kernel.FDs + t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + fds = fdm.GetFDs() + } + }) + + fdInts := make([]int, 0, len(fds)) + for _, fd := range fds { + fdInts = append(fdInts, int(fd)) + } + + // Find the fd to start at. + idx := sort.SearchInts(fdInts, offset) + if idx == len(fdInts) { + return offset, nil + } + fdInts = fdInts[idx:] + + var fd int + for _, fd = range fdInts { + name := strconv.FormatUint(uint64(fd), 10) + if err := c.DirEmit(name, toDentAttr(fd)); err != nil { + // Returned offset is the next fd to serialize. + return fd, err + } + } + // We serialized them all. Next offset should be higher than last + // serialized fd. + return fd + 1, nil +} + +// fd is a single file in /proc/TID/fd/. +type fd struct { + ramfs.Symlink + *fs.File +} + +// newFD returns a new fd based on an existing file. +// +// This inherits one reference to the file. +func newFd(t *kernel.Task, f *fs.File, msrc *fs.MountSource) *fs.Inode { + fd := &fd{File: f} + // RootOwner by default, is overridden in UnstableAttr() + fd.InitSymlink(t, fs.RootOwner, "") + return newFile(fd, msrc, fs.Symlink, t) +} + +// GetFile returns the fs.File backing this fd. The dirent and flags +// arguments are ignored. +func (f *fd) GetFile(context.Context, *fs.Dirent, fs.FileFlags) (*fs.File, error) { + // Take a reference on the fs.File. + f.File.IncRef() + return f.File, nil +} + +// Readlink returns the current target. +func (f *fd) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + n, _ := f.Dirent.FullName(root) + return n, nil +} + +// Getlink implements fs.InodeOperations.Getlink. +func (f *fd) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + f.Dirent.IncRef() + return f.Dirent, nil +} + +// Truncate is ignored. +func (f *fd) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// Close releases the reference on the file. +func (f *fd) Close() error { + f.DecRef() + return nil +} + +// fdDir implements /proc/TID/fd. +type fdDir struct { + ramfs.Dir + + // We hold a reference on the task's fdmap but only keep an indirect + // task pointer to avoid Dirent loading circularity caused by fdmap's + // potential back pointers into the dirent tree. + t *kernel.Task +} + +// newFdDir creates a new fdDir. +func newFdDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + f := &fdDir{t: t} + f.InitDir(t, nil, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true, Execute: true}}) + return newFile(f, msrc, fs.SpecialDirectory, t) +} + +// Check implements InodeOperations.Check. +// +// This is to match Linux, which uses a special permission handler to guarantee +// that a process can still access /proc/self/fd after it has executed +// setuid. See fs/proc/fd.c:proc_fd_permission. +func (f *fdDir) Check(ctx context.Context, inode *fs.Inode, req fs.PermMask) bool { + if fs.ContextCanAccessFile(ctx, inode, req) { + return true + } + if t := kernel.TaskFromContext(ctx); t != nil { + // Allow access if the task trying to access it is in the + // thread group corresponding to this directory. + // + // N.B. Technically, in Linux 3.11, this compares what would be + // the equivalent of task pointers. 
However, this was fixed + // later in 54708d2858e7 ("proc: actually make + // proc_fd_permission() thread-friendly"). + if f.t.ThreadGroup() == t.ThreadGroup() { + return true + } + } + return false +} + +// Lookup loads an Inode in /proc/TID/fd into a Dirent. +func (f *fdDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + n, err := walkDescriptors(f.t, p, func(file *fs.File, _ kernel.FDFlags) *fs.Inode { + return newFd(f.t, file, dir.MountSource) + }) + if err != nil { + return nil, err + } + return fs.NewDirent(n, p), nil +} + +// DeprecatedReaddir lists fds in /proc/TID/fd. +func (f *fdDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return readDescriptors(f.t, dirCtx, offset, func(fd int) fs.DentAttr { + return fs.GenericDentAttr(fs.Symlink, device.ProcDevice) + }) +} + +// fdInfo is a single file in /proc/TID/fdinfo/. +type fdInfo struct { + ramfs.File + + flags kernel.FDFlags +} + +// newFdInfo returns a new fdInfo based on an existing file. +func newFdInfo(t *kernel.Task, _ *fs.File, flags kernel.FDFlags, msrc *fs.MountSource) *fs.Inode { + fdi := &fdInfo{flags: flags} + fdi.InitFile(t, fs.RootOwner, fs.FilePermissions{User: fs.PermMask{Read: true}}) + // TODO: Get pos, locks, and other data. For now we only + // have flags. + // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt + fdi.Append([]byte(fmt.Sprintf("flags: %08o\n", flags))) + return newFile(fdi, msrc, fs.SpecialFile, t) +} + +// DeprecatedPwritev implements fs.HandleOperations.DeprecatedPwritev. +func (*fdInfo) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + return 0, ramfs.ErrInvalidOp +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*fdInfo) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return ramfs.ErrInvalidOp +} + +// fdInfoDir implements /proc/TID/fdinfo. It embeds an fdDir, but overrides +// Lookup and Readdir. +type fdInfoDir struct { + ramfs.Dir + + t *kernel.Task +} + +// newFdInfoDir creates a new fdInfoDir. +func newFdInfoDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + fdid := &fdInfoDir{t: t} + fdid.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0500)) + return newFile(fdid, msrc, fs.SpecialDirectory, t) +} + +// Lookup loads an fd in /proc/TID/fdinfo into a Dirent. +func (fdid *fdInfoDir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + n, err := walkDescriptors(fdid.t, p, func(file *fs.File, flags kernel.FDFlags) *fs.Inode { + return newFdInfo(fdid.t, file, flags, dir.MountSource) + }) + if err != nil { + return nil, err + } + return fs.NewDirent(n, p), nil +} + +// DeprecatedReaddir lists fds in /proc/TID/fdinfo. +func (fdid *fdInfoDir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + return readDescriptors(fdid.t, dirCtx, offset, func(fd int) fs.DentAttr { + return fs.GenericDentAttr(fs.RegularFile, device.ProcDevice) + }) +} diff --git a/pkg/sentry/fs/proc/file.go b/pkg/sentry/fs/proc/file.go new file mode 100644 index 000000000..9a433cdf8 --- /dev/null +++ b/pkg/sentry/fs/proc/file.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type file struct { + fs.InodeOperations + + // nodeType is the file type of this file. + nodeType fs.InodeType + + // t is the associated kernel task that owns this file. + t *kernel.Task +} + +func newFile(node fs.InodeOperations, msrc *fs.MountSource, nodeType fs.InodeType, t *kernel.Task) *fs.Inode { + iops := &file{node, nodeType, t} + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: nodeType, + } + return fs.NewInode(iops, msrc, sattr) +} + +// UnstableAttr returns all attributes of this file. +func (f *file) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := f.InodeOperations.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + if f.t != nil { + uattr.Owner = fs.FileOwnerFromContext(f.t) + } + return uattr, nil +} diff --git a/pkg/sentry/fs/proc/filesystems.go b/pkg/sentry/fs/proc/filesystems.go new file mode 100644 index 000000000..fe4de18ba --- /dev/null +++ b/pkg/sentry/fs/proc/filesystems.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" +) + +// filesystemsData backs /proc/filesystems. +type filesystemsData struct{} + +// NeedsUpdate returns true on the first generation. The set of registered file +// systems doesn't change so there's no need to generate SeqData more than once. +func (*filesystemsData) NeedsUpdate(generation int64) bool { + return generation == 0 +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. +func (*filesystemsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + // We don't ever expect to see a non-nil SeqHandle. + if h != nil { + return nil, 0 + } + + // Generate the file contents. + var buf bytes.Buffer + for _, sys := range fs.GetFilesystems() { + nodev := "nodev" + if sys.Flags()&fs.FilesystemRequiresDev != 0 { + nodev = "" + } + // Matches the format of fs/filesystems.c:filesystems_proc_show. 
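+		// For example, a virtual filesystem such as proc produces the line
+		// "nodev\tproc\n", while a filesystem that requires a real device
+		// leaves the first column empty.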
+ fmt.Fprintf(&buf, "%s\t%s\n", nodev, sys.Name()) + } + + // Return the SeqData and advance the generation counter. + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*filesystemsData)(nil)}}, 1 +} diff --git a/pkg/sentry/fs/proc/fs.go b/pkg/sentry/fs/proc/fs.go new file mode 100644 index 000000000..072d00beb --- /dev/null +++ b/pkg/sentry/fs/proc/fs.go @@ -0,0 +1,69 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// filesystem is a procfs. +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name under which the filesystem is registered. +// Name matches fs/proc/root.c:proc_fs_type.name. +const FilesystemName = "proc" + +// Name is the name of the file system. +func (*filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount allows users to mount(2) this file system. +func (*filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, proc returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/proc/root.c. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns the root of a procfs that can be positioned in the vfs. +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + + // Parse generic comma-separated key=value options; this file system expects them. + options := fs.GenericMountSourceOptions(data) + + // Proc options parsing checks for either a gid= or hidepid= and barfs on + // anything else, see fs/proc/root.c:proc_parse_options. Since we don't know + // what to do with gid= or hidepid=, we blow up if we get any options. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // Construct the procfs root. Since procfs files are all virtual, we + // never want them cached. + return New(ctx, fs.NewNonCachingMountSource(f, flags)) +} diff --git a/pkg/sentry/fs/proc/loadavg.go b/pkg/sentry/fs/proc/loadavg.go new file mode 100644 index 000000000..694cde656 --- /dev/null +++ b/pkg/sentry/fs/proc/loadavg.go @@ -0,0 +1,51 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" +) + +// loadavgData backs /proc/loadavg. +type loadavgData struct{} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*loadavgData) NeedsUpdate(generation int64) bool { + return true +} + +func (d *loadavgData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + + // TODO: Include real data in fields. + // Column 1-3: CPU and IO utilization of the last 1, 5, and 15 minute periods. + // Column 4-5: currently running processes and the total number of processes. + // Column 6: the last process ID used. + fmt.Fprintf(&buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) + + return []seqfile.SeqData{ + { + Buf: buf.Bytes(), + Handle: (*loadavgData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/proc/meminfo.go b/pkg/sentry/fs/proc/meminfo.go new file mode 100644 index 000000000..489f796e5 --- /dev/null +++ b/pkg/sentry/fs/proc/meminfo.go @@ -0,0 +1,82 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// meminfoData backs /proc/meminfo. +type meminfoData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*meminfoData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (d *meminfoData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + mem := d.k.Platform.Memory() + mem.UpdateUsage() + snapshot, totalUsage := usage.MemoryAccounting.Copy() + totalSize := usage.TotalMemory(mem.TotalSize(), totalUsage) + anon := snapshot.Anonymous + snapshot.Tmpfs + file := snapshot.PageCache + snapshot.Mapped + // We don't actually have active/inactive LRUs, so just make up numbers. + activeFile := (file / 2) &^ (usermem.PageSize - 1) + inactiveFile := file - activeFile + + var buf bytes.Buffer + fmt.Fprintf(&buf, "MemTotal:       %8d kB\n", totalSize/1024) + memFree := (totalSize - totalUsage) / 1024 + // We use MemFree as MemAvailable because we don't swap. + // TODO: When reclaim is implemented the value of MemAvailable + // should change. + fmt.Fprintf(&buf, "MemFree:        %8d kB\n", memFree) + fmt.Fprintf(&buf, "MemAvailable:   %8d kB\n", memFree) + fmt.Fprintf(&buf, "Buffers:               0 kB\n") // memory usage by block devices + fmt.Fprintf(&buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024) + // Emulate a system with no swap, which disables inactivation of anon pages.
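+	// As a result, all anonymous memory is reported under Active(anon),
+	// Inactive(anon) stays at zero, and only the file-backed estimate above
+	// is split between the active and inactive lists.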
+ fmt.Fprintf(&buf, "SwapCache: 0 kB\n") + fmt.Fprintf(&buf, "Active: %8d kB\n", (anon+activeFile)/1024) + fmt.Fprintf(&buf, "Inactive: %8d kB\n", inactiveFile/1024) + fmt.Fprintf(&buf, "Active(anon): %8d kB\n", anon/1024) + fmt.Fprintf(&buf, "Inactive(anon): 0 kB\n") + fmt.Fprintf(&buf, "Active(file): %8d kB\n", activeFile/1024) + fmt.Fprintf(&buf, "Inactive(file): %8d kB\n", inactiveFile/1024) + fmt.Fprintf(&buf, "Unevictable: 0 kB\n") // TODO + fmt.Fprintf(&buf, "Mlocked: 0 kB\n") // TODO + fmt.Fprintf(&buf, "SwapTotal: 0 kB\n") + fmt.Fprintf(&buf, "SwapFree: 0 kB\n") + fmt.Fprintf(&buf, "Dirty: 0 kB\n") + fmt.Fprintf(&buf, "Writeback: 0 kB\n") + fmt.Fprintf(&buf, "AnonPages: %8d kB\n", anon/1024) + fmt.Fprintf(&buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know + fmt.Fprintf(&buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*meminfoData)(nil)}}, 0 +} diff --git a/pkg/sentry/fs/proc/mounts.go b/pkg/sentry/fs/proc/mounts.go new file mode 100644 index 000000000..76092567d --- /dev/null +++ b/pkg/sentry/fs/proc/mounts.go @@ -0,0 +1,176 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "sort" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// forEachMountSource runs f for the process root mount and each mount that is a +// descendant of the root. +func forEachMountSource(t *kernel.Task, fn func(string, *fs.MountSource)) { + // All mount points must be relative to the rootDir, and mounts outside + // will be excluded. + rootDir := t.FSContext().RootDirectory() + defer rootDir.DecRef() + + if rootDir.Inode == nil { + panic(fmt.Sprintf("root dirent has nil inode: %+v", rootDir)) + } + if rootDir.Inode.MountSource == nil { + panic(fmt.Sprintf("root dirent has nil mount: %+v", rootDir)) + } + + ms := append(rootDir.Inode.MountSource.Submounts(), rootDir.Inode.MountSource) + sort.Slice(ms, func(i, j int) bool { + return ms[i].ID() < ms[j].ID() + }) + for _, m := range ms { + mountPath, desc := m.Root().FullName(rootDir) + if !desc { + // MountSources that are not descendants of the chroot jail are ignored. + continue + } + + fn(mountPath, m) + } +} + +// mountInfoFile is used to implement /proc/[pid]/mountinfo. +type mountInfoFile struct { + t *kernel.Task +} + +// NeedsUpdate implements SeqSource.NeedsUpdate. +func (mif *mountInfoFile) NeedsUpdate(_ int64) bool { + return true +} + +// ReadSeqFileData implements SeqSource.ReadSeqFileData. 
+func (mif *mountInfoFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if handle != nil { + return nil, 0 + } + + var buf bytes.Buffer + forEachMountSource(mif.t, func(mountPath string, m *fs.MountSource) { + // Format: + // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + // (1) MountSource ID. + fmt.Fprintf(&buf, "%d ", m.ID()) + + // (2) Parent ID (or this ID if there is no parent). + pID := m.ID() + if p := m.Parent(); p != nil { + pID = p.ID() + } + fmt.Fprintf(&buf, "%d ", pID) + + // (3) Major:Minor device ID. We don't have a superblock, so we + // just use the root inode device number. + sa := m.Root().Inode.StableAttr + fmt.Fprintf(&buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor) + + // (4) Root: the pathname of the directory in the filesystem + // which forms the root of this mount. + // + // NOTE: This will always be "/" until we implement + // bind mounts. + fmt.Fprintf(&buf, "/ ") + + // (5) Mount point (relative to process root). + fmt.Fprintf(&buf, "%s ", mountPath) + + // (6) Mount options. + opts := "rw" + if m.Flags.ReadOnly { + opts = "ro" + } + if m.Flags.NoAtime { + opts += ",noatime" + } + fmt.Fprintf(&buf, "%s ", opts) + + // (7) Optional fields: zero or more fields of the form "tag[:value]". + // (8) Separator: the end of the optional fields is marked by a single hyphen. + fmt.Fprintf(&buf, "- ") + + // (9) Filesystem type. + name := "none" + if m.Filesystem != nil { + name = m.Filesystem.Name() + } + fmt.Fprintf(&buf, "%s ", name) + + // (10) Mount source: filesystem-specific information or "none". + fmt.Fprintf(&buf, "none ") + + // (11) Superblock options. Only "ro/rw" is supported for now, + // and is the same as the filesystem option. + fmt.Fprintf(&buf, "%s\n", opts) + }) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountInfoFile)(nil)}}, 0 +} + +// mountsFile is used to implement /proc/[pid]/mountinfo. +type mountsFile struct { + t *kernel.Task +} + +// NeedsUpdate implements SeqSource.NeedsUpdate. +func (mf *mountsFile) NeedsUpdate(_ int64) bool { + return true +} + +// ReadSeqFileData implements SeqSource.ReadSeqFileData. +func (mf *mountsFile) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if handle != nil { + return nil, 0 + } + + var buf bytes.Buffer + forEachMountSource(mf.t, func(mountPath string, m *fs.MountSource) { + // Format (tab-separated): + // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order> + // + // We use the filesystem name as the first field, since there + // is no real block device we can point to, and we also should + // not expose anything about the remote filesystem. + // + // Only ro/rw option is supported for now. + // + // The "needs dump"and fsck flags are always 0, which is allowed. + opts := "rw" + if m.Flags.ReadOnly { + opts = "ro" + } + name := "none" + if m.Filesystem != nil { + name = m.Filesystem.Name() + } + fmt.Fprintf(&buf, "%s\t%s\t%s\t%s\t%d\t%d\n", "none", mountPath, name, opts, 0, 0) + }) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*mountsFile)(nil)}}, 0 +} diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go new file mode 100644 index 000000000..6e464857a --- /dev/null +++ b/pkg/sentry/fs/proc/net.go @@ -0,0 +1,151 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +// newNet creates a new proc net entry. +func (p *proc) newNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + if s := p.k.NetworkStack(); s != nil && s.SupportsIPv6() { + d.AddChild(ctx, "dev", seqfile.NewSeqFileInode(ctx, &netDev{s: s}, msrc)) + d.AddChild(ctx, "if_inet6", seqfile.NewSeqFileInode(ctx, &ifinet6{s: s}, msrc)) + } + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +// ifinet6 implements seqfile.SeqSource for /proc/net/if_inet6. +type ifinet6 struct { + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +func (n *ifinet6) contents() []string { + var lines []string + nics := n.s.Interfaces() + for id, naddrs := range n.s.InterfaceAddrs() { + nic, ok := nics[id] + if !ok { + // NIC was added after NICNames was called. We'll just + // ignore it. + continue + } + + for _, a := range naddrs { + // IPv6 only. + if a.Family != linux.AF_INET6 { + continue + } + + // Fields: + // IPv6 address displayed in 32 hexadecimal chars without colons + // Netlink device number (interface index) in hexadecimal (use nic id) + // Prefix length in hexadecimal + // Scope value (use 0) + // Interface flags + // Device name + lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) + } + } + return lines +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*ifinet6) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (n *ifinet6) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var data []seqfile.SeqData + for _, l := range n.contents() { + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + } + + return data, 0 +} + +// netDev implements seqfile.SeqSource for /proc/net/dev. +type netDev struct { + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (n *netDev) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. See Linux's +// net/core/net-procfs.c:dev_seq_show. +func (n *netDev) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + interfaces := n.s.Interfaces() + contents := make([]string, 2, 2+len(interfaces)) + // Add the table header. From net/core/net-procfs.c:dev_seq_show. 
+ contents[0] = "Inter-| Receive | Transmit\n" + contents[1] = " face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n" + + for _, i := range interfaces { + // TODO: Collect stats from each inet.Stack + // implementation (hostinet, epsocket, and rpcinet). + + // Implements the same format as + // net/core/net-procfs.c:dev_seq_printf_stats. + l := fmt.Sprintf("%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", + i.Name, + // Received + 0, // bytes + 0, // packets + 0, // errors + 0, // dropped + 0, // fifo + 0, // frame + 0, // compressed + 0, // multicast + // Transmitted + 0, // bytes + 0, // packets + 0, // errors + 0, // dropped + 0, // fifo + 0, // frame + 0, // compressed + 0) // multicast + contents = append(contents, l) + } + + var data []seqfile.SeqData + for _, l := range contents { + data = append(data, seqfile.SeqData{Buf: []byte(l), Handle: (*ifinet6)(nil)}) + } + + return data, 0 +} diff --git a/pkg/sentry/fs/proc/net_test.go b/pkg/sentry/fs/proc/net_test.go new file mode 100644 index 000000000..a31a20494 --- /dev/null +++ b/pkg/sentry/fs/proc/net_test.go @@ -0,0 +1,74 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "reflect" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" +) + +func newIPv6TestStack() *inet.TestStack { + s := inet.NewTestStack() + s.SupportsIPv6Flag = true + return s +} + +func TestIfinet6NoAddresses(t *testing.T) { + n := &ifinet6{s: newIPv6TestStack()} + if got := n.contents(); got != nil { + t.Errorf("Got n.contents() = %v, want = %v", got, nil) + } +} + +func TestIfinet6(t *testing.T) { + s := newIPv6TestStack() + s.InterfacesMap[1] = inet.Interface{Name: "eth0"} + s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"), + }, + } + s.InterfacesMap[2] = inet.Interface{Name: "eth1"} + s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{ + { + Family: linux.AF_INET6, + PrefixLen: 128, + Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"), + }, + } + want := map[string]struct{}{ + "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {}, + "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {}, + } + + n := &ifinet6{s: s} + contents := n.contents() + if len(contents) != len(want) { + t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want)) + } + got := map[string]struct{}{} + for _, l := range contents { + got[l] = struct{}{} + } + + if !reflect.DeepEqual(got, want) { + t.Errorf("Got n.contents() = %v, want = %v", got, want) + } +} diff --git a/pkg/sentry/fs/proc/proc.go b/pkg/sentry/fs/proc/proc.go new file mode 100644 index 000000000..459eb7e62 --- /dev/null +++ b/pkg/sentry/fs/proc/proc.go @@ -0,0 +1,182 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package proc implements a partial in-memory file system for profs. +package proc + +import ( + "fmt" + "sort" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// proc is a root proc node. +type proc struct { + ramfs.Dir + + // k is the Kernel containing this proc node. + k *kernel.Kernel + + // pidns is the PID namespace of the task that mounted the proc filesystem + // that this node represents. + pidns *kernel.PIDNamespace +} + +// New returns the root node of a partial simple procfs. +func New(ctx context.Context, msrc *fs.MountSource) (*fs.Inode, error) { + k := kernel.KernelFromContext(ctx) + if k == nil { + return nil, fmt.Errorf("procfs requires a kernel") + } + pidns := kernel.PIDNamespaceFromContext(ctx) + if pidns == nil { + return nil, fmt.Errorf("procfs requires a PID namespace") + } + + p := &proc{k: k, pidns: pidns} + p.InitDir(ctx, map[string]*fs.Inode{ + // Note that these are just the static members. There are + // dynamic members populated in Readdir and Lookup below. + "filesystems": seqfile.NewSeqFileInode(ctx, &filesystemsData{}, msrc), + "loadavg": seqfile.NewSeqFileInode(ctx, &loadavgData{}, msrc), + "meminfo": seqfile.NewSeqFileInode(ctx, &meminfoData{k}, msrc), + "mounts": newMountsSymlink(ctx, msrc), + "stat": seqfile.NewSeqFileInode(ctx, &statData{k}, msrc), + "version": seqfile.NewSeqFileInode(ctx, &versionData{k}, msrc), + }, fs.RootOwner, fs.FilePermsFromMode(0555)) + + p.AddChild(ctx, "cpuinfo", p.newCPUInfo(ctx, msrc)) + p.AddChild(ctx, "uptime", p.newUptime(ctx, msrc)) + + return newFile(p, msrc, fs.SpecialDirectory, nil), nil +} + +// self is a magical link. +type self struct { + ramfs.Symlink + + pidns *kernel.PIDNamespace +} + +// newSelf returns a new "self" node. +func (p *proc) newSelf(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + s := &self{pidns: p.pidns} + s.InitSymlink(ctx, fs.RootOwner, "") + return newFile(s, msrc, fs.Symlink, nil) +} + +// Readlink implements fs.InodeOperations.Readlink. +func (s *self) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if t := kernel.TaskFromContext(ctx); t != nil { + tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) + if tgid == 0 { + return "", ramfs.ErrNotFound + } + return strconv.FormatUint(uint64(tgid), 10), nil + } + + // Who is reading this link? + return "", ramfs.ErrInvalidOp +} + +// Lookup loads an Inode at name into a Dirent. +func (p *proc) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + // Is it one of the static ones? + dirent, walkErr := p.Dir.Lookup(ctx, dir, name) + if walkErr == nil { + return dirent, nil + } + + // Is it a dynamic element? 
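+	// net, self, and sys are not registered as static members of the
+	// directory above; a fresh Inode is synthesized for each of them on
+	// every lookup.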
+ nfs := map[string]func() *fs.Inode{ + "net": func() *fs.Inode { return p.newNetDir(ctx, dir.MountSource) }, + "self": func() *fs.Inode { return p.newSelf(ctx, dir.MountSource) }, + "sys": func() *fs.Inode { return p.newSysDir(ctx, dir.MountSource) }, + } + if nf, ok := nfs[name]; ok { + return fs.NewDirent(nf(), name), nil + } + + // Try to lookup a corresponding task. + tid, err := strconv.ParseUint(name, 10, 64) + if err != nil { + // Ignore the parse error and return the original. + return nil, walkErr + } + + // Grab the other task. + otherTask := p.pidns.TaskWithID(kernel.ThreadID(tid)) + if otherTask == nil { + // Per above. + return nil, walkErr + } + + // Wrap it in a taskDir. + td := newTaskDir(otherTask, dir.MountSource, p.pidns, true) + return fs.NewDirent(td, name), nil +} + +// Readdir synthesizes proc contents. +func (p *proc) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + // Serialize normal contents. + _, err := p.Dir.DeprecatedReaddir(ctx, dirCtx, offset) + if err != nil { + return offset, err + } + + m := make(map[string]fs.DentAttr) + var names []string + + // Add special files. + m["sys"] = fs.GenericDentAttr(fs.SpecialFile, device.ProcDevice) + names = append(names, "sys") + + // Collect tasks. + // Per linux we only include it in directory listings if it's the leader. + // But for whatever crazy reason, you can still walk to the given node. + for _, tg := range p.pidns.ThreadGroups() { + if leader := tg.Leader(); leader != nil { + name := strconv.FormatUint(uint64(tg.ID()), 10) + m[name] = fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) + names = append(names, name) + } + } + + if offset >= len(m) { + return offset, nil + } + sort.Strings(names) + names = names[offset:] + for _, name := range names { + if err := dirCtx.DirEmit(name, m[name]); err != nil { + return offset, err + } + offset++ + } + return offset, err +} + +// newMountsSymlink returns a symlink to "self/mounts" +func newMountsSymlink(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + s := &ramfs.Symlink{} + s.InitSymlink(ctx, fs.RootOwner, "self/mounts") + return newFile(s, msrc, fs.Symlink, nil) +} diff --git a/pkg/sentry/fs/proc/seqfile/BUILD b/pkg/sentry/fs/proc/seqfile/BUILD new file mode 100644 index 000000000..48dd25e5b --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/BUILD @@ -0,0 +1,55 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "seqfile_state", + srcs = [ + "seqfile.go", + ], + out = "seqfile_state.go", + package = "seqfile", +) + +go_library( + name = "seqfile", + srcs = [ + "seqfile.go", + "seqfile_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/proc/device", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + ], +) + +go_stateify( + name = "seqfile_test_state", + srcs = ["seqfile_test.go"], + out = "seqfile_test_state.go", + package = "seqfile", +) + +go_test( + name = "seqfile_test", + size = "small", + srcs = [ + "seqfile_test.go", + "seqfile_test_state.go", + ], + embed = [":seqfile"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs/test", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/proc/seqfile/seqfile.go 
b/pkg/sentry/fs/proc/seqfile/seqfile.go new file mode 100644 index 000000000..e37a85869 --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/seqfile.go @@ -0,0 +1,232 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqfile + +import ( + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// SeqHandle is a helper handle to seek in the file. +type SeqHandle interface{} + +// SeqData holds the data for one unit in the file. +type SeqData struct { + // The data to be returned to the user. + Buf []byte + + // A seek handle used to find the next valid unit in ReadSeqFiledata. + Handle SeqHandle +} + +// SeqSource is a data source for a SeqFile file. +type SeqSource interface { + // NeedsUpdate returns true if the consumer of SeqData should call + // ReadSeqFileData again. Generation is the generation returned by + // ReadSeqFile or 0. + NeedsUpdate(generation int64) bool + + // Returns a slice of SeqData ordered by unit and the current + // generation. The first entry in the slice is greater than the handle. + // If handle is nil then all known records are returned. Generation + // must always be greater than 0. + ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) +} + +// SeqGenerationCounter is a counter to keep track if the SeqSource should be +// updated. SeqGenerationCounter is not thread-safe and should be protected +// with a mutex. +type SeqGenerationCounter struct { + // The generation that the SeqData is at. + generation int64 +} + +// SetGeneration sets the generation to the new value, be careful to not set it +// to a value less than current. +func (s *SeqGenerationCounter) SetGeneration(generation int64) { + s.generation = generation +} + +// Update increments the current generation. +func (s *SeqGenerationCounter) Update() { + s.generation++ +} + +// Generation returns the current generation counter. +func (s *SeqGenerationCounter) Generation() int64 { + return s.generation +} + +// IsCurrent returns whether the given generation is current or not. +func (s *SeqGenerationCounter) IsCurrent(generation int64) bool { + return s.Generation() == generation +} + +// SeqFile is used to provide dynamic files that can be ordered by record. +type SeqFile struct { + ramfs.Entry + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + SeqSource + + source []SeqData + generation int64 + lastRead int64 +} + +// NewSeqFile returns a seqfile suitable for use by external consumers. +func NewSeqFile(ctx context.Context, source SeqSource) *SeqFile { + s := &SeqFile{SeqSource: source} + s.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return s +} + +// NewSeqFileInode returns an Inode with SeqFile InodeOperations. 
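The SeqSource interface and SeqGenerationCounter above are the two extension points for seqfile-backed proc files. Below is a minimal sketch of a source that bumps its generation whenever a record is added, written as if it lived in the seqfile package; lineSource, newLineSource and addLine are illustrative names, not part of the package, and the mutex the SeqGenerationCounter documentation asks for is omitted for brevity.

// lineSource is a hypothetical SeqSource whose records are text lines and
// whose handles are line indices.
type lineSource struct {
	counter SeqGenerationCounter
	lines   []string
}

// newLineSource returns an empty source. Generations must be greater than
// zero, so the counter is advanced once up front.
func newLineSource() *lineSource {
	s := &lineSource{}
	s.counter.Update()
	return s
}

// addLine appends a record and marks previously read generations as stale.
func (l *lineSource) addLine(s string) {
	l.lines = append(l.lines, s)
	l.counter.Update()
}

// NeedsUpdate implements SeqSource.NeedsUpdate.
func (l *lineSource) NeedsUpdate(generation int64) bool {
	return !l.counter.IsCurrent(generation)
}

// ReadSeqFileData implements SeqSource.ReadSeqFileData: it returns every
// record after the given handle (or all records for a nil handle) plus the
// current generation.
func (l *lineSource) ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) {
	start := 0
	if handle != nil {
		start = handle.(int) + 1
	}
	var data []SeqData
	for i := start; i < len(l.lines); i++ {
		data = append(data, SeqData{Buf: []byte(l.lines[i] + "\n"), Handle: i})
	}
	return data, l.counter.Generation()
}

A proc file backed by such a source would be created with NewSeqFileInode(ctx, newLineSource(), msrc), the same way the static entries in proc.New above register their data sources.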
+func NewSeqFileInode(ctx context.Context, source SeqSource, msrc *fs.MountSource) *fs.Inode { + iops := NewSeqFile(ctx, source) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(iops, msrc, sattr) +} + +// UnstableAttr returns unstable attributes of the SeqFile. +func (s *SeqFile) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr.ModificationTime = ktime.NowFromContext(ctx) + return uattr, nil +} + +// findIndexAndOffset finds the unit that corresponds to a certain offset. +// Returns the unit and the offset within the unit. If there are not enough +// units len(data) and leftover offset is returned. +func findIndexAndOffset(data []SeqData, offset int64) (int, int64) { + for i, buf := range data { + l := int64(len(buf.Buf)) + if offset < l { + return i, offset + } + offset -= l + } + return len(data), offset +} + +// DeprecatedPreadv reads from the file at the given offset. +func (s *SeqFile) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.Entry.NotifyAccess(ctx) + defer func() { s.lastRead = offset }() + + updated := false + + // Try to find where we should start reading this file. + i, recordOffset := findIndexAndOffset(s.source, offset) + if i == len(s.source) { + // Ok, we're at EOF. Let's first check to see if there might be + // more data available to us. If there is more data, add it to + // the end and try reading again. + if !s.SeqSource.NeedsUpdate(s.generation) { + return 0, io.EOF + } + oldLen := len(s.source) + s.updateSourceLocked(len(s.source)) + updated = true + // We know that we had consumed everything up until this point + // so we search in the new slice instead of starting over. + i, recordOffset = findIndexAndOffset(s.source[oldLen:], recordOffset) + i += oldLen + // i is at most the length of the slice which is + // len(s.source) - oldLen. So at most i will be equal to + // len(s.source). + if i == len(s.source) { + return 0, io.EOF + } + } + + var done int64 + // We're reading parts of a record, finish reading the current object + // before continuing on to the next. We don't refresh our data source + // before this record is completed. + if recordOffset != 0 { + n, err := dst.CopyOut(ctx, s.source[i].Buf[recordOffset:]) + done += int64(n) + dst = dst.DropFirst(n) + if dst.NumBytes() == 0 || err != nil { + return done, err + } + i++ + } + + // Next/New unit, update the source file if necessary. Make an extra + // check to see if we've seeked backwards and if so always update our + // data source. + if !updated && (s.SeqSource.NeedsUpdate(s.generation) || s.lastRead > offset) { + s.updateSourceLocked(i) + // recordOffset is 0 here and we won't update records behind the + // current one so recordOffset is still 0 even though source + // just got updated. Just read the next record. + } + + // Finish by reading all the available data. + for _, buf := range s.source[i:] { + n, err := dst.CopyOut(ctx, buf.Buf) + done += int64(n) + dst = dst.DropFirst(n) + if dst.NumBytes() == 0 || err != nil { + return done, err + } + } + + // If the file shrank (entries not yet read were removed above) + // while we tried to read we can end up with nothing read. 
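findIndexAndOffset is the core of the offset-to-record mapping used by the read path above: a file offset is walked across the per-record buffers until it lands inside one. A small standalone sketch (with a local copy of the helper operating on plain byte slices rather than SeqData) shows how offsets map to a (record index, offset within record) pair:

package main

import "fmt"

// findIndexAndOffset mirrors the seqfile helper: given per-record buffers and
// a file offset, it returns the record index and the offset within that
// record. If the offset is past the end, it returns len(records) and the
// leftover offset.
func findIndexAndOffset(records [][]byte, offset int64) (int, int64) {
	for i, b := range records {
		if offset < int64(len(b)) {
			return i, offset
		}
		offset -= int64(len(b))
	}
	return len(records), offset
}

func main() {
	records := [][]byte{
		[]byte("0000000000"), // record 0: bytes 0-9
		[]byte("1111111111"), // record 1: bytes 10-19
		[]byte("2222222222"), // record 2: bytes 20-29
	}
	for _, off := range []int64{0, 4, 10, 15, 29, 30, 42} {
		i, rem := findIndexAndOffset(records, off)
		fmt.Printf("offset %2d -> record %d, offset within record %d\n", off, i, rem)
	}
	// offset 15 lands in record 1 at byte 5; offset 30 and beyond report
	// index 3 (== len(records)), which the reader treats as EOF unless the
	// source has fresh data.
}

This mapping is also why a read that starts partway through a record drains the rest of that record before the data source is allowed to refresh.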
+ if done == 0 && dst.NumBytes() != 0 { + return 0, io.EOF + } + return done, nil +} + +// updateSourceLocked requires that s.mu is held. +func (s *SeqFile) updateSourceLocked(record int) { + var h SeqHandle + if record == 0 { + h = nil + } else { + h = s.source[record-1].Handle + } + // Save what we have previously read. + s.source = s.source[:record] + var newSource []SeqData + newSource, s.generation = s.SeqSource.ReadSeqFileData(h) + s.source = append(s.source, newSource...) +} + +// DeprecatedPwritev is always denied. +func (*SeqFile) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ramfs.ErrDenied +} diff --git a/pkg/sentry/fs/proc/seqfile/seqfile_test.go b/pkg/sentry/fs/proc/seqfile/seqfile_test.go new file mode 100644 index 000000000..0bf39ad82 --- /dev/null +++ b/pkg/sentry/fs/proc/seqfile/seqfile_test.go @@ -0,0 +1,272 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package seqfile + +import ( + "bytes" + "fmt" + "io" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + ramfstest "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type seqTest struct { + actual []SeqData + update bool +} + +func (s *seqTest) Init() { + var sq []SeqData + // Create some SeqData. + for i := 0; i < 10; i++ { + var b []byte + for j := 0; j < 10; j++ { + b = append(b, byte(i)) + } + sq = append(sq, SeqData{ + Buf: b, + Handle: &testHandle{i: i}, + }) + } + s.actual = sq +} + +// NeedsUpdate reports whether we need to update the data we've previously read. +func (s *seqTest) NeedsUpdate(int64) bool { + return s.update +} + +// ReadSeqFiledata returns a slice of SeqData which contains elements +// greater than the handle. +func (s *seqTest) ReadSeqFileData(handle SeqHandle) ([]SeqData, int64) { + if handle == nil { + return s.actual, 0 + } + h := *handle.(*testHandle) + var ret []SeqData + for _, b := range s.actual { + // We want the next one. + h2 := *b.Handle.(*testHandle) + if h2.i > h.i { + ret = append(ret, b) + } + } + return ret, 0 +} + +// Flatten a slice of slices into one slice. +func flatten(buf ...[]byte) []byte { + var flat []byte + for _, b := range buf { + flat = append(flat, b...) 
+ } + return flat +} + +type testHandle struct { + i int +} + +type testTable struct { + offset int64 + readBufferSize int + expectedData []byte + expectedError error +} + +func runTableTests(ctx context.Context, table []testTable, n fs.InodeOperations) error { + for _, tt := range table { + data := make([]byte, tt.readBufferSize) + resultLen, err := n.DeprecatedPreadv(ctx, usermem.BytesIOSequence(data), tt.offset) + if err != tt.expectedError { + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (error) => %v expected %v", tt.readBufferSize, tt.offset, err, tt.expectedError) + } + expectedLen := int64(len(tt.expectedData)) + if resultLen != expectedLen { + // We make this just an error so we wall through and print the data below. + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (size) => %v expected %v", tt.readBufferSize, tt.offset, resultLen, expectedLen) + } + if !bytes.Equal(data[:expectedLen], tt.expectedData) { + return fmt.Errorf("t.Preadv(len: %v, offset: %v) (data) => %v expected %v", tt.readBufferSize, tt.offset, data[:expectedLen], tt.expectedData) + } + } + return nil +} + +func TestSeqFile(t *testing.T) { + testSource := &seqTest{} + testSource.Init() + + // Create a file that can be R/W. + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + ctx := contexttest.Context(t) + contents := map[string]*fs.Inode{ + "foo": NewSeqFileInode(ctx, testSource, m), + } + root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + + // How about opening it? + inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) + dirent2, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo for n2: %v", err) + } + n2 := dirent2.Inode.InodeOperations + + // Writing? + if _, err := n2.DeprecatedPwritev(nil, usermem.BytesIOSequence([]byte("test")), 0); err == nil { + t.Fatalf("managed to write to n2: %v", err) + } + + // How about reading? + dirent3, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo: %v", err) + } + n3 := dirent3.Inode.InodeOperations + + if n2 != n3 { + t.Error("got n2 != n3, want same") + } + + testSource.update = true + + table := []testTable{ + // Read past the end. + {100, 4, []byte{}, io.EOF}, + {110, 4, []byte{}, io.EOF}, + {200, 4, []byte{}, io.EOF}, + // Read a truncated first line. + {0, 4, testSource.actual[0].Buf[:4], nil}, + // Read the whole first line. + {0, 10, testSource.actual[0].Buf, nil}, + // Read the whole first line + 5 bytes of second line. + {0, 15, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:5]), nil}, + // First 4 bytes of the second line. + {10, 4, testSource.actual[1].Buf[:4], nil}, + // Read the two first lines. + {0, 20, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf), nil}, + // Read three lines. + {0, 30, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf, testSource.actual[2].Buf), nil}, + // Read everything, but use a bigger buffer than necessary. + {0, 150, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf, testSource.actual[2].Buf, testSource.actual[3].Buf, testSource.actual[4].Buf, testSource.actual[5].Buf, testSource.actual[6].Buf, testSource.actual[7].Buf, testSource.actual[8].Buf, testSource.actual[9].Buf), nil}, + // Read the last 3 bytes. + {97, 10, testSource.actual[9].Buf[7:], nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed with testSource.update = %v : %v", testSource.update, err) + } + + // Disable updates and do it again. 
+ testSource.update = false + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed with testSource.update = %v: %v", testSource.update, err) + } +} + +// Test that we behave correctly when the file is updated. +func TestSeqFileFileUpdated(t *testing.T) { + testSource := &seqTest{} + testSource.Init() + testSource.update = true + + // Create a file that can be R/W. + m := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + ctx := contexttest.Context(t) + contents := map[string]*fs.Inode{ + "foo": NewSeqFileInode(ctx, testSource, m), + } + root := ramfstest.NewDir(ctx, contents, fs.FilePermsFromMode(0777)) + + // How about opening it? + inode := fs.NewInode(root, m, fs.StableAttr{Type: fs.Directory}) + dirent2, err := root.Lookup(ctx, inode, "foo") + if err != nil { + t.Fatalf("failed to walk to foo for n2: %v", err) + } + n2 := dirent2.Inode.InodeOperations + + table := []testTable{ + {0, 16, flatten(testSource.actual[0].Buf, testSource.actual[1].Buf[:6]), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed: %v", err) + } + // Delete the first entry. + cut := testSource.actual[0].Buf + testSource.actual = testSource.actual[1:] + + table = []testTable{ + // Try reading buffer 0 with an offset. This will not delete the old data. + {1, 5, cut[1:6], nil}, + // Reset our file by reading at offset 0. + {0, 10, testSource.actual[0].Buf, nil}, + {16, 14, flatten(testSource.actual[1].Buf[6:], testSource.actual[2].Buf), nil}, + // Read the same data a second time. + {16, 14, flatten(testSource.actual[1].Buf[6:], testSource.actual[2].Buf), nil}, + // Read the following two lines. + {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after removing first entry: %v", err) + } + + // Add a new duplicate line in the middle (6666...) + after := testSource.actual[5:] + testSource.actual = testSource.actual[:4] + // Note the list must be sorted. + testSource.actual = append(testSource.actual, after[0]) + testSource.actual = append(testSource.actual, after...) + + table = []testTable{ + {50, 20, flatten(testSource.actual[4].Buf, testSource.actual[5].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after adding middle entry: %v", err) + } + // This will be used in a later test. + oldTestData := testSource.actual + + // Delete everything. + testSource.actual = testSource.actual[:0] + table = []testTable{ + {20, 20, []byte{}, io.EOF}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after removing all entries: %v", err) + } + // Restore some of the data. + testSource.actual = oldTestData[:1] + table = []testTable{ + {6, 20, testSource.actual[0].Buf[6:], nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after adding first entry back: %v", err) + } + + // Re-extend the data + testSource.actual = oldTestData + table = []testTable{ + {30, 20, flatten(testSource.actual[3].Buf, testSource.actual[4].Buf), nil}, + } + if err := runTableTests(ctx, table, n2); err != nil { + t.Errorf("runTableTest failed after extending testSource: %v", err) + } +} diff --git a/pkg/sentry/fs/proc/stat.go b/pkg/sentry/fs/proc/stat.go new file mode 100644 index 000000000..dee836a05 --- /dev/null +++ b/pkg/sentry/fs/proc/stat.go @@ -0,0 +1,139 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// statData backs /proc/stat. +type statData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*statData) NeedsUpdate(generation int64) bool { + return true +} + +// cpuStats contains the breakdown of CPU time for /proc/stat. +type cpuStats struct { + // user is time spent in userspace tasks with non-positive niceness. + user uint64 + + // nice is time spent in userspace tasks with positive niceness. + nice uint64 + + // system is time spent in non-interrupt kernel context. + system uint64 + + // idle is time spent idle. + idle uint64 + + // ioWait is time spent waiting for IO. + ioWait uint64 + + // irq is time spent in interrupt context. + irq uint64 + + // softirq is time spent in software interrupt context. + softirq uint64 + + // steal is involuntary wait time. + steal uint64 + + // guest is time spent in guests with non-positive niceness. + guest uint64 + + // guestNice is time spent in guests with positive niceness. + guestNice uint64 +} + +// String implements fmt.Stringer. +func (c cpuStats) String() string { + return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (s *statData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + + // TODO: We currently export only zero CPU stats. We could + // at least provide some aggregate stats. + var cpu cpuStats + fmt.Fprintf(&buf, "cpu %s\n", cpu) + + for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ { + fmt.Fprintf(&buf, "cpu%d %s\n", c, cpu) + } + + // The total number of interrupts is dependent on the CPUs and PCI + // devices on the system. See arch_probe_nr_irqs. + // + // Since we don't report real interrupt stats, just choose an arbitrary + // value from a representative VM. + const numInterrupts = 256 + + // The Kernel doesn't handle real interrupts, so report all zeroes. + // TODO: We could count page faults as #PF. + fmt.Fprintf(&buf, "intr 0") // total + for i := 0; i < numInterrupts; i++ { + fmt.Fprintf(&buf, " 0") + } + fmt.Fprintf(&buf, "\n") + + // Total number of context switches. + // TODO: Count this. + fmt.Fprintf(&buf, "ctxt 0\n") + + // CLOCK_REALTIME timestamp from boot, in seconds. + fmt.Fprintf(&buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds()) + + // Total number of clones. + // TODO: Count this. + fmt.Fprintf(&buf, "processes 0\n") + + // Number of runnable tasks. + // TODO: Count this. + fmt.Fprintf(&buf, "procs_running 0\n") + + // Number of tasks waiting on IO. + // TODO: Count this. 
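The /proc/stat data above is synthesized almost entirely from zeroes; only btime and the number of per-CPU lines (taken from Kernel.ApplicationCores) reflect real state. A standalone sketch of the per-CPU formatting, using a local copy of the cpuStats stringer and a fixed core count standing in for ApplicationCores:

package main

import (
	"bytes"
	"fmt"
)

// cpuStats mirrors the ten counters emitted for each "cpu" line:
// user nice system idle iowait irq softirq steal guest guest_nice.
type cpuStats struct {
	user, nice, system, idle, ioWait, irq, softirq, steal, guest, guestNice uint64
}

func (c cpuStats) String() string {
	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d",
		c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
}

func main() {
	var buf bytes.Buffer
	var cpu cpuStats // all zero, matching the sentry's current behavior
	const cores = 2  // stands in for Kernel.ApplicationCores()
	fmt.Fprintf(&buf, "cpu %s\n", cpu)
	for c := uint(0); c < cores; c++ {
		fmt.Fprintf(&buf, "cpu%d %s\n", c, cpu)
	}
	fmt.Print(buf.String())
	// cpu 0 0 0 0 0 0 0 0 0 0
	// cpu0 0 0 0 0 0 0 0 0 0 0
	// cpu1 0 0 0 0 0 0 0 0 0 0
}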
+ fmt.Fprintf(&buf, "procs_blocked 0\n") + + // Number of each softirq handled. + fmt.Fprintf(&buf, "softirq 0") // total + for i := 0; i < linux.NumSoftIRQ; i++ { + fmt.Fprintf(&buf, " 0") + } + fmt.Fprintf(&buf, "\n") + + return []seqfile.SeqData{ + { + Buf: buf.Bytes(), + Handle: (*statData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/proc/sys.go b/pkg/sentry/fs/proc/sys.go new file mode 100644 index 000000000..4323f3650 --- /dev/null +++ b/pkg/sentry/fs/proc/sys.go @@ -0,0 +1,117 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// hostname is a file containing the system hostname. +type hostname struct { + ramfs.Entry +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. +func (hostname) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + utsns := kernel.UTSNamespaceFromContext(ctx) + contents := []byte(utsns.HostName() + "\n") + + if offset >= int64(len(contents)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, contents[offset:]) + return int64(n), err +} + +func (p *proc) newHostname(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + h := &hostname{} + h.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(h, msrc, fs.SpecialFile, nil) +} + +// mmapMinAddrData backs /proc/sys/vm/mmap_min_addr. +type mmapMinAddrData struct { + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*mmapMinAddrData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (d *mmapMinAddrData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + return []seqfile.SeqData{ + { + Buf: []byte(fmt.Sprintf("%d\n", d.k.Platform.MinUserAddress())), + Handle: (*mmapMinAddrData)(nil), + }, + }, 0 +} + +type overcommitMemory struct{} + +func (*overcommitMemory) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource. 
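The hostname node above uses a pattern that recurs in several of the small proc files in this change (comm, auxvec, uptime): render the full contents, return io.EOF at or past the end, and otherwise copy out starting at the requested offset. A stdlib-only sketch of that read-at-offset behavior; the readAt helper name is invented for illustration and is not part of the package:

package main

import (
	"fmt"
	"io"
)

// readAt copies contents[offset:] into dst, mirroring the DeprecatedPreadv
// pattern used by the small proc files: EOF at or past the end, otherwise a
// partial copy bounded by the destination buffer.
func readAt(dst []byte, contents []byte, offset int64) (int, error) {
	if offset < 0 {
		return 0, fmt.Errorf("negative offset")
	}
	if offset >= int64(len(contents)) {
		return 0, io.EOF
	}
	return copy(dst, contents[offset:]), nil
}

func main() {
	contents := []byte("gvisor-host\n")
	buf := make([]byte, 4)

	n, err := readAt(buf, contents, 0)
	fmt.Printf("%q %v\n", buf[:n], err) // "gvis" <nil>

	n, err = readAt(buf, contents, 8)
	fmt.Printf("%q %v\n", buf[:n], err) // "ost\n" <nil>

	_, err = readAt(buf, contents, int64(len(contents)))
	fmt.Println(err) // EOF
}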
+func (*overcommitMemory) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + return []seqfile.SeqData{ + { + Buf: []byte("0\n"), + Handle: (*overcommitMemory)(nil), + }, + }, 0 +} + +func (p *proc) newKernelDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "hostname", p.newHostname(ctx, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newVMDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "mmap_min_addr", seqfile.NewSeqFileInode(ctx, &mmapMinAddrData{p.k}, msrc)) + d.AddChild(ctx, "overcommit_memory", seqfile.NewSeqFileInode(ctx, &overcommitMemory{}, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newSysDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + d.AddChild(ctx, "kernel", p.newKernelDir(ctx, msrc)) + d.AddChild(ctx, "vm", p.newVMDir(ctx, msrc)) + d.AddChild(ctx, "net", p.newSysNetDir(ctx, msrc)) + return newFile(d, msrc, fs.SpecialDirectory, nil) +} diff --git a/pkg/sentry/fs/proc/sys_net.go b/pkg/sentry/fs/proc/sys_net.go new file mode 100644 index 000000000..db44c95cb --- /dev/null +++ b/pkg/sentry/fs/proc/sys_net.go @@ -0,0 +1,188 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type tcpMemDir int + +const ( + tcpRMem tcpMemDir = iota + tcpWMem +) + +type tcpMem struct { + ramfs.Entry + s inet.Stack + size inet.TCPBufferSize + dir tcpMemDir +} + +func newTCPMem(s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *tcpMem { + return &tcpMem{s: s, size: size, dir: dir} +} + +func newTCPMemInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack, size inet.TCPBufferSize, dir tcpMemDir) *fs.Inode { + tm := newTCPMem(s, size, dir) + tm.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(tm, msrc, sattr) +} + +// DeprecatedPreadv implements fs.InodeOperations.DeprecatedPreadv. 
+func (m *tcpMem) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + s := fmt.Sprintf("%d\t%d\t%d\n", m.size.Min, m.size.Default, m.size.Max) + n, err := dst.CopyOut(ctx, []byte(s)) + return int64(n), err +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpMem) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (m *tcpMem) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + buf := []int32{int32(m.size.Min), int32(m.size.Default), int32(m.size.Max)} + n, cperr := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) + size := inet.TCPBufferSize{ + Min: int(buf[0]), + Default: int(buf[1]), + Max: int(buf[2]), + } + var err error + switch m.dir { + case tcpRMem: + err = m.s.SetTCPReceiveBufferSize(size) + case tcpWMem: + err = m.s.SetTCPSendBufferSize(size) + default: + panic(fmt.Sprintf("unknown tcpMem.dir: %v", m.dir)) + } + if err != nil { + return n, err + } + return n, cperr +} + +type tcpSack struct { + ramfs.Entry + s inet.Stack `state:"nosave"` // S/R-FIXME +} + +func newTCPSackInode(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + ts := &tcpSack{s: s} + ts.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialFile, + } + return fs.NewInode(ts, msrc, sattr) +} + +func (s *tcpSack) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset != 0 { + return 0, io.EOF + } + + sack, err := s.s.TCPSACKEnabled() + if err != nil { + return 0, err + } + + val := "0\n" + if sack { + // Technically, this is not quite compatible with Linux. Linux + // stores these as an integer, so if you write "2" into + // tcp_sack, you should get 2 back. Tough luck. + val = "1\n" + } + n, err := dst.CopyOut(ctx, []byte(val)) + return int64(n), err +} + +// Truncate implements fs.InodeOperations.Truncate. +func (*tcpSack) Truncate(context.Context, *fs.Inode, int64) error { + return nil +} + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (s *tcpSack) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + if src.NumBytes() == 0 { + return 0, nil + } + src = src.TakeFirst(usermem.PageSize - 1) + + var v int32 + n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) + if err != nil { + return n, err + } + return n, s.s.SetTCPSACKEnabled(v != 0) +} + +func newSysNetIPv4Dir(ctx context.Context, msrc *fs.MountSource, s inet.Stack) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + + // Add tcp_rmem. + if rs, err := s.TCPReceiveBufferSize(); err == nil { + d.AddChild(ctx, "tcp_rmem", newTCPMemInode(ctx, msrc, s, rs, tcpRMem)) + } + + // Add tcp_wmem. + if ss, err := s.TCPSendBufferSize(); err == nil { + d.AddChild(ctx, "tcp_wmem", newTCPMemInode(ctx, msrc, s, ss, tcpWMem)) + } + + // Add tcp_sack. 
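Writes to tcp_rmem and tcp_wmem accept up to three whitespace-separated integers that overlay the current min/default/max values in that order; a shorter write only replaces the leading fields, which the sys_net tests further down exercise directly. A standalone sketch of that overlay behavior; tcpBufferSize and apply are stand-ins that approximate what usermem.CopyInt32StringsInVec does for this file:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// tcpBufferSize mirrors inet.TCPBufferSize: minimum, default and maximum
// socket buffer sizes, as exposed in tcp_rmem/tcp_wmem.
type tcpBufferSize struct {
	Min, Default, Max int
}

// apply overlays the integers found in s onto the current sizes, in
// min/default/max order; fields without a corresponding value keep their
// old setting.
func (t tcpBufferSize) apply(s string) (tcpBufferSize, error) {
	fields := strings.Fields(s)
	dst := []*int{&t.Min, &t.Default, &t.Max}
	for i, f := range fields {
		if i >= len(dst) {
			break
		}
		v, err := strconv.ParseInt(f, 10, 32)
		if err != nil {
			return t, err
		}
		*dst[i] = int(v)
	}
	return t, nil
}

func main() {
	cur := tcpBufferSize{4096, 87380, 6291456}

	got, _ := cur.apply("8192\n")
	fmt.Println(got) // {8192 87380 6291456}: only min replaced

	got, _ = cur.apply("4096 16384 4194304\n")
	fmt.Println(got) // {4096 16384 4194304}: all three replaced
}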
+ d.AddChild(ctx, "tcp_sack", newTCPSackInode(ctx, msrc, s)) + + return newFile(d, msrc, fs.SpecialDirectory, nil) +} + +func (p *proc) newSysNetDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + if s := p.k.NetworkStack(); s != nil { + d.AddChild(ctx, "ipv4", newSysNetIPv4Dir(ctx, msrc, s)) + } + return newFile(d, msrc, fs.SpecialDirectory, nil) +} diff --git a/pkg/sentry/fs/proc/sys_net_test.go b/pkg/sentry/fs/proc/sys_net_test.go new file mode 100644 index 000000000..7ba392346 --- /dev/null +++ b/pkg/sentry/fs/proc/sys_net_test.go @@ -0,0 +1,121 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/inet" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestQuerySendBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + s.TCPSendBufSize = inet.TCPBufferSize{100, 200, 300} + tm := newTCPMem(s, s.TCPSendBufSize, tcpWMem) + + buf := make([]byte, 100) + dst := usermem.BytesIOSequence(buf) + n, err := tm.DeprecatedPreadv(ctx, dst, 0) + if err != nil { + t.Fatalf("DeprecatedPreadv failed: %v", err) + } + + if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { + t.Fatalf("Bad string: got %v, want %v", got, want) + } +} + +func TestQueryRecvBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + s.TCPRecvBufSize = inet.TCPBufferSize{100, 200, 300} + tm := newTCPMem(s, s.TCPRecvBufSize, tcpRMem) + + buf := make([]byte, 100) + dst := usermem.BytesIOSequence(buf) + n, err := tm.DeprecatedPreadv(ctx, dst, 0) + if err != nil { + t.Fatalf("DeprecatedPreadv failed: %v", err) + } + + if got, want := string(buf[:n]), "100\t200\t300\n"; got != want { + t.Fatalf("Bad string: got %v, want %v", got, want) + } +} + +var cases = []struct { + str string + initial inet.TCPBufferSize + final inet.TCPBufferSize +}{ + { + str: "", + initial: inet.TCPBufferSize{1, 2, 3}, + final: inet.TCPBufferSize{1, 2, 3}, + }, + { + str: "100\n", + initial: inet.TCPBufferSize{1, 100, 200}, + final: inet.TCPBufferSize{100, 100, 200}, + }, + { + str: "100 200 300\n", + initial: inet.TCPBufferSize{1, 2, 3}, + final: inet.TCPBufferSize{100, 200, 300}, + }, +} + +func TestConfigureSendBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + for _, c := range cases { + s.TCPSendBufSize = c.initial + tm := newTCPMem(s, c.initial, tcpWMem) + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. 
+ if s.TCPSendBufSize != c.final { + t.Errorf("TCPSendBufferSize, case = %q: got %v, wanted %v", c.str, s.TCPSendBufSize, c.final) + } + } +} + +func TestConfigureRecvBufferSize(t *testing.T) { + ctx := context.Background() + s := inet.NewTestStack() + for _, c := range cases { + s.TCPRecvBufSize = c.initial + tm := newTCPMem(s, c.initial, tcpRMem) + + // Write the values. + src := usermem.BytesIOSequence([]byte(c.str)) + if n, err := tm.DeprecatedPwritev(ctx, src, 0); n != int64(len(c.str)) || err != nil { + t.Errorf("DeprecatedPwritev, case = %q: got (%d, %v), wanted (%d, nil)", c.str, n, err, len(c.str)) + } + + // Read the values from the stack and check them. + if s.TCPRecvBufSize != c.final { + t.Errorf("TCPRecvBufferSize, case = %q: got %v, wanted %v", c.str, s.TCPRecvBufSize, c.final) + } + } +} diff --git a/pkg/sentry/fs/proc/task.go b/pkg/sentry/fs/proc/task.go new file mode 100644 index 000000000..3e9a1e50e --- /dev/null +++ b/pkg/sentry/fs/proc/task.go @@ -0,0 +1,567 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + "io" + "sort" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/mm" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// getTaskMM returns t's MemoryManager. If getTaskMM succeeds, the MemoryManager's +// users count is incremented, and must be decremented by the caller when it is +// no longer in use. +func getTaskMM(t *kernel.Task) (*mm.MemoryManager, error) { + if t.ExitState() == kernel.TaskExitDead { + return nil, syserror.ESRCH + } + var m *mm.MemoryManager + t.WithMuLocked(func(t *kernel.Task) { + m = t.MemoryManager() + }) + if m == nil || !m.IncUsers() { + return nil, io.EOF + } + return m, nil +} + +// taskDir represents a task-level directory. +type taskDir struct { + ramfs.Dir + + // t is the associated kernel task that owns this file. + t *kernel.Task +} + +// newTaskDir creates a new proc task entry. +func newTaskDir(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace, showSubtasks bool) *fs.Inode { + d := &taskDir{t: t} + // TODO: Set EUID/EGID based on dumpability. + d.InitDir(t, map[string]*fs.Inode{ + "auxv": newAuxvec(t, msrc), + "cmdline": newExecArgFile(t, msrc, cmdlineExecArg), + "comm": newComm(t, msrc), + "environ": newExecArgFile(t, msrc, environExecArg), + "exe": newExe(t, msrc), + "fd": newFdDir(t, msrc), + "fdinfo": newFdInfoDir(t, msrc), + "gid_map": newGIDMap(t, msrc), + // TODO: This is incorrect for /proc/[pid]/task/[tid]/io, i.e. 
if + // showSubtasks is false: + // http://lxr.free-electrons.com/source/fs/proc/base.c?v=3.11#L2980 + "io": newIO(t, msrc), + "maps": newMaps(t, msrc), + "mountinfo": seqfile.NewSeqFileInode(t, &mountInfoFile{t: t}, msrc), + "mounts": seqfile.NewSeqFileInode(t, &mountsFile{t: t}, msrc), + "ns": newNamespaceDir(t, msrc), + "stat": newTaskStat(t, msrc, showSubtasks, pidns), + "status": newStatus(t, msrc, pidns), + "uid_map": newUIDMap(t, msrc), + }, fs.RootOwner, fs.FilePermsFromMode(0555)) + if showSubtasks { + d.AddChild(t, "task", newSubtasks(t, msrc, pidns)) + } + return newFile(d, msrc, fs.SpecialDirectory, t) +} + +// subtasks represents a /proc/TID/task directory. +type subtasks struct { + ramfs.Dir + + t *kernel.Task + + pidns *kernel.PIDNamespace +} + +func newSubtasks(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { + s := &subtasks{t: t, pidns: pidns} + s.InitDir(t, nil, fs.RootOwner, fs.FilePermsFromMode(0555)) + return newFile(s, msrc, fs.SpecialDirectory, t) +} + +// UnstableAttr returns unstable attributes of the subtasks. +func (s *subtasks) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, err := s.Dir.UnstableAttr(ctx, inode) + if err != nil { + return fs.UnstableAttr{}, err + } + // We can't rely on ramfs' implementation because the task directories are + // generated dynamically. + uattr.Links = uint64(2 + s.t.ThreadGroup().Count()) + return uattr, nil +} + +// Lookup loads an Inode in a task's subtask directory into a Dirent. +func (s *subtasks) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + tid, err := strconv.ParseUint(p, 10, 32) + if err != nil { + return nil, syserror.ENOENT + } + + task := s.pidns.TaskWithID(kernel.ThreadID(tid)) + if task == nil { + return nil, syserror.ENOENT + } + if task.ThreadGroup() != s.t.ThreadGroup() { + return nil, syserror.ENOENT + } + + td := newTaskDir(task, dir.MountSource, s.pidns, false) + return fs.NewDirent(td, p), nil +} + +// DeprecatedReaddir lists a task's subtask directory. +func (s *subtasks) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + tasks := s.t.ThreadGroup().MemberIDs(s.pidns) + taskInts := make([]int, 0, len(tasks)) + for _, tid := range tasks { + taskInts = append(taskInts, int(tid)) + } + + // Find the task to start at. + idx := sort.SearchInts(taskInts, offset) + if idx == len(taskInts) { + return offset, nil + } + taskInts = taskInts[idx:] + + var tid int + for _, tid = range taskInts { + name := strconv.FormatUint(uint64(tid), 10) + attr := fs.GenericDentAttr(fs.SpecialDirectory, device.ProcDevice) + if err := dirCtx.DirEmit(name, attr); err != nil { + // Returned offset is next tid to serialize. + return tid, err + } + } + // We serialized them all. Next offset should be higher than last + // serialized tid. + return tid + 1, nil +} + +// exe is an fs.InodeOperations symlink for the /proc/PID/exe file. +type exe struct { + ramfs.Symlink + + t *kernel.Task +} + +func newExe(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + exeSymlink := &exe{t: t} + exeSymlink.InitSymlink(t, fs.RootOwner, "") + return newFile(exeSymlink, msrc, fs.Symlink, t) +} + +func (e *exe) executable() (d *fs.Dirent, err error) { + e.t.WithMuLocked(func(t *kernel.Task) { + mm := t.MemoryManager() + if mm == nil { + // TODO: Check shouldn't allow Readlink once the + // Task is zombied. 
+ err = syserror.EACCES + return + } + + // The MemoryManager may be destroyed, in which case + // MemoryManager.destroy will simply set the executable to nil + // (with locks held). + d = mm.Executable() + if d == nil { + err = syserror.ENOENT + } + }) + return +} + +// Readlink implements fs.InodeOperations. +func (e *exe) Readlink(ctx context.Context, inode *fs.Inode) (string, error) { + if !kernel.ContextCanTrace(ctx, e.t, false) { + return "", syserror.EACCES + } + + // Pull out the executable for /proc/TID/exe. + exec, err := e.executable() + if err != nil { + return "", err + } + defer exec.DecRef() + + root := fs.RootFromContext(ctx) + if root == nil { + // This doesn't correspond to anything in Linux because the vfs is + // global there. + return "", syserror.EINVAL + } + defer root.DecRef() + n, _ := exec.FullName(root) + return n, nil +} + +// namespaceFile represents a file in the namespacefs, such as the files in +// /proc/<pid>/ns. +type namespaceFile struct { + ramfs.Symlink + + t *kernel.Task +} + +func newNamespaceFile(t *kernel.Task, msrc *fs.MountSource, name string) *fs.Inode { + n := &namespaceFile{t: t} + n.InitSymlink(t, fs.RootOwner, "") + + // TODO: Namespace symlinks should contain the namespace name and the + // inode number for the namespace instance, so for example user:[123456]. We + // currently fake the inode number by sticking the symlink inode in its + // place. + n.Target = fmt.Sprintf("%s:[%d]", name, device.ProcDevice.NextIno()) + + return newFile(n, msrc, fs.Symlink, t) +} + +// Getlink implements fs.InodeOperations.Getlink. +func (n *namespaceFile) Getlink(ctx context.Context, inode *fs.Inode) (*fs.Dirent, error) { + if !kernel.ContextCanTrace(ctx, n.t, false) { + return nil, syserror.EACCES + } + + // Create a new regular file to fake the namespace file. + node := &ramfs.Entry{} + node.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0777)) + sattr := fs.StableAttr{ + DeviceID: device.ProcDevice.DeviceID(), + InodeID: device.ProcDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.RegularFile, + } + return fs.NewDirent(fs.NewInode(node, inode.MountSource, sattr), n.Symlink.Target), nil +} + +func newNamespaceDir(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + d := &ramfs.Dir{} + d.InitDir(t, map[string]*fs.Inode{ + "net": newNamespaceFile(t, msrc, "net"), + "pid": newNamespaceFile(t, msrc, "pid"), + "user": newNamespaceFile(t, msrc, "user"), + }, fs.RootOwner, fs.FilePermsFromMode(0511)) + return newFile(d, msrc, fs.SpecialDirectory, t) +} + +// mapsData implements seqfile.SeqSource for /proc/[pid]/maps. +type mapsData struct { + t *kernel.Task +} + +func newMaps(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &mapsData{t}), msrc, fs.SpecialFile, t) +} + +func (md *mapsData) mm() *mm.MemoryManager { + var tmm *mm.MemoryManager + md.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + // No additional reference is taken on mm here. This is safe + // because MemoryManager.destroy is required to leave the + // MemoryManager in a state where it's still usable as a SeqSource. + tmm = mm + } + }) + return tmm +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (md *mapsData) NeedsUpdate(generation int64) bool { + if mm := md.mm(); mm != nil { + return mm.NeedsUpdate(generation) + } + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
+func (md *mapsData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if mm := md.mm(); mm != nil { + return mm.ReadSeqFileData(md.t.AsyncContext(), h) + } + return []seqfile.SeqData{}, 0 +} + +type taskStatData struct { + t *kernel.Task + + // If tgstats is true, accumulate fault stats (not implemented) and CPU + // time across all tasks in t's thread group. + tgstats bool + + // pidns is the PID namespace associated with the proc filesystem that + // includes the file using this statData. + pidns *kernel.PIDNamespace +} + +func newTaskStat(t *kernel.Task, msrc *fs.MountSource, showSubtasks bool, pidns *kernel.PIDNamespace) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &taskStatData{t, showSubtasks /* tgstats */, pidns}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate returns whether the generation is old or not. +func (s *taskStatData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. +func (s *taskStatData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfTask(s.t)) + fmt.Fprintf(&buf, "(%s) ", s.t.Name()) + fmt.Fprintf(&buf, "%c ", s.t.StateStatus()[0]) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(&buf, "%d ", ppid) + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup())) + fmt.Fprintf(&buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session())) + fmt.Fprintf(&buf, "0 0 " /* tty_nr tpgid */) + fmt.Fprintf(&buf, "0 " /* flags */) + fmt.Fprintf(&buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */) + var cputime usage.CPUStats + if s.tgstats { + cputime = s.t.ThreadGroup().CPUStats() + } else { + cputime = s.t.CPUStats() + } + fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + cputime = s.t.ThreadGroup().JoinedChildCPUStats() + fmt.Fprintf(&buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime)) + fmt.Fprintf(&buf, "%d %d ", s.t.Priority(), s.t.Niceness()) + fmt.Fprintf(&buf, "%d ", s.t.ThreadGroup().Count()) + fmt.Fprintf(&buf, "0 0 " /* itrealvalue starttime */) + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(&buf, "%d %d ", vss, rss/usermem.PageSize) + fmt.Fprintf(&buf, "0 0 0 0 0 0 " /* rsslim startcode endcode startstack kstkesp kstkeip */) + fmt.Fprintf(&buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */) + fmt.Fprintf(&buf, "0 0 " /* nswap cnswap */) + terminationSignal := linux.Signal(0) + if s.t == s.t.ThreadGroup().Leader() { + terminationSignal = s.t.ThreadGroup().TerminationSignal() + } + fmt.Fprintf(&buf, "%d ", terminationSignal) + fmt.Fprintf(&buf, "0 0 0 " /* processor rt_priority policy */) + fmt.Fprintf(&buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */) + fmt.Fprintf(&buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */) + fmt.Fprintf(&buf, "0\n" /* exit_code */) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*taskStatData)(nil)}}, 0 +} + +// statusData implements seqfile.SeqSource for /proc/[pid]/status. 
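The taskStatData writer above emits the space-separated fields of /proc/[pid]/stat in Linux's order, with most of them hard-coded to zero; only the identifiers, CPU times, priority/niceness, thread count, memory sizes and termination signal carry real values. A small consumer-side sketch pulling a few of those fields back out of such a line (parseStat is an illustrative helper, and the sample line is abbreviated):

package main

import (
	"fmt"
	"strings"
)

// parseStat extracts a few fields from a /proc/[pid]/stat line. Field
// numbering follows proc(5): field 1 is the pid, field 2 the parenthesized
// comm, field 3 the state, field 4 the ppid, fields 14 and 15 utime and
// stime (in clock ticks).
func parseStat(line string) (pid, comm, state, ppid, utime, stime string) {
	// comm may itself contain spaces, so split around the parentheses first.
	open := strings.IndexByte(line, '(')
	end := strings.LastIndexByte(line, ')')
	pid = strings.TrimSpace(line[:open])
	comm = line[open+1 : end]
	rest := strings.Fields(line[end+1:])
	// rest[k] holds field k+3, so fields 14 and 15 are rest[11] and rest[12].
	state, ppid = rest[0], rest[1]
	utime, stime = rest[11], rest[12]
	return
}

func main() {
	// Abbreviated sample; a real line has more trailing zero fields.
	line := "42 (sleep) S 1 42 42 0 0 0 0 0 0 0 3 1 0 0 20 0 1 0 0 4096 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"
	pid, comm, state, ppid, utime, stime := parseStat(line)
	fmt.Println(pid, comm, state, ppid, utime, stime) // 42 sleep S 1 3 1
}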
+type statusData struct { + t *kernel.Task + pidns *kernel.PIDNamespace +} + +func newStatus(t *kernel.Task, msrc *fs.MountSource, pidns *kernel.PIDNamespace) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &statusData{t, pidns}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (s *statusData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (s *statusData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + var buf bytes.Buffer + fmt.Fprintf(&buf, "Name:\t%s\n", s.t.Name()) + fmt.Fprintf(&buf, "State:\t%s\n", s.t.StateStatus()) + fmt.Fprintf(&buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup())) + fmt.Fprintf(&buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t)) + ppid := kernel.ThreadID(0) + if parent := s.t.Parent(); parent != nil { + ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup()) + } + fmt.Fprintf(&buf, "PPid:\t%d\n", ppid) + tpid := kernel.ThreadID(0) + if tracer := s.t.Tracer(); tracer != nil { + tpid = s.pidns.IDOfTask(tracer) + } + fmt.Fprintf(&buf, "TracerPid:\t%d\n", tpid) + var fds int + var vss, rss uint64 + s.t.WithMuLocked(func(t *kernel.Task) { + if fdm := t.FDMap(); fdm != nil { + fds = fdm.Size() + } + if mm := t.MemoryManager(); mm != nil { + vss = mm.VirtualMemorySize() + rss = mm.ResidentSetSize() + } + }) + fmt.Fprintf(&buf, "FDSize:\t%d\n", fds) + fmt.Fprintf(&buf, "VmSize:\t%d kB\n", vss>>10) + fmt.Fprintf(&buf, "VmRSS:\t%d kB\n", rss>>10) + fmt.Fprintf(&buf, "Threads:\t%d\n", s.t.ThreadGroup().Count()) + creds := s.t.Credentials() + fmt.Fprintf(&buf, "CapInh:\t%016x\n", creds.InheritableCaps) + fmt.Fprintf(&buf, "CapPrm:\t%016x\n", creds.PermittedCaps) + fmt.Fprintf(&buf, "CapEff:\t%016x\n", creds.EffectiveCaps) + fmt.Fprintf(&buf, "CapBnd:\t%016x\n", creds.BoundingCaps) + fmt.Fprintf(&buf, "Seccomp:\t%d\n", s.t.SeccompMode()) + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*statusData)(nil)}}, 0 +} + +// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider. +type ioUsage interface { + // IOUsage returns the io usage data. + IOUsage() *usage.IO +} + +type ioData struct { + ioUsage +} + +func newIO(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newFile(seqfile.NewSeqFile(t, &ioData{t.ThreadGroup()}), msrc, fs.SpecialFile, t) +} + +// NeedsUpdate returns whether the generation is old or not. +func (i *ioData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData returns data for the SeqFile reader. +// SeqData, the current generation and where in the file the handle corresponds to. +func (i *ioData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + io := usage.IO{} + io.Accumulate(i.IOUsage()) + + var buf bytes.Buffer + fmt.Fprintf(&buf, "char: %d\n", io.CharsRead) + fmt.Fprintf(&buf, "wchar: %d\n", io.CharsWritten) + fmt.Fprintf(&buf, "syscr: %d\n", io.ReadSyscalls) + fmt.Fprintf(&buf, "syscw: %d\n", io.WriteSyscalls) + fmt.Fprintf(&buf, "read_bytes: %d\n", io.BytesRead) + fmt.Fprintf(&buf, "write_bytes: %d\n", io.BytesWritten) + fmt.Fprintf(&buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled) + + return []seqfile.SeqData{{Buf: buf.Bytes(), Handle: (*ioData)(nil)}}, 0 +} + +// comm is a file containing the command name for a task. +// +// On Linux, /proc/[pid]/comm is writable, and writing to the comm file changes +// the thread name. 
We don't implement this yet as there are no known users of +// this feature. +type comm struct { + ramfs.Entry + + t *kernel.Task +} + +// newComm returns a new comm file. +func newComm(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + c := &comm{t: t} + c.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(c, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the current command name. +func (c *comm) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + buf := []byte(c.t.Name() + "\n") + if offset >= int64(len(buf)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, buf[offset:]) + return int64(n), err +} + +// auxvec is a file containing the auxiliary vector for a task. +type auxvec struct { + ramfs.Entry + + t *kernel.Task +} + +// newAuxvec returns a new auxvec file. +func newAuxvec(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + a := &auxvec{t: t} + a.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0400)) + return newFile(a, msrc, fs.SpecialFile, t) +} + +// DeprecatedPreadv reads the current auxiliary vector. +func (a *auxvec) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + m, err := getTaskMM(a.t) + if err != nil { + return 0, err + } + defer m.DecUsers(ctx) + auxv := m.Auxv() + + // Space for buffer with AT_NULL (0) terminator at the end. + size := (len(auxv) + 1) * 16 + if offset >= int64(size) { + return 0, io.EOF + } + + buf := make([]byte, size) + for i, e := range auxv { + usermem.ByteOrder.PutUint64(buf[16*i:], e.Key) + usermem.ByteOrder.PutUint64(buf[16*i+8:], uint64(e.Value)) + } + + n, err := dst.CopyOut(ctx, buf[offset:]) + return int64(n), err +} diff --git a/pkg/sentry/fs/proc/uid_gid_map.go b/pkg/sentry/fs/proc/uid_gid_map.go new file mode 100644 index 000000000..a2a070bdd --- /dev/null +++ b/pkg/sentry/fs/proc/uid_gid_map.go @@ -0,0 +1,152 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "bytes" + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// An idMapSeqSource is a seqfile.SeqSource that returns UID or GID mappings +// from a task's user namespace. +type idMapSeqSource struct { + t *kernel.Task + gids bool +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (imss *idMapSeqSource) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. 
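The auxvec node above serializes each auxiliary-vector entry as two native-endian 64-bit words (key, then value), leaving a zeroed AT_NULL pair at the end of the buffer. A hedged decoder sketch for that 16-byte-per-entry layout; decodeAuxv is an illustrative helper, and little-endian order stands in for usermem.ByteOrder on x86-64:

package main

import (
	"encoding/binary"
	"fmt"
)

// auxvEntry mirrors one /proc/[pid]/auxv record: a 64-bit key followed by a
// 64-bit value.
type auxvEntry struct {
	Key, Value uint64
}

// decodeAuxv walks 16-byte records until it hits the zero (AT_NULL) key that
// terminates the vector.
func decodeAuxv(buf []byte, order binary.ByteOrder) []auxvEntry {
	var out []auxvEntry
	for len(buf) >= 16 {
		key := order.Uint64(buf)
		if key == 0 { // AT_NULL terminator
			break
		}
		out = append(out, auxvEntry{Key: key, Value: order.Uint64(buf[8:])})
		buf = buf[16:]
	}
	return out
}

func main() {
	order := binary.LittleEndian // stand-in for usermem.ByteOrder on x86-64

	// Encode two entries plus the AT_NULL terminator, the same layout the
	// proc file produces.
	entries := []auxvEntry{{Key: 6, Value: 4096}, {Key: 11, Value: 1000}} // AT_PAGESZ, AT_UID
	buf := make([]byte, (len(entries)+1)*16)
	for i, e := range entries {
		order.PutUint64(buf[16*i:], e.Key)
		order.PutUint64(buf[16*i+8:], e.Value)
	}

	fmt.Println(decodeAuxv(buf, order)) // [{6 4096} {11 1000}]
}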
+func (imss *idMapSeqSource) ReadSeqFileData(handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + var start int + if handle != nil { + start = handle.(*idMapSeqHandle).value + } + var entries []auth.IDMapEntry + if imss.gids { + entries = imss.t.UserNamespace().GIDMap() + } else { + entries = imss.t.UserNamespace().UIDMap() + } + var data []seqfile.SeqData + i := 1 + for _, e := range entries { + if i > start { + data = append(data, seqfile.SeqData{ + Buf: idMapLineFromEntry(e), + Handle: &idMapSeqHandle{i}, + }) + } + i++ + } + return data, 0 +} + +// TODO: Fix issue requiring idMapSeqHandle wrapping an int. +type idMapSeqHandle struct { + value int +} + +type idMapSeqFile struct { + seqfile.SeqFile +} + +// newUIDMap returns a new uid_map file. +func newUIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(t, msrc, false /* gids */) +} + +// newGIDMap returns a new gid_map file. +func newGIDMap(t *kernel.Task, msrc *fs.MountSource) *fs.Inode { + return newIDMap(t, msrc, true /* gids */) +} + +func newIDMap(t *kernel.Task, msrc *fs.MountSource, gids bool) *fs.Inode { + imsf := &idMapSeqFile{seqfile.SeqFile{SeqSource: &idMapSeqSource{ + t: t, + gids: gids, + }}} + imsf.InitEntry(t, fs.RootOwner, fs.FilePermsFromMode(0644)) + return newFile(imsf, msrc, fs.SpecialFile, t) +} + +func (imsf *idMapSeqFile) source() *idMapSeqSource { + return imsf.SeqFile.SeqSource.(*idMapSeqSource) +} + +// "There is an (arbitrary) limit on the number of lines in the file. As at +// Linux 3.18, the limit is five lines." - user_namespaces(7) +const maxIDMapLines = 5 + +// DeprecatedPwritev implements fs.InodeOperations.DeprecatedPwritev. +func (imsf *idMapSeqFile) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // "In addition, the number of bytes written to the file must be less than + // the system page size, and the write must be performed at the start of + // the file ..." - user_namespaces(7) + srclen := src.NumBytes() + if srclen >= usermem.PageSize || offset != 0 { + return 0, syserror.EINVAL + } + b := make([]byte, srclen) + if _, err := src.CopyIn(ctx, b); err != nil { + return 0, err + } + lines := bytes.SplitN(bytes.TrimSpace(b), []byte("\n"), maxIDMapLines+1) + if len(lines) > maxIDMapLines { + return 0, syserror.EINVAL + } + entries := make([]auth.IDMapEntry, len(lines)) + for i, l := range lines { + e, err := idMapEntryFromLine(string(l)) + if err != nil { + return 0, syserror.EINVAL + } + entries[i] = e + } + t := imsf.source().t + var err error + if imsf.source().gids { + err = t.UserNamespace().SetGIDMap(ctx, entries) + } else { + err = t.UserNamespace().SetUIDMap(ctx, entries) + } + if err != nil { + return 0, err + } + return int64(len(b)), nil +} + +func idMapLineFromEntry(e auth.IDMapEntry) []byte { + var b bytes.Buffer + fmt.Fprintf(&b, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length) + return b.Bytes() +} + +func idMapEntryFromLine(line string) (auth.IDMapEntry, error) { + var e auth.IDMapEntry + _, err := fmt.Sscan(line, &e.FirstID, &e.FirstParentID, &e.Length) + return e, err +} diff --git a/pkg/sentry/fs/proc/uptime.go b/pkg/sentry/fs/proc/uptime.go new file mode 100644 index 000000000..4679d5821 --- /dev/null +++ b/pkg/sentry/fs/proc/uptime.go @@ -0,0 +1,61 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
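Writes to uid_map and gid_map above must start at offset 0, fit within a page, and contain at most five lines, each an "ID parent-ID length" triple; reads render the same triples with fixed-width padding. A standalone sketch of the line-format round trip, with plain uint32 fields standing in for the auth types:

```go
package main

import (
	"bytes"
	"fmt"
)

func main() {
	// Format one mapping the way idMapLineFromEntry does.
	var b bytes.Buffer
	fmt.Fprintf(&b, "%10d %10d %10d\n", 0, 1000, 1)

	// Parse it back the way idMapEntryFromLine does; Sscan treats the
	// padding and trailing newline as ordinary whitespace.
	var id, parent, length uint32
	if _, err := fmt.Sscan(b.String(), &id, &parent, &length); err != nil {
		panic(err)
	}
	fmt.Println(id, parent, length) // 0 1000 1
}
```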
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + "io" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// uptime is a file containing the system uptime. +type uptime struct { + ramfs.Entry + + // The "start time" of the sandbox. + startTime ktime.Time +} + +// newUptime returns a new uptime file. +func (p *proc) newUptime(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + u := &uptime{ + startTime: ktime.NowFromContext(ctx), + } + u.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0444)) + return newFile(u, msrc, fs.SpecialFile, nil) +} + +// DeprecatedPreadv reads the current uptime. +func (u *uptime) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + if offset < 0 { + return 0, syserror.EINVAL + } + + now := ktime.NowFromContext(ctx) + // Pretend that we've spent zero time sleeping (second number). + s := []byte(fmt.Sprintf("%.2f 0.00\n", now.Sub(u.startTime).Seconds())) + if offset >= int64(len(s)) { + return 0, io.EOF + } + + n, err := dst.CopyOut(ctx, s[offset:]) + return int64(n), err +} diff --git a/pkg/sentry/fs/proc/version.go b/pkg/sentry/fs/proc/version.go new file mode 100644 index 000000000..df3040d37 --- /dev/null +++ b/pkg/sentry/fs/proc/version.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package proc + +import ( + "fmt" + + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc/seqfile" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel" +) + +// versionData backs /proc/version. +type versionData struct { + // k is the owning Kernel. + k *kernel.Kernel +} + +// NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. +func (*versionData) NeedsUpdate(generation int64) bool { + return true +} + +// ReadSeqFileData implements seqfile.SeqSource.ReadSeqFileData. +func (v *versionData) ReadSeqFileData(h seqfile.SeqHandle) ([]seqfile.SeqData, int64) { + if h != nil { + return nil, 0 + } + + init := v.k.GlobalInit() + if init == nil { + // Attempted to read before the init Task is created. This can + // only occur during startup, which should never need to read + // this file. 
+ panic("Attempted to read version before initial Task is available") + } + + // /proc/version takes the form: + // + // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) + // (COMPILER_VERSION) VERSION" + // + // where: + // - SYSNAME, RELEASE, and VERSION are the same as returned by + // sys_utsname + // - COMPILE_USER is the user that build the kernel + // - COMPILE_HOST is the hostname of the machine on which the kernel + // was built + // - COMPILER_VERSION is the version reported by the building compiler + // + // Since we don't really want to expose build information to + // applications, those fields are omitted. + // + // FIXME: Using Version from the init task SyscallTable + // disregards the different version a task may have (e.g., in a uts + // namespace). + ver := init.Leader().SyscallTable().Version + return []seqfile.SeqData{ + { + Buf: []byte(fmt.Sprintf("%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)), + Handle: (*versionData)(nil), + }, + }, 0 +} diff --git a/pkg/sentry/fs/ramfs/BUILD b/pkg/sentry/fs/ramfs/BUILD new file mode 100644 index 000000000..663a1aeb9 --- /dev/null +++ b/pkg/sentry/fs/ramfs/BUILD @@ -0,0 +1,62 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "ramfs_state", + srcs = [ + "dir.go", + "file.go", + "ramfs.go", + "socket.go", + "symlink.go", + ], + out = "ramfs_state.go", + package = "ramfs", +) + +go_library( + name = "ramfs", + srcs = [ + "dir.go", + "file.go", + "ramfs.go", + "ramfs_state.go", + "socket.go", + "symlink.go", + "tree.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/amutex", + "//pkg/log", + "//pkg/refs", + "//pkg/secio", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/time", + "//pkg/sentry/memmap", + "//pkg/sentry/safemem", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "ramfs_test", + size = "small", + srcs = ["tree_test.go"], + embed = [":ramfs"], + deps = [ + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + ], +) diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go new file mode 100644 index 000000000..bf4cd8dfd --- /dev/null +++ b/pkg/sentry/fs/ramfs/dir.go @@ -0,0 +1,364 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// CreateOps represents operations to create different file types. 
+type CreateOps struct { + // NewDir creates a new directory. + NewDir func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) + + // NewFile creates a new file. + NewFile func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) + + // NewSymlink creates a new symlink with permissions 0777. + NewSymlink func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) + + // NewBoundEndpoint creates a new socket. + NewBoundEndpoint func(ctx context.Context, dir *fs.Inode, ep unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) + + // NewFifo creates a new fifo. + NewFifo func(ctx context.Context, dir *fs.Inode, perm fs.FilePermissions) (*fs.Inode, error) +} + +// Dir represents a single directory in the filesystem. +type Dir struct { + Entry + + // CreateOps may be provided. + // + // These may only be modified during initialization (while the application + // is not running). No sychronization is performed when accessing these + // operations during syscalls. + *CreateOps `state:"nosave"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // children are inodes that are in this directory. A reference is held + // on each inode while it is in the map. + children map[string]*fs.Inode + + // dentryMap is a sortedDentryMap containing entries for all children. + // Its entries ar kept up-to-date with d.children. + dentryMap *fs.SortedDentryMap +} + +// InitDir initializes a directory. +func (d *Dir) InitDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) { + d.InitEntry(ctx, owner, perms) + if contents == nil { + contents = make(map[string]*fs.Inode) + } + d.children = contents + // Build the entries map ourselves, rather than calling addChildLocked, + // because it will be faster. + entries := make(map[string]fs.DentAttr, len(contents)) + for name, inode := range contents { + entries[name] = fs.DentAttr{ + Type: inode.StableAttr.Type, + InodeID: inode.StableAttr.InodeID, + } + } + d.dentryMap = fs.NewSortedDentryMap(entries) + + // Directories have an extra link, corresponding to '.'. + d.AddLink() +} + +// addChildLocked add the child inode, inheriting its reference. +func (d *Dir) addChildLocked(name string, inode *fs.Inode) { + d.children[name] = inode + d.dentryMap.Add(name, fs.DentAttr{ + Type: inode.StableAttr.Type, + InodeID: inode.StableAttr.InodeID, + }) + + // If the child is a directory, increment this dir's link count, + // corresponding to '..' from the subdirectory. + if fs.IsDir(inode.StableAttr) { + d.AddLink() + } + + // Given we're now adding this inode to the directory we must also + // increase its link count. Similiarly we decremented it in removeChildLocked. + inode.AddLink() +} + +// AddChild adds a child to this dir. +func (d *Dir) AddChild(ctx context.Context, name string, inode *fs.Inode) { + d.mu.Lock() + defer d.mu.Unlock() + d.addChildLocked(name, inode) +} + +// FindChild returns (child, true) if the directory contains name. +func (d *Dir) FindChild(name string) (*fs.Inode, bool) { + d.mu.Lock() + defer d.mu.Unlock() + child, ok := d.children[name] + return child, ok +} + +// removeChildLocked attempts to remove an entry from this directory. +// This Entry's mutex must be held. It returns the removed Inode. 
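CreateOps lets a ramfs-backed directory choose how new nodes are materialized; any hook left nil makes the corresponding operation fail with ErrDenied (see the Create* methods further below). A hedged sketch of wiring NewFile so that such a directory can create plain in-memory files, assuming imports of the fs, ramfs, anon, and usermem packages and following the StableAttr pattern used by tree.go later in this package; newWritableDir is a hypothetical helper, not part of this change:

```go
// newWritableDir builds a ramfs.Dir whose CreateOps can materialize small
// in-memory regular files.
func newWritableDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode {
	d := &ramfs.Dir{}
	d.InitDir(ctx, nil, fs.RootOwner, fs.FilePermsFromMode(0755))
	d.CreateOps = &ramfs.CreateOps{
		NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) {
			f := &ramfs.File{}
			f.InitFile(ctx, fs.RootOwner, perms)
			return fs.NewInode(f, msrc, fs.StableAttr{
				DeviceID:  anon.PseudoDevice.DeviceID(),
				InodeID:   anon.PseudoDevice.NextIno(),
				BlockSize: usermem.PageSize,
				Type:      fs.RegularFile,
			}), nil
		},
	}
	return fs.NewInode(d, msrc, fs.StableAttr{
		DeviceID:  anon.PseudoDevice.DeviceID(),
		InodeID:   anon.PseudoDevice.NextIno(),
		BlockSize: usermem.PageSize,
		Type:      fs.Directory,
	})
}
```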
+func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) { + inode, ok := d.children[name] + if !ok { + return nil, ErrNotFound + } + + delete(d.children, name) + d.dentryMap.Remove(name) + d.Entry.NotifyModification(ctx) + + // If the child was a subdirectory, then we must decrement this dir's + // link count which was the child's ".." directory entry. + if fs.IsDir(inode.StableAttr) { + d.DropLink() + } + + // Update ctime. + inode.NotifyStatusChange(ctx) + + // Given we're now removing this inode to the directory we must also + // decrease its link count. Similiarly it is increased in addChildLocked. + inode.DropLink() + + return inode, nil +} + +// RemoveEntry attempts to remove an entry from this directory. +func (d *Dir) RemoveEntry(ctx context.Context, name string) error { + d.mu.Lock() + defer d.mu.Unlock() + inode, err := d.removeChildLocked(ctx, name) + if err != nil { + return err + } + + // Remove our reference on the inode. + inode.DecRef() + return nil +} + +// Remove removes the named non-directory. +func (d *Dir) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return d.RemoveEntry(ctx, name) +} + +// RemoveDirectory removes the named directory. +func (d *Dir) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + d.mu.Lock() + defer d.mu.Unlock() + + n, err := d.walkLocked(ctx, name) + if err != nil { + return err + } + dirCtx := &fs.DirCtx{} + if _, err := n.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0); err != nil { + return err + } + if len(dirCtx.DentAttrs()) > 0 { + return ErrNotEmpty + } + inode, err := d.removeChildLocked(ctx, name) + if err != nil { + return err + } + + // Remove our reference on the inode. + inode.DecRef() + + return err +} + +// Lookup loads an inode at p into a Dirent. +func (d *Dir) Lookup(ctx context.Context, dir *fs.Inode, p string) (*fs.Dirent, error) { + d.mu.Lock() + defer d.mu.Unlock() + + inode, err := d.walkLocked(ctx, p) + if err != nil { + return nil, err + } + + // Take a reference on the inode before returning it. This reference + // is owned by the dirent we are about to create. + inode.IncRef() + return fs.NewDirent(inode, p), nil +} + +// walkLocked must be called with this Entry's mutex held. +func (d *Dir) walkLocked(ctx context.Context, p string) (*fs.Inode, error) { + d.Entry.NotifyAccess(ctx) + + // Lookup a child node. + if inode, ok := d.children[p]; ok { + return inode, nil + } + + // fs.InodeOperations.Lookup returns syserror.ENOENT if p + // does not exist. + return nil, syserror.ENOENT +} + +// createInodeOperationsCommon creates a new child node at this dir by calling +// makeInodeOperations. It is the common logic for creating a new child. +func (d *Dir) createInodeOperationsCommon(ctx context.Context, name string, makeInodeOperations func() (*fs.Inode, error)) (*fs.Inode, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if _, ok := d.children[name]; ok { + return nil, syscall.EEXIST + } + + inode, err := makeInodeOperations() + if err != nil { + return nil, err + } + + d.addChildLocked(name, inode) + d.Entry.NotifyModification(ctx) + + return inode, nil +} + +// Create creates a new Inode with the given name and returns its File. 
+func (d *Dir) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perms fs.FilePermissions) (*fs.File, error) { + if d.CreateOps == nil || d.CreateOps.NewFile == nil { + return nil, ErrDenied + } + + inode, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewFile(ctx, dir, perms) + }) + if err != nil { + return nil, err + } + + // Take an extra ref on inode, which will be owned by the dirent. + inode.IncRef() + + // Create the Dirent and corresponding file. + created := fs.NewDirent(inode, name) + defer created.DecRef() + return created.Inode.GetFile(ctx, created, flags) +} + +// CreateLink returns a new link. +func (d *Dir) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { + if d.CreateOps == nil || d.CreateOps.NewSymlink == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, newname, func() (*fs.Inode, error) { + return d.NewSymlink(ctx, dir, oldname) + }) + return err +} + +// CreateHardLink creates a new hard link. +func (d *Dir) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + d.mu.Lock() + defer d.mu.Unlock() + + // Take an extra reference on the inode and add it to our children. + target.IncRef() + + // The link count will be incremented in addChildLocked. + d.addChildLocked(name, target) + d.Entry.NotifyModification(ctx) + + // Update ctime. + target.NotifyStatusChange(ctx) + + return nil +} + +// CreateDirectory returns a new subdirectory. +func (d *Dir) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewDir == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewDir(ctx, dir, perms) + }) + // TODO: Support updating status times, as those should be + // updated by links. + return err +} + +// Bind implements fs.InodeOperations.Bind. +func (d *Dir) Bind(ctx context.Context, dir *fs.Inode, name string, ep unix.BoundEndpoint, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewBoundEndpoint == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewBoundEndpoint(ctx, dir, ep, perms) + }) + if err == syscall.EEXIST { + return syscall.EADDRINUSE + } + return err +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +func (d *Dir) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perms fs.FilePermissions) error { + if d.CreateOps == nil || d.CreateOps.NewFifo == nil { + return ErrDenied + } + _, err := d.createInodeOperationsCommon(ctx, name, func() (*fs.Inode, error) { + return d.NewFifo(ctx, dir, perms) + }) + return err +} + +func (d *Dir) readdirLocked(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + // Serialize the entries in dentryMap. + n, err := fs.GenericReaddir(dirCtx, d.dentryMap) + + // Touch the access time. + d.Entry.NotifyAccess(ctx) + + return offset + n, err +} + +// DeprecatedReaddir emits the entries contained in this directory. 
+func (d *Dir) DeprecatedReaddir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + return d.readdirLocked(ctx, dirCtx, offset) +} + +// DeprecatedPreadv always returns ErrIsDirectory +func (*Dir) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrIsDirectory +} + +// DeprecatedPwritev always returns ErrIsDirectory +func (*Dir) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrIsDirectory +} diff --git a/pkg/sentry/fs/ramfs/file.go b/pkg/sentry/fs/ramfs/file.go new file mode 100644 index 000000000..e8363c3e2 --- /dev/null +++ b/pkg/sentry/fs/ramfs/file.go @@ -0,0 +1,148 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/secio" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// File represents a unique file. It uses a simple byte slice as storage, and +// thus should only be used for small files. +// +// A File is not mappable. +type File struct { + Entry + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // data tracks backing data for the file. + data []byte +} + +// InitFile initializes a file. +func (f *File) InitFile(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions) { + f.InitEntry(ctx, owner, perms) +} + +// UnstableAttr returns unstable attributes of this ramfs file. +func (f *File) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + f.mu.Lock() + defer f.mu.Unlock() + + uattr, _ := f.Entry.UnstableAttr(ctx, inode) + uattr.Size = int64(len(f.data)) + uattr.Usage = f.usageLocked() + + return uattr, nil +} + +// usageLocked returns the disk usage. Caller must hold f.mu. +func (f *File) usageLocked() int64 { + return int64(len(f.data)) +} + +// Append appends the given data. This is for internal use. +func (f *File) Append(data []byte) { + f.mu.Lock() + defer f.mu.Unlock() + f.data = append(f.data, data...) +} + +// Truncate truncates this node. +func (f *File) Truncate(ctx context.Context, inode *fs.Inode, l int64) error { + f.mu.Lock() + defer f.mu.Unlock() + if l < int64(len(f.data)) { + // Remove excess bytes. + f.data = f.data[:l] + return nil + } else if l > int64(len(f.data)) { + // Create a new slice with size l, and copy f.data into it. + d := make([]byte, l) + copy(d, f.data) + f.data = d + } + f.Entry.NotifyModification(ctx) + return nil +} + +// ReadAt implements io.ReaderAt. +func (f *File) ReadAt(data []byte, offset int64) (int, error) { + if offset < 0 { + return 0, ErrInvalidOp + } + if offset >= int64(len(f.data)) { + return 0, io.EOF + } + n := copy(data, f.data[offset:]) + // Did we read past the end? 
+ if offset+int64(len(data)) >= int64(len(f.data)) { + return n, io.EOF + } + return n, nil +} + +// DeprecatedPreadv reads into a collection of slices from a given offset. +func (f *File) DeprecatedPreadv(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + f.mu.Lock() + defer f.mu.Unlock() + if offset >= int64(len(f.data)) { + return 0, io.EOF + } + n, err := dst.CopyOut(ctx, f.data[offset:]) + if n > 0 { + f.Entry.NotifyAccess(ctx) + } + return int64(n), err +} + +// WriteAt implements io.WriterAt. +func (f *File) WriteAt(data []byte, offset int64) (int, error) { + if offset < 0 { + return 0, ErrInvalidOp + } + newLen := offset + int64(len(data)) + if newLen < 0 { + // Overflow. + return 0, syserror.EINVAL + } + if newLen > int64(len(f.data)) { + // Copy f.data into new slice with expanded length. + d := make([]byte, newLen) + copy(d, f.data) + f.data = d + } + return copy(f.data[offset:], data), nil +} + +// DeprecatedPwritev writes from a collection of slices at a given offset. +func (f *File) DeprecatedPwritev(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + f.mu.Lock() + defer f.mu.Unlock() + n, err := src.CopyInTo(ctx, safemem.FromIOWriter{secio.NewOffsetWriter(f, offset)}) + if n > 0 { + f.Entry.NotifyModification(ctx) + } + return n, err +} diff --git a/pkg/sentry/fs/ramfs/ramfs.go b/pkg/sentry/fs/ramfs/ramfs.go new file mode 100644 index 000000000..04f2d38de --- /dev/null +++ b/pkg/sentry/fs/ramfs/ramfs.go @@ -0,0 +1,433 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package ramfs implements an in-memory file system that can be associated with +// any device. +package ramfs + +import ( + "errors" + "sync" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +var ( + // ErrInvalidOp indicates the operation is not valid. + ErrInvalidOp = errors.New("invalid operation") + + // ErrDenied indicates the operation was denid. + ErrDenied = errors.New("operation denied") + + // ErrNotFound indicates that a node was not found on a walk. + ErrNotFound = errors.New("node not found") + + // ErrCrossDevice indicates a cross-device link or rename. + ErrCrossDevice = errors.New("can't link across filesystems") + + // ErrIsDirectory indicates that the operation failed because + // the node is a directory. + ErrIsDirectory = errors.New("is a directory") + + // ErrNotDirectory indicates that the operation failed because + // the node is a not directory. 
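ramfs.File keeps its entire contents in one byte slice, so it is only suitable for small files: ReadAt reports io.EOF as soon as a read reaches the end of the data, and WriteAt grows the slice when a write extends past it. A brief usage fragment; ctx is assumed to be any context.Context that carries a clock (for example one from contexttest), and the inode argument to Truncate is nil only because this implementation ignores it:

```go
f := &ramfs.File{}
f.InitFile(ctx, fs.RootOwner, fs.FilePermsFromMode(0644))
f.Append([]byte("hello"))

buf := make([]byte, 16)
n, err := f.ReadAt(buf, 0)       // n == 5, err == io.EOF: the read hit the end
_, _ = f.WriteAt([]byte("!"), 5) // grows the backing slice to 6 bytes: "hello!"
_ = f.Truncate(ctx, nil, 2)      // shrinks the data back to "he"
_, _ = n, err
```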
+ ErrNotDirectory = errors.New("not a directory") + + // ErrNotEmpty indicates that the operation failed because the + // directory is not empty. + ErrNotEmpty = errors.New("directory not empty") +) + +// Entry represents common internal state for file and directory nodes. +// This may be used by other packages to easily create ramfs files. +type Entry struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // unstable is unstable attributes. + unstable fs.UnstableAttr + + // xattrs are the extended attributes of the Entry. + xattrs map[string][]byte +} + +// InitEntry initializes an entry. +func (e *Entry) InitEntry(ctx context.Context, owner fs.FileOwner, p fs.FilePermissions) { + e.InitEntryWithAttr(ctx, fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + // Always start unlinked. + Links: 0, + })) +} + +// InitEntryWithAttr initializes an entry with a complete set of attributes. +func (e *Entry) InitEntryWithAttr(ctx context.Context, uattr fs.UnstableAttr) { + e.unstable = uattr + e.xattrs = make(map[string][]byte) +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (e *Entry) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable, nil +} + +// Check implements fs.InodeOperations.Check. +func (*Entry) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (e *Entry) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + e.mu.Lock() + defer e.mu.Unlock() + if value, ok := e.xattrs[name]; ok { + return value, nil + } + return nil, syserror.ENOATTR +} + +// Setxattr implements fs.InodeOperations.Setxattr. +func (e *Entry) Setxattr(inode *fs.Inode, name string, value []byte) error { + e.mu.Lock() + defer e.mu.Unlock() + e.xattrs[name] = value + return nil +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (e *Entry) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { + e.mu.Lock() + defer e.mu.Unlock() + names := make(map[string]struct{}, len(e.xattrs)) + for name := range e.xattrs { + names[name] = struct{}{} + } + return names, nil +} + +// GetFile returns a fs.File backed by the dirent argument and flags. +func (*Entry) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fsutil.NewHandle(ctx, d, flags, d.Inode.HandleOps()), nil +} + +// SetPermissions always sets the permissions. +func (e *Entry) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.Perms = p + e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner always sets ownership. +func (e *Entry) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + e.mu.Lock() + defer e.mu.Unlock() + if owner.UID.Ok() { + e.unstable.Owner.UID = owner.UID + } + if owner.GID.Ok() { + e.unstable.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps sets the timestamps. 
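Entry stores extended attributes in a plain in-memory map: Setxattr always succeeds, Getxattr returns ENOATTR for a missing name, and nothing persists beyond the lifetime of the node. A small usage fragment; the *fs.Inode argument is nil here only because these implementations never touch it, and ctx is assumed to provide a clock:

```go
var e ramfs.Entry
e.InitEntry(ctx, fs.RootOwner, fs.FilePermsFromMode(0644))

_ = e.Setxattr(nil, "user.example", []byte("value")) // never fails
v, _ := e.Getxattr(nil, "user.example")              // []byte("value")
_, err := e.Getxattr(nil, "user.other")              // err == syserror.ENOATTR
names, _ := e.Listxattr(nil)                         // contains "user.example"
_, _, _ = v, err, names
```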
+func (e *Entry) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + e.mu.Lock() + defer e.mu.Unlock() + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATimeSetSystemTime { + e.unstable.AccessTime = now + } else { + e.unstable.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTimeSetSystemTime { + e.unstable.ModificationTime = now + } else { + e.unstable.ModificationTime = ts.MTime + } + } + e.unstable.StatusChangeTime = now + return nil +} + +// NotifyStatusChange updates the status change time (ctime). +func (e *Entry) NotifyStatusChange(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// StatusChangeTime returns the last status change time for this node. +func (e *Entry) StatusChangeTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.StatusChangeTime +} + +// NotifyModification updates the modification time and the status change time. +func (e *Entry) NotifyModification(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + now := ktime.NowFromContext(ctx) + e.unstable.ModificationTime = now + e.unstable.StatusChangeTime = now +} + +// ModificationTime returns the last modification time for this node. +func (e *Entry) ModificationTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.ModificationTime +} + +// NotifyAccess updates the access time. +func (e *Entry) NotifyAccess(ctx context.Context) { + e.mu.Lock() + defer e.mu.Unlock() + now := ktime.NowFromContext(ctx) + e.unstable.AccessTime = now +} + +// AccessTime returns the last access time for this node. +func (e *Entry) AccessTime() ktime.Time { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.AccessTime +} + +// Permissions returns permissions on this entry. +func (e *Entry) Permissions() fs.FilePermissions { + e.mu.Lock() + defer e.mu.Unlock() + return e.unstable.Perms +} + +// Lookup is not supported by default. +func (*Entry) Lookup(context.Context, *fs.Inode, string) (*fs.Dirent, error) { + return nil, ErrInvalidOp +} + +// Create is not supported by default. +func (*Entry) Create(context.Context, *fs.Inode, string, fs.FileFlags, fs.FilePermissions) (*fs.File, error) { + return nil, ErrInvalidOp +} + +// CreateLink is not supported by default. +func (*Entry) CreateLink(context.Context, *fs.Inode, string, string) error { + return ErrInvalidOp +} + +// CreateHardLink is not supported by default. +func (*Entry) CreateHardLink(context.Context, *fs.Inode, *fs.Inode, string) error { + return ErrInvalidOp +} + +// IsVirtual returns true. +func (*Entry) IsVirtual() bool { + return true +} + +// CreateDirectory is not supported by default. +func (*Entry) CreateDirectory(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return ErrInvalidOp +} + +// Bind is not supported by default. +func (*Entry) Bind(context.Context, *fs.Inode, string, unix.BoundEndpoint, fs.FilePermissions) error { + return ErrInvalidOp +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. CreateFifo is not supported by +// default. +func (*Entry) CreateFifo(context.Context, *fs.Inode, string, fs.FilePermissions) error { + return ErrInvalidOp +} + +// Remove is not supported by default. +func (*Entry) Remove(context.Context, *fs.Inode, string) error { + return ErrInvalidOp +} + +// RemoveDirectory is not supported by default. 
+func (*Entry) RemoveDirectory(context.Context, *fs.Inode, string) error { + return ErrInvalidOp +} + +// StatFS always returns ENOSYS. +func (*Entry) StatFS(context.Context) (fs.Info, error) { + return fs.Info{}, syscall.ENOSYS +} + +// Rename implements fs.InodeOperations.Rename. +func (e *Entry) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName) +} + +// Rename renames from a *ramfs.Dir to another *ramfs.Dir. +func Rename(ctx context.Context, oldParent fs.InodeOperations, oldName string, newParent fs.InodeOperations, newName string) error { + op, ok := oldParent.(*Dir) + if !ok { + return ErrCrossDevice + } + np, ok := newParent.(*Dir) + if !ok { + return ErrCrossDevice + } + + np.mu.Lock() + defer np.mu.Unlock() + + // Check whether the ramfs entry to be replaced is a non-empty directory. + if replaced, ok := np.children[newName]; ok { + if fs.IsDir(replaced.StableAttr) { + // FIXME: simplify by pinning children of ramfs-backed directories + // in the Dirent tree: this allows us to generalize ramfs operations without + // relying on an implementation of Readdir (which may do anything, like require + // that the file be open ... which would be reasonable). + dirCtx := &fs.DirCtx{} + _, err := replaced.HandleOps().DeprecatedReaddir(ctx, dirCtx, 0) + if err != nil { + return err + } + attrs := dirCtx.DentAttrs() + + // ramfs-backed directories should not contain "." and "..", but we do this + // just in case. + delete(attrs, ".") + delete(attrs, "..") + + // If the directory to be replaced is not empty, reject the rename. + if len(attrs) != 0 { + return ErrNotEmpty + } + } + } + + // Be careful, we may have already grabbed this mutex above. + if op != np { + op.mu.Lock() + defer op.mu.Unlock() + } + + // Do the swap. + n := op.children[oldName] + op.removeChildLocked(ctx, oldName) + np.addChildLocked(newName, n) + + // Update ctime. + n.NotifyStatusChange(ctx) + + return nil +} + +// Truncate is not supported by default. +func (*Entry) Truncate(context.Context, *fs.Inode, int64) error { + return ErrInvalidOp +} + +// Readlink always returns ENOLINK. +func (*Entry) Readlink(context.Context, *fs.Inode) (string, error) { + return "", syscall.ENOLINK +} + +// Getlink always returns ENOLINK. +func (*Entry) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, syscall.ENOLINK +} + +// Release is a no-op. +func (e *Entry) Release(context.Context) {} + +// AddLink implements InodeOperationss.AddLink. +func (e *Entry) AddLink() { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.Links++ +} + +// DropLink implements InodeOperationss.DropLink. +func (e *Entry) DropLink() { + e.mu.Lock() + defer e.mu.Unlock() + e.unstable.Links-- +} + +// DeprecatedReaddir is not supported by default. +func (*Entry) DeprecatedReaddir(context.Context, *fs.DirCtx, int) (int, error) { + return 0, ErrNotDirectory +} + +// DeprecatedPreadv always returns ErrInvalidOp. +func (*Entry) DeprecatedPreadv(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrInvalidOp +} + +// DeprecatedPwritev always returns ErrInvalidOp. +func (*Entry) DeprecatedPwritev(context.Context, usermem.IOSequence, int64) (int64, error) { + return 0, ErrInvalidOp +} + +// DeprecatedFsync is a noop. +func (*Entry) DeprecatedFsync() error { + // Ignore, this is in memory. + return nil +} + +// DeprecatedFlush always returns nil. 
+func (*Entry) DeprecatedFlush() error { + return nil +} + +// DeprecatedMappable implements fs.InodeOperations.DeprecatedMappable. +func (*Entry) DeprecatedMappable(context.Context, *fs.Inode) (memmap.Mappable, bool) { + return nil, false +} + +func init() { + // Register ramfs errors. + syserror.AddErrorTranslation(ErrInvalidOp, syscall.EINVAL) + syserror.AddErrorTranslation(ErrDenied, syscall.EACCES) + syserror.AddErrorTranslation(ErrNotFound, syscall.ENOENT) + syserror.AddErrorTranslation(ErrCrossDevice, syscall.EXDEV) + syserror.AddErrorTranslation(ErrIsDirectory, syscall.EISDIR) + syserror.AddErrorTranslation(ErrNotDirectory, syscall.ENOTDIR) + syserror.AddErrorTranslation(ErrNotEmpty, syscall.ENOTEMPTY) +} diff --git a/pkg/sentry/fs/ramfs/socket.go b/pkg/sentry/fs/ramfs/socket.go new file mode 100644 index 000000000..b0c79325f --- /dev/null +++ b/pkg/sentry/fs/ramfs/socket.go @@ -0,0 +1,42 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +// Socket represents a socket. +type Socket struct { + Entry + + // ep is the bound endpoint. + ep unix.BoundEndpoint +} + +// InitSocket initializes a socket. +func (s *Socket) InitSocket(ctx context.Context, ep unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions) { + s.InitEntry(ctx, owner, perms) + s.ep = ep +} + +// BoundEndpoint returns the socket data. +func (s *Socket) BoundEndpoint(*fs.Inode, string) unix.BoundEndpoint { + // ramfs only supports stored sentry internal sockets. Only gofer sockets + // care about the path argument. + return s.ep +} diff --git a/pkg/sentry/fs/ramfs/symlink.go b/pkg/sentry/fs/ramfs/symlink.go new file mode 100644 index 000000000..9bbf78619 --- /dev/null +++ b/pkg/sentry/fs/ramfs/symlink.go @@ -0,0 +1,72 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// Symlink represents a symlink. +type Symlink struct { + Entry + + mu sync.Mutex `state:"nosave"` + + // Target is the symlink target. + Target string +} + +// InitSymlink initializes a symlink, pointing to the given target. +// A symlink is assumed to always have permissions 0777. 
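The init function above is how ramfs's sentinel errors become POSIX errnos: each package-level error is registered once with syserror, and the syscall layer translates it at the boundary. A hedged sketch of the same pattern for a hypothetical error in some other filesystem package:

```go
package somefs // hypothetical package, used only to illustrate the pattern

import (
	"errors"
	"syscall"

	"gvisor.googlesource.com/gvisor/pkg/syserror"
)

// ErrReadOnly is a package-level sentinel returned by write paths.
var ErrReadOnly = errors.New("filesystem is read-only")

func init() {
	// Map the sentinel to EROFS so callers see a conventional errno rather
	// than an opaque Go error.
	syserror.AddErrorTranslation(ErrReadOnly, syscall.EROFS)
}
```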
+func (s *Symlink) InitSymlink(ctx context.Context, owner fs.FileOwner, target string) { + s.InitEntry(ctx, owner, fs.FilePermsFromMode(0777)) + s.Target = target +} + +// UnstableAttr returns all attributes of this ramfs symlink. +func (s *Symlink) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + uattr, _ := s.Entry.UnstableAttr(ctx, inode) + uattr.Size = int64(len(s.Target)) + uattr.Usage = uattr.Size + return uattr, nil +} + +// Check implements InodeOperations.Check. +func (s *Symlink) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions on a symlink is always rejected. +func (s *Symlink) SetPermissions(context.Context, *fs.Inode, fs.FilePermissions) bool { + return false +} + +// Readlink reads the symlink value. +func (s *Symlink) Readlink(ctx context.Context, _ *fs.Inode) (string, error) { + s.mu.Lock() + defer s.mu.Unlock() + + s.Entry.NotifyAccess(ctx) + return s.Target, nil +} + +// Getlink returns ErrResolveViaReadlink, falling back to walking to the result +// of Readlink(). +func (*Symlink) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) { + return nil, fs.ErrResolveViaReadlink +} diff --git a/pkg/sentry/fs/ramfs/test/BUILD b/pkg/sentry/fs/ramfs/test/BUILD new file mode 100644 index 000000000..074b0f5ad --- /dev/null +++ b/pkg/sentry/fs/ramfs/test/BUILD @@ -0,0 +1,31 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "test_state", + srcs = [ + "test.go", + ], + out = "test_state.go", + package = "test", +) + +go_library( + name = "test", + testonly = 1, + srcs = [ + "test.go", + "test_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs/test", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs", + "//pkg/state", + ], +) diff --git a/pkg/sentry/fs/ramfs/test/test.go b/pkg/sentry/fs/ramfs/test/test.go new file mode 100644 index 000000000..fb669558f --- /dev/null +++ b/pkg/sentry/fs/ramfs/test/test.go @@ -0,0 +1,46 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package test provides a simple ramfs-based filesystem for use in testing. +package test + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" +) + +// Dir is a simple ramfs.Dir that supports save/restore as-is. +type Dir struct { + ramfs.Dir +} + +// NewDir returns a simple ramfs directory with the passed contents. +func NewDir(ctx context.Context, contents map[string]*fs.Inode, perms fs.FilePermissions) *Dir { + d := &Dir{} + d.InitDir(ctx, contents, fs.RootOwner, perms) + return d +} + +// File is a simple ramfs.File that supports save/restore as-is. 
+type File struct { + ramfs.File +} + +// NewFile returns a simple ramfs File. +func NewFile(ctx context.Context, perms fs.FilePermissions) *File { + f := &File{} + f.InitFile(ctx, fs.RootOwner, perms) + return f +} diff --git a/pkg/sentry/fs/ramfs/tree.go b/pkg/sentry/fs/ramfs/tree.go new file mode 100644 index 000000000..1fb335f74 --- /dev/null +++ b/pkg/sentry/fs/ramfs/tree.go @@ -0,0 +1,71 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "fmt" + "path" + "strings" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// MakeDirectoryTree constructs a ramfs tree of all directories containing +// subdirs. Each element of subdir must be a clean path, and cannot be empty or +// "/". +func MakeDirectoryTree(ctx context.Context, msrc *fs.MountSource, subdirs []string) (*fs.Inode, error) { + root := emptyDir(ctx, msrc) + for _, subdir := range subdirs { + if path.Clean(subdir) != subdir { + return nil, fmt.Errorf("cannot add subdir at an unclean path: %q", subdir) + } + if subdir == "" || subdir == "/" { + return nil, fmt.Errorf("cannot add subdir at %q", subdir) + } + makeSubdir(ctx, msrc, root.InodeOperations.(*Dir), subdir) + } + return root, nil +} + +// makeSubdir installs into root each component of subdir. The final component is +// a *ramfs.Dir. +func makeSubdir(ctx context.Context, msrc *fs.MountSource, root *Dir, subdir string) { + for _, c := range strings.Split(subdir, "/") { + if len(c) == 0 { + continue + } + child, ok := root.FindChild(c) + if !ok { + child = emptyDir(ctx, msrc) + root.AddChild(ctx, c, child) + } + root = child.InodeOperations.(*Dir) + } +} + +// emptyDir returns an empty *ramfs.Dir that is traversable but not writable. +func emptyDir(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + dir := &Dir{} + dir.InitDir(ctx, make(map[string]*fs.Inode), fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(dir, msrc, fs.StableAttr{ + DeviceID: anon.PseudoDevice.DeviceID(), + InodeID: anon.PseudoDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} diff --git a/pkg/sentry/fs/ramfs/tree_test.go b/pkg/sentry/fs/ramfs/tree_test.go new file mode 100644 index 000000000..68e2929d5 --- /dev/null +++ b/pkg/sentry/fs/ramfs/tree_test.go @@ -0,0 +1,79 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
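MakeDirectoryTree above is a convenience for stamping out a skeleton of empty, traversable, read-only directories, for example the mount points a sandbox expects to find. A hedged usage fragment; the path list is illustrative and error handling is elided to a panic:

```go
msrc := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{})
root, err := ramfs.MakeDirectoryTree(ctx, msrc, []string{
	"/dev", "/proc", "/sys", "/tmp",
})
if err != nil {
	panic(err) // only possible for empty, "/", or unclean paths
}

// Each path now exists as an empty 0555 directory under root, which can
// back a mount namespace directly.
mns, err := fs.NewMountNamespace(ctx, root)
_, _ = mns, err
```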
+// See the License for the specific language governing permissions and +// limitations under the License. + +package ramfs + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +func TestMakeDirectoryTree(t *testing.T) { + mount := fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}) + + for _, test := range []struct { + name string + subdirs []string + }{ + { + name: "abs paths", + subdirs: []string{ + "/tmp", + "/tmp/a/b", + "/tmp/a/c/d", + "/tmp/c", + "/proc", + "/dev/a/b", + "/tmp", + }, + }, + { + name: "rel paths", + subdirs: []string{ + "tmp", + "tmp/a/b", + "tmp/a/c/d", + "tmp/c", + "proc", + "dev/a/b", + "tmp", + }, + }, + } { + ctx := contexttest.Context(t) + tree, err := MakeDirectoryTree(ctx, mount, test.subdirs) + if err != nil { + t.Errorf("%s: failed to make ramfs tree, got error %v, want nil", test.name, err) + continue + } + + // Expect to be able to find each of the paths. + mm, err := fs.NewMountNamespace(ctx, tree) + if err != nil { + t.Errorf("%s: failed to create mount manager: %v", test.name, err) + continue + } + root := mm.Root() + defer mm.DecRef() + + for _, p := range test.subdirs { + if _, err := mm.FindInode(ctx, root, nil, p, 0); err != nil { + t.Errorf("%s: failed to find node %s: %v", test.name, p, err) + break + } + } + } +} diff --git a/pkg/sentry/fs/restore.go b/pkg/sentry/fs/restore.go new file mode 100644 index 000000000..b4ac85a27 --- /dev/null +++ b/pkg/sentry/fs/restore.go @@ -0,0 +1,75 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "sync" +) + +// RestoreEnvironment is the restore environment for file systems. It consists +// of things that change across save and restore and therefore cannot be saved +// in the object graph. +type RestoreEnvironment struct { + // MountSources maps Filesystem.Name() to mount arguments. + MountSources map[string][]MountArgs + + // ValidateFileSize indicates file size should not change across S/R. + ValidateFileSize bool + + // ValidateFileTimestamp indicates file modification timestamp should + // not change across S/R. + ValidateFileTimestamp bool +} + +// MountArgs holds arguments to Mount. +type MountArgs struct { + // Dev corresponds to the devname argumnent of Mount. + Dev string + + // Flags corresponds to the flags argument of Mount. + Flags MountSourceFlags + + // Data corresponds to the data argument of Mount. + Data string +} + +// restoreEnv holds the fs package global RestoreEnvironment. +var restoreEnv = struct { + mu sync.Mutex + env RestoreEnvironment + set bool +}{} + +// SetRestoreEnvironment sets the RestoreEnvironment. Must be called before +// state.Load and only once. 
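Mount configuration is host state that cannot travel through the saved object graph, so a restorer reconstructs it out of band and installs it exactly once before calling state.Load. A hedged fragment showing the shape of that call; the filesystem name and mount arguments are illustrative, not prescribed by this change:

```go
fs.SetRestoreEnvironment(fs.RestoreEnvironment{
	// Keyed by Filesystem.Name(); each entry mirrors the arguments the
	// filesystem was originally mounted with.
	MountSources: map[string][]fs.MountArgs{
		"9p": {
			{Dev: "9pfs-/", Flags: fs.MountSourceFlags{}, Data: "trans=fd,rfdno=3,wfdno=3"},
		},
	},
	ValidateFileSize:      true,
	ValidateFileTimestamp: false,
})
```

Calling it a second time panics, which the implementation below enforces with the restoreEnv.set flag.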
+func SetRestoreEnvironment(r RestoreEnvironment) { + restoreEnv.mu.Lock() + defer restoreEnv.mu.Unlock() + if restoreEnv.set { + panic("RestoreEnvironment may only be set once") + } + restoreEnv.env = r + restoreEnv.set = true +} + +// CurrentRestoreEnvironment returns the current, read-only RestoreEnvironment. +// If no RestoreEnvironment was ever set, returns (_, false). +func CurrentRestoreEnvironment() (RestoreEnvironment, bool) { + restoreEnv.mu.Lock() + defer restoreEnv.mu.Unlock() + e := restoreEnv.env + set := restoreEnv.set + return e, set +} diff --git a/pkg/sentry/fs/save.go b/pkg/sentry/fs/save.go new file mode 100644 index 000000000..bf2a85143 --- /dev/null +++ b/pkg/sentry/fs/save.go @@ -0,0 +1,77 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "syscall" + + "gvisor.googlesource.com/gvisor/pkg/log" +) + +// SaveInodeMappings saves a mapping of path -> inode ID for every +// user-reachable Dirent. +// +// The entire kernel must be frozen to call this, and filesystem state must not +// change between SaveInodeMappings and state.Save, otherwise the saved state +// of any MountSource may be incoherent. +func SaveInodeMappings() { + mountsSeen := make(map[*MountSource]struct{}) + for dirent := range allDirents.dirents { + if _, ok := mountsSeen[dirent.Inode.MountSource]; !ok { + dirent.Inode.MountSource.ResetInodeMappings() + mountsSeen[dirent.Inode.MountSource] = struct{}{} + } + } + + for dirent := range allDirents.dirents { + if dirent.Inode != nil { + // We cannot trust the root provided in the mount due + // to the overlay. We can trust the overlay to delegate + // SaveInodeMappings to the right underlying + // filesystems, though. + root := dirent + for !root.mounted && root.parent != nil { + root = root.parent + } + + // Add the mapping. + n, reachable := dirent.FullName(root) + if !reachable { + // Something has gone seriously wrong if we can't reach our root. + panic(fmt.Sprintf("Unreachable root on dirent file %s", n)) + } + dirent.Inode.MountSource.SaveInodeMapping(dirent.Inode, n) + } + } +} + +// SaveFileFsyncError converts an fs.File.Fsync error to an error that +// indicates that the fs.File was not synced sufficiently to be saved. +func SaveFileFsyncError(err error) error { + switch err { + case nil: + // We succeeded, everything is great. + return nil + case syscall.EBADF, syscall.EINVAL, syscall.EROFS, syscall.ENOSYS, syscall.EPERM: + // These errors mean that the underlying node might not be syncable, + // which we expect to be reported as such even from the gofer. + log.Infof("failed to sync during save: %v", err) + return nil + default: + // We failed in some way that indicates potential data loss. + return fmt.Errorf("failed to sync: %v, data loss may occur", err) + } +} diff --git a/pkg/sentry/fs/seek.go b/pkg/sentry/fs/seek.go new file mode 100644 index 000000000..1268726c2 --- /dev/null +++ b/pkg/sentry/fs/seek.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// SeekWhence determines seek direction. +type SeekWhence int + +const ( + // SeekSet sets the absolute offset. + SeekSet SeekWhence = iota + + // SeekCurrent sets relative to the current position. + SeekCurrent + + // SeekEnd sets relative to the end of the file. + SeekEnd +) + +// String returns a human readable string for whence. +func (s SeekWhence) String() string { + switch s { + case SeekSet: + return "Set" + case SeekCurrent: + return "Current" + case SeekEnd: + return "End" + default: + return "Unknown" + } +} diff --git a/pkg/sentry/fs/sync.go b/pkg/sentry/fs/sync.go new file mode 100644 index 000000000..9738a8f22 --- /dev/null +++ b/pkg/sentry/fs/sync.go @@ -0,0 +1,43 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +// SyncType enumerates ways in which a File can be synced. +type SyncType int + +const ( + // SyncAll indicates that modified in-memory metadata and data should + // be written to backing storage. SyncAll implies SyncBackingStorage. + SyncAll SyncType = iota + + // SyncData indicates that along with modified in-memory data, only + // metadata needed to access that data needs to be written. + // + // For example, changes to access time or modification time do not + // need to be written because they are not necessary for a data read + // to be handled correctly, unlike the file size. + // + // The aim of SyncData is to reduce disk activity for applications + // that do not require all metadata to be synchronized with the disk, + // see fdatasync(2). File systems that implement SyncData as SyncAll + // do not support this optimization. + // + // SyncData implies SyncBackingStorage. + SyncData + + // SyncBackingStorage indicates that in-flight write operations to + // backing storage should be flushed. 
+ SyncBackingStorage +) diff --git a/pkg/sentry/fs/sys/BUILD b/pkg/sentry/fs/sys/BUILD new file mode 100644 index 000000000..0ae2cbac8 --- /dev/null +++ b/pkg/sentry/fs/sys/BUILD @@ -0,0 +1,34 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "sys_state", + srcs = [ + "fs.go", + "sys.go", + ], + out = "sys_state.go", + package = "sys", +) + +go_library( + name = "sys", + srcs = [ + "device.go", + "fs.go", + "sys.go", + "sys_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/usermem", + "//pkg/state", + ], +) diff --git a/pkg/sentry/fs/sys/device.go b/pkg/sentry/fs/sys/device.go new file mode 100644 index 000000000..54e414d1b --- /dev/null +++ b/pkg/sentry/fs/sys/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sys + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// sysfsDevice is the sysfs virtual device. +var sysfsDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/sys/fs.go b/pkg/sentry/fs/sys/fs.go new file mode 100644 index 000000000..f25f648c3 --- /dev/null +++ b/pkg/sentry/fs/sys/fs.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sys + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" +) + +// filesystem is a sysfs. +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// FilesystemName is the name underwhich the filesystem is registered. +// Name matches fs/sysfs/mount.c:sysfs_fs_type.name. +const FilesystemName = "sysfs" + +// Name is the name of the file system. +func (*filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount allows users to mount(2) this file system. +func (*filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, sysfs returns FS_USERNS_VISIBLE | FS_USERNS_MOUNT, see fs/sysfs/mount.c. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns a sysfs root which can be positioned in the vfs. 
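sysfs registers itself with fs.RegisterFilesystem in init, so mounting it by name is a registry lookup followed by a call to Mount. A sketch of such a caller, assuming a FindFilesystem lookup on the fs package (only the registration side appears in this change, so that name is an assumption):

package sysmount

import (
	"fmt"

	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
)

// mountSysfs is a hypothetical caller; fs.FindFilesystem is assumed here.
func mountSysfs(ctx context.Context) (*fs.Inode, error) {
	fsys, ok := fs.FindFilesystem("sysfs")
	if !ok {
		return nil, fmt.Errorf("sysfs is not registered")
	}
	// device and data are ignored by sysfs.
	return fsys.Mount(ctx, "none", fs.MountSourceFlags{ReadOnly: true}, "")
}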
+func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + // sysfs ignores data, see fs/sysfs/mount.c:sysfs_mount. + + return New(ctx, fs.NewNonCachingMountSource(f, flags)), nil +} diff --git a/pkg/sentry/fs/sys/sys.go b/pkg/sentry/fs/sys/sys.go new file mode 100644 index 000000000..ccf56f644 --- /dev/null +++ b/pkg/sentry/fs/sys/sys.go @@ -0,0 +1,57 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package sys implements a sysfs filesystem. +package sys + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +type Dir struct { + ramfs.Dir +} + +func newDir(ctx context.Context, msrc *fs.MountSource, contents map[string]*fs.Inode) *fs.Inode { + d := &Dir{} + d.InitDir(ctx, contents, fs.RootOwner, fs.FilePermsFromMode(0555)) + return fs.NewInode(d, msrc, fs.StableAttr{ + DeviceID: sysfsDevice.DeviceID(), + InodeID: sysfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.SpecialDirectory, + }) +} + +// New returns the root node of a partial simple sysfs. +func New(ctx context.Context, msrc *fs.MountSource) *fs.Inode { + return newDir(ctx, msrc, map[string]*fs.Inode{ + // Add a basic set of top-level directories. In Linux, these + // are dynamically added depending on the KConfig. Here we just + // add the most common ones. 
+ "block": newDir(ctx, msrc, nil), + "bus": newDir(ctx, msrc, nil), + "class": newDir(ctx, msrc, nil), + "dev": newDir(ctx, msrc, nil), + "devices": newDir(ctx, msrc, nil), + "firmware": newDir(ctx, msrc, nil), + "fs": newDir(ctx, msrc, nil), + "kernel": newDir(ctx, msrc, nil), + "module": newDir(ctx, msrc, nil), + "power": newDir(ctx, msrc, nil), + }) +} diff --git a/pkg/sentry/fs/timerfd/BUILD b/pkg/sentry/fs/timerfd/BUILD new file mode 100644 index 000000000..7fddc29f4 --- /dev/null +++ b/pkg/sentry/fs/timerfd/BUILD @@ -0,0 +1,35 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "timerfd_state", + srcs = [ + "timerfd.go", + ], + out = "timerfd_state.go", + package = "timerfd", +) + +go_library( + name = "timerfd", + srcs = [ + "timerfd.go", + "timerfd_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/timerfd", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/refs", + "//pkg/sentry/context", + "//pkg/sentry/fs", + "//pkg/sentry/fs/anon", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/waiter", + ], +) diff --git a/pkg/sentry/fs/timerfd/timerfd.go b/pkg/sentry/fs/timerfd/timerfd.go new file mode 100644 index 000000000..ae58f6fd7 --- /dev/null +++ b/pkg/sentry/fs/timerfd/timerfd.go @@ -0,0 +1,144 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package timerfd implements the semantics of Linux timerfd objects as +// described by timerfd_create(2). +package timerfd + +import ( + "sync/atomic" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// TimerOperations implements fs.FileOperations for timerfds. +type TimerOperations struct { + fsutil.ZeroSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + events waiter.Queue `state:"nosave"` + timer *ktime.Timer + + // val is the number of timer expirations since the last successful call to + // Readv, Preadv, or SetTime. val is accessed using atomic memory + // operations. + val uint64 +} + +// NewFile returns a timerfd File that receives time from c. 
+func NewFile(ctx context.Context, c ktime.Clock) *fs.File { + dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[timerfd]") + tops := &TimerOperations{} + tops.timer = ktime.NewTimer(c, tops) + // Timerfds reject writes, but the Write flag must be set in order to + // ensure that our Writev/Pwritev methods actually get called to return + // the correct errors. + return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, tops) +} + +// Release implements fs.FileOperations.Release. +func (t *TimerOperations) Release() { + t.timer.Destroy() +} + +// PauseTimer pauses the associated Timer. +func (t *TimerOperations) PauseTimer() { + t.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (t *TimerOperations) ResumeTimer() { + t.timer.Resume() +} + +// Clock returns the associated Timer's Clock. +func (t *TimerOperations) Clock() ktime.Clock { + return t.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (t *TimerOperations) GetTime() (ktime.Time, ktime.Setting) { + return t.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (t *TimerOperations) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return t.timer.SwapAnd(s, func() { atomic.StoreUint64(&t.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (t *TimerOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&t.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (t *TimerOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + t.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (t *TimerOperations) EventUnregister(e *waiter.Entry) { + t.events.EventUnregister(e) +} + +// Read implements fs.FileOperations.Read. +func (t *TimerOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&t.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of expirations even if + // writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Write implements fs.FileOperations.Write. +func (t *TimerOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EINVAL +} + +// Notify implements ktime.TimerListener.Notify. +func (t *TimerOperations) Notify(exp uint64) { + atomic.AddUint64(&t.val, exp) + t.events.Notify(waiter.EventIn) +} + +// Destroy implements ktime.TimerListener.Destroy. 
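The expiration count is the only state shared between the timer callback (Notify) and readers (Read): Notify accumulates with an atomic add, Read consumes with an atomic swap, and a non-zero value is what makes the fd readable. A stripped-down sketch of that lock-free handoff:

package timerfdsketch

import "sync/atomic"

// expirations models TimerOperations.val.
var expirations uint64

// notify is what the timer callback does: accumulate expirations.
func notify(exp uint64) {
	atomic.AddUint64(&expirations, exp)
}

// consume is what a successful read does: take everything accumulated so far,
// or report that the read would block.
func consume() (uint64, bool) {
	if v := atomic.SwapUint64(&expirations, 0); v != 0 {
		return v, true
	}
	return 0, false // nothing pending; caller would block or get EAGAIN.
}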
+func (t *TimerOperations) Destroy() {} diff --git a/pkg/sentry/fs/tmpfs/BUILD b/pkg/sentry/fs/tmpfs/BUILD new file mode 100644 index 000000000..be4e695d3 --- /dev/null +++ b/pkg/sentry/fs/tmpfs/BUILD @@ -0,0 +1,64 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "tmpfs_state", + srcs = [ + "file_regular.go", + "fs.go", + "inode_file.go", + "tmpfs.go", + ], + out = "tmpfs_state.go", + package = "tmpfs", +) + +go_library( + name = "tmpfs", + srcs = [ + "device.go", + "file_regular.go", + "fs.go", + "inode_file.go", + "tmpfs.go", + "tmpfs_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/log", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/fs/ramfs", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/pipe", + "//pkg/sentry/memmap", + "//pkg/sentry/platform", + "//pkg/sentry/safemem", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "tmpfs_test", + size = "small", + srcs = ["file_test.go"], + embed = [":tmpfs"], + deps = [ + "//pkg/sentry/context", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/fs", + "//pkg/sentry/platform", + "//pkg/sentry/usage", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/tmpfs/device.go b/pkg/sentry/fs/tmpfs/device.go new file mode 100644 index 000000000..e588b3440 --- /dev/null +++ b/pkg/sentry/fs/tmpfs/device.go @@ -0,0 +1,20 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import "gvisor.googlesource.com/gvisor/pkg/sentry/device" + +// tmpfsDevice is the kernel tmpfs device. +var tmpfsDevice = device.NewAnonDevice() diff --git a/pkg/sentry/fs/tmpfs/file_regular.go b/pkg/sentry/fs/tmpfs/file_regular.go new file mode 100644 index 000000000..9811d90bc --- /dev/null +++ b/pkg/sentry/fs/tmpfs/file_regular.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tmpfs + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// regularFileOperations implements fs.FileOperations for a regular +// tmpfs file. +type regularFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.NoopRelease `state:"nosave"` + fsutil.GenericSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoopFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // iops is the InodeOperations of a regular tmpfs file. It is + // guaranteed to be the same as file.Dirent.Inode.InodeOperations, + // see operations that take fs.File below. + iops *fileInodeOperations +} + +// Read implements fs.FileOperations.Read. +func (r *regularFileOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) { + return r.iops.read(ctx, dst, offset) +} + +// Write implements fs.FileOperations.Write. +func (r *regularFileOperations) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) { + return r.iops.write(ctx, src, offset) +} + +// ConfigureMMap implements fs.FileOperations.ConfigureMMap. +func (r *regularFileOperations) ConfigureMMap(ctx context.Context, file *fs.File, opts *memmap.MMapOpts) error { + return fsutil.GenericConfigureMMap(file, r.iops, opts) +} diff --git a/pkg/sentry/fs/tmpfs/file_test.go b/pkg/sentry/fs/tmpfs/file_test.go new file mode 100644 index 000000000..f064eb1ac --- /dev/null +++ b/pkg/sentry/fs/tmpfs/file_test.go @@ -0,0 +1,73 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "bytes" + "testing" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func newFileInode(ctx context.Context) *fs.Inode { + m := fs.NewCachingMountSource(&Filesystem{}, fs.MountSourceFlags{}) + iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.WithCurrentTime(ctx, fs.UnstableAttr{}), platform.FromContext(ctx)) + return fs.NewInode(iops, m, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.RegularFile, + }) +} + +func newFile(ctx context.Context) *fs.File { + inode := newFileInode(ctx) + f, _ := inode.GetFile(ctx, fs.NewDirent(inode, "stub"), fs.FileFlags{Read: true, Write: true}) + return f +} + +// Allocate once, write twice. 
+func TestGrow(t *testing.T) { + ctx := contexttest.Context(t) + f := newFile(ctx) + defer f.DecRef() + + abuf := bytes.Repeat([]byte{'a'}, 68) + n, err := f.Pwritev(ctx, usermem.BytesIOSequence(abuf), 0) + if n != int64(len(abuf)) || err != nil { + t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(abuf)) + } + + bbuf := bytes.Repeat([]byte{'b'}, 856) + n, err = f.Pwritev(ctx, usermem.BytesIOSequence(bbuf), 68) + if n != int64(len(bbuf)) || err != nil { + t.Fatalf("DeprecatedPwritev got (%d, %v) want (%d, nil)", n, err, len(bbuf)) + } + + rbuf := make([]byte, len(abuf)+len(bbuf)) + n, err = f.Preadv(ctx, usermem.BytesIOSequence(rbuf), 0) + if n != int64(len(rbuf)) || err != nil { + t.Fatalf("DeprecatedPreadv got (%d, %v) want (%d, nil)", n, err, len(rbuf)) + } + + if want := append(abuf, bbuf...); !bytes.Equal(rbuf, want) { + t.Fatalf("Read %v, want %v", rbuf, want) + } +} diff --git a/pkg/sentry/fs/tmpfs/fs.go b/pkg/sentry/fs/tmpfs/fs.go new file mode 100644 index 000000000..639a19b0d --- /dev/null +++ b/pkg/sentry/fs/tmpfs/fs.go @@ -0,0 +1,131 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "fmt" + "regexp" + "strconv" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" +) + +const ( + // Set initial permissions for the root directory. + modeKey = "mode" + + // UID for the root directory. + rootUIDKey = "uid" + + // GID for the root directory. + rootGIDKey = "gid" + + // TODO: support a tmpfs size limit. + // size = "size" + + // default permissions are read/write/execute. + defaultMode = 0777 +) + +// modeRegexp is the expected format of the mode option. +var modeRegexp = regexp.MustCompile("0[0-7][0-7][0-7]") + +// Filesystem is a tmpfs. +type Filesystem struct{} + +func init() { + fs.RegisterFilesystem(&Filesystem{}) +} + +// FilesystemName is the name underwhich the filesystem is registered. +// Name matches mm/shmem.c:shmem_fs_type.name. +const FilesystemName = "tmpfs" + +// Name is the name of the file system. +func (*Filesystem) Name() string { + return FilesystemName +} + +// AllowUserMount allows users to mount(2) this file system. +func (*Filesystem) AllowUserMount() bool { + return true +} + +// Flags returns that there is nothing special about this file system. +// +// In Linux, tmpfs returns FS_USERNS_MOUNT, see mm/shmem.c. +func (*Filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// Mount returns a tmpfs root that can be positioned in the vfs. +func (f *Filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + + // Parse generic comma-separated key=value options, this file system expects them. 
+ options := fs.GenericMountSourceOptions(data) + + // Parse the root directory permissions. + perms := fs.FilePermsFromMode(defaultMode) + if m, ok := options[modeKey]; ok { + if !modeRegexp.MatchString(m) { + return nil, fmt.Errorf("unsupported mode value: 'mode=%s'", m) + } + // It's basically impossible that we error out at this point, + // maybe we should panic. + i, err := strconv.ParseUint(m, 8, 32) + if err != nil { + return nil, fmt.Errorf("mode value not parsable 'mode=%s': %v", m, err) + } + perms = fs.FilePermsFromMode(linux.FileMode(i)) + delete(options, modeKey) + } + + creds := auth.CredentialsFromContext(ctx) + owner := fs.FileOwnerFromContext(ctx) + if uidstr, ok := options[rootUIDKey]; ok { + uid, err := strconv.ParseInt(uidstr, 10, 32) + if err != nil { + return nil, fmt.Errorf("uid value not parsable 'uid=%d': %v", uid, err) + } + owner.UID = creds.UserNamespace.MapToKUID(auth.UID(uid)) + delete(options, rootUIDKey) + } + + if gidstr, ok := options[rootGIDKey]; ok { + gid, err := strconv.ParseInt(gidstr, 10, 32) + if err != nil { + return nil, fmt.Errorf("gid value not parsable 'gid=%d': %v", gid, err) + } + owner.GID = creds.UserNamespace.MapToKGID(auth.GID(gid)) + delete(options, rootGIDKey) + } + + // Fail if the caller passed us more options than we can parse. They may be + // expecting us to set something we can't set. + if len(options) > 0 { + return nil, fmt.Errorf("unsupported mount options: %v", options) + } + + // Construct a mount which will cache dirents. + msrc := fs.NewCachingMountSource(f, flags) + + // Construct the tmpfs root. + return NewDir(ctx, nil, owner, perms, msrc, platform.FromContext(ctx)), nil +} diff --git a/pkg/sentry/fs/tmpfs/inode_file.go b/pkg/sentry/fs/tmpfs/inode_file.go new file mode 100644 index 000000000..66bc934ae --- /dev/null +++ b/pkg/sentry/fs/tmpfs/inode_file.go @@ -0,0 +1,492 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tmpfs + +import ( + "io" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/safemem" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +// fileInodeOperations implements fs.InodeOperations for a regular tmpfs file. +// These files are backed by FrameRegions allocated from a platform.Memory, +// and may be directly mapped. +// +// The tmpfs file memory is backed by FrameRegions, each of which is reference +// counted. frames maintains a single reference on each of the FrameRegions. +// Since these contain the contents of the file, the reference may only be +// decremented once this file is both deleted and all handles to the file have +// been closed. 
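The tmpfs Mount relies on fs.GenericMountSourceOptions to split the data string into key=value pairs before the mode/uid/gid handling above; that helper is defined elsewhere in the fs package. A sketch of the kind of parsing it performs, under the assumption that options are comma separated and values are optional:

package mountopts

import "strings"

// parseOptions is illustrative only; the real helper is
// fs.GenericMountSourceOptions.
func parseOptions(data string) map[string]string {
	opts := make(map[string]string)
	if data == "" {
		return opts
	}
	for _, kv := range strings.Split(data, ",") {
		parts := strings.SplitN(kv, "=", 2)
		if len(parts) == 2 {
			opts[parts[0]] = parts[1]
		} else {
			opts[parts[0]] = ""
		}
	}
	return opts
}

With data "mode=0755,uid=1000" this yields {"mode": "0755", "uid": "1000"}, which Mount then consumes key by key, failing if anything unrecognized is left over.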
+// +// Mappable users may also call IncRefOn/DecRefOn, generally to indicate that +// they plan to use MapInto to map the file into an AddressSpace. These calls +// include an InvalidatorRegion associated with that reference. When the +// referenced portion of the file is removed (with Truncate), the associated +// InvalidatorRegion is invalidated. +type fileInodeOperations struct { + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + + // platform is used to allocate memory that stores the file's contents. + platform platform.Platform + + // memUsage is the default memory usage that will be reported by this file. + memUsage usage.MemoryKind + + attrMu sync.Mutex `state:"nosave"` + + // attr contains the unstable metadata for the file. + // + // attr is protected by attrMu. attr.Unstable.Size is protected by both + // attrMu and dataMu; reading it requires locking either mutex, while + // mutating it requires locking both. + attr fsutil.InMemoryAttributes + + mapsMu sync.Mutex `state:"nosave"` + + // mappings tracks mappings of the file into memmap.MappingSpaces. + // + // mappings is protected by mapsMu. + mappings memmap.MappingSet + + dataMu sync.RWMutex `state:"nosave"` + + // data maps offsets into the file to offsets into platform.Memory() that + // store the file's data. + // + // data is protected by dataMu. + data fsutil.FileRangeSet +} + +// NewInMemoryFile returns a new file backed by p.Memory(). +func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr, p platform.Platform) fs.InodeOperations { + return &fileInodeOperations{ + attr: fsutil.InMemoryAttributes{ + Unstable: uattr, + }, + platform: p, + memUsage: usage, + } +} + +// Release implements fs.InodeOperations.Release. +func (f *fileInodeOperations) Release(context.Context) { + f.dataMu.Lock() + defer f.dataMu.Unlock() + f.data.DropAll(f.platform.Memory()) +} + +// Mappable implements fs.InodeOperations.Mappable. +func (f *fileInodeOperations) Mappable(*fs.Inode) memmap.Mappable { + return f +} + +// Rename implements fs.InodeOperations.Rename. +func (*fileInodeOperations) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// GetFile implements fs.InodeOperations.GetFile. +func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + flags.Pread = true + flags.Pwrite = true + return fs.NewFile(ctx, d, flags, ®ularFileOperations{iops: f}), nil +} + +// UnstableAttr returns unstable attributes of this tmpfs file. +func (f *fileInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + f.dataMu.RLock() + defer f.dataMu.RUnlock() + attr := f.attr.Unstable + attr.Usage = int64(f.data.Span()) + return attr, nil +} + +// Getxattr implements fs.InodeOperations.Getxattr. +func (f *fileInodeOperations) Getxattr(inode *fs.Inode, name string) ([]byte, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Getxattr(name) +} + +// Setxattr implements fs.InodeOperations.Setxattr. 
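The comment on attr spells out the size-locking protocol: readers of attr.Unstable.Size may hold either attrMu or dataMu, while writers must hold both, so a reader holding just one of the two locks still sees a stable value. The same idea in isolation:

package sizelock

import "sync"

// sizedFile demonstrates the two-mutex size protocol described above.
type sizedFile struct {
	attrMu sync.Mutex   // protects metadata.
	dataMu sync.RWMutex // protects data extents.
	size   int64        // read under either mutex, written under both.
}

// sizeForRead takes only dataMu, as the read path does.
func (f *sizedFile) sizeForRead() int64 {
	f.dataMu.RLock()
	defer f.dataMu.RUnlock()
	return f.size
}

// setSize takes both mutexes, as Truncate does (attrMu first, then dataMu).
func (f *sizedFile) setSize(n int64) {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	f.dataMu.Lock()
	defer f.dataMu.Unlock()
	f.size = n
}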
+func (f *fileInodeOperations) Setxattr(inode *fs.Inode, name string, value []byte) error { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Setxattr(name, value) +} + +// Listxattr implements fs.InodeOperations.Listxattr. +func (f *fileInodeOperations) Listxattr(inode *fs.Inode) (map[string]struct{}, error) { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.Listxattr() +} + +// Check implements fs.InodeOperations.Check. +func (f *fileInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (f *fileInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.SetPermissions(ctx, p) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (f *fileInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.SetTimestamps(ctx, ts) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (f *fileInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + f.attrMu.Lock() + defer f.attrMu.Unlock() + return f.attr.SetOwner(ctx, owner) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (f *fileInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + f.attrMu.Lock() + defer f.attrMu.Unlock() + + f.dataMu.Lock() + oldSize := f.attr.Unstable.Size + if oldSize != size { + f.attr.Unstable.Size = size + f.attr.TouchModificationTime(ctx) + } + f.dataMu.Unlock() + + // Nothing left to do unless shrinking the file. + if oldSize <= size { + return nil + } + + oldpgend := fs.OffsetPageEnd(oldSize) + newpgend := fs.OffsetPageEnd(size) + + // Invalidate past translations of truncated pages. + if newpgend != oldpgend { + f.mapsMu.Lock() + f.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ + // Compare Linux's mm/shmem.c:shmem_setattr() => + // mm/memory.c:unmap_mapping_range(evencows=1). + InvalidatePrivate: true, + }) + f.mapsMu.Unlock() + } + + // We are now guaranteed that there are no translations of truncated pages, + // and can remove them. + f.dataMu.Lock() + defer f.dataMu.Unlock() + f.data.Truncate(uint64(size), f.platform.Memory()) + + return nil +} + +// AddLink implements fs.InodeOperations.AddLink. +func (f *fileInodeOperations) AddLink() { + f.attrMu.Lock() + f.attr.Unstable.Links++ + f.attrMu.Unlock() +} + +// DropLink implements fs.InodeOperations.DropLink. +func (f *fileInodeOperations) DropLink() { + f.attrMu.Lock() + f.attr.Unstable.Links-- + f.attrMu.Unlock() +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (f *fileInodeOperations) NotifyStatusChange(ctx context.Context) { + f.attrMu.Lock() + f.attr.TouchStatusChangeTime(ctx) + f.attrMu.Unlock() +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (*fileInodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +func (f *fileInodeOperations) read(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { + // Zero length reads for tmpfs are no-ops. + if dst.NumBytes() == 0 { + return 0, nil + } + + // Have we reached EOF? 
We check for this again in + // fileReadWriter.ReadToBlocks to avoid holding f.attrMu (which would + // serialize reads) or f.dataMu (which would violate lock ordering), but + // check here first (before calling into MM) since reading at EOF is + // common: getting a return value of 0 from a read syscall is the only way + // to detect EOF. + // + // TODO: Separate out f.attr.Size and use atomics instead of + // f.dataMu. + f.dataMu.RLock() + size := f.attr.Unstable.Size + f.dataMu.RUnlock() + if offset >= size { + return 0, io.EOF + } + + n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset}) + // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). + f.attrMu.Lock() + f.attr.TouchAccessTime(ctx) + f.attrMu.Unlock() + return n, err +} + +func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { + // Zero length writes for tmpfs are no-ops. + if src.NumBytes() == 0 { + return 0, nil + } + + f.attrMu.Lock() + defer f.attrMu.Unlock() + // Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time(). + f.attr.TouchModificationTime(ctx) + return src.CopyInTo(ctx, &fileReadWriter{f, offset}) +} + +type fileReadWriter struct { + f *fileInodeOperations + offset int64 +} + +// ReadToBlocks implements safemem.Reader.ReadToBlocks. +func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { + rw.f.dataMu.RLock() + defer rw.f.dataMu.RUnlock() + + // Compute the range to read. + if rw.offset >= rw.f.attr.Unstable.Size { + return 0, io.EOF + } + end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Unstable.Size) + if end == rw.offset { // dsts.NumBytes() == 0? + return 0, nil + } + + mem := rw.f.platform.Memory() + var done uint64 + seg, gap := rw.f.data.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read) + if err != nil { + return done, err + } + + // Copy from internal mappings. + n, err := safemem.CopySeq(dsts, ims) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Tmpfs holes are zero-filled. + gapmr := gap.Range().Intersect(mr) + dst := dsts.TakeFirst64(gapmr.Length()) + n, err := safemem.ZeroSeq(dst) + done += n + rw.offset += int64(n) + dsts = dsts.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. +func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { + rw.f.dataMu.Lock() + defer rw.f.dataMu.Unlock() + + // Compute the range to write. + end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes())) + if end == rw.offset { // srcs.NumBytes() == 0? + return 0, nil + } + + defer func() { + // If the write ends beyond the file's previous size, it causes the + // file to grow. + if rw.offset > rw.f.attr.Unstable.Size { + rw.f.attr.Unstable.Size = rw.offset + } + }() + + mem := rw.f.platform.Memory() + // Page-aligned mr for when we need to allocate memory. RoundUp can't + // overflow since end is an int64. 
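ReadToBlocks walks the FileRangeSet in lockstep: where a segment exists it copies the backing memory, and where there is a gap it emits zeroes, which is how tmpfs holes behave. A simplified, self-contained version of that loop over an explicit extent list (the real code iterates segments and gaps of the set directly):

package sparseread

// extent is a populated byte range of a sparse file.
type extent struct {
	start, end uint64 // [start, end) offsets into the file.
	data       []byte // len(data) == end-start.
}

// sparseRead reads length bytes at off from a file of the given size;
// anything not covered by an extent reads as zeroes.
func sparseRead(extents []extent, off, length, size uint64) []byte {
	if off >= size {
		return nil // EOF.
	}
	if off+length > size {
		length = size - off
	}
	out := make([]byte, length) // zero-filled: holes need no copying.
	for _, e := range extents {
		lo, hi := off, off+length
		if e.start > lo {
			lo = e.start
		}
		if e.end < hi {
			hi = e.end
		}
		if lo < hi {
			copy(out[lo-off:hi-off], e.data[lo-e.start:hi-e.start])
		}
	}
	return out
}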
+ pgstartaddr := usermem.Addr(rw.offset).RoundDown() + pgendaddr, _ := usermem.Addr(end).RoundUp() + pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} + + var done uint64 + seg, gap := rw.f.data.Find(uint64(rw.offset)) + for rw.offset < end { + mr := memmap.MappableRange{uint64(rw.offset), uint64(end)} + switch { + case seg.Ok(): + // Get internal mappings. + ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write) + if err != nil { + return done, err + } + + // Copy to internal mappings. + n, err := safemem.CopySeq(ims, srcs) + done += n + rw.offset += int64(n) + srcs = srcs.DropFirst64(n) + if err != nil { + return done, err + } + + // Continue. + seg, gap = seg.NextNonEmpty() + + case gap.Ok(): + // Allocate memory for the write. + gapMR := gap.Range().Intersect(pgMR) + fr, err := mem.Allocate(gapMR.Length(), rw.f.memUsage) + if err != nil { + return done, err + } + + // Write to that memory as usual. + seg, gap = rw.f.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} + + default: + break + } + } + return done, nil +} + +// AddMapping implements memmap.Mappable.AddMapping. +func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) error { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + f.mappings.AddMapping(ms, ar, offset) + return nil +} + +// RemoveMapping implements memmap.Mappable.RemoveMapping. +func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64) { + f.mapsMu.Lock() + defer f.mapsMu.Unlock() + f.mappings.RemoveMapping(ms, ar, offset) +} + +// CopyMapping implements memmap.Mappable.CopyMapping. +func (f *fileInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64) error { + return f.AddMapping(ctx, ms, dstAR, offset) +} + +// Translate implements memmap.Mappable.Translate. +func (f *fileInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) { + f.dataMu.Lock() + defer f.dataMu.Unlock() + + // Constrain translations to f.attr.Unstable.Size (rounded up) to prevent + // translation to pages that may be concurrently truncated. + pgend := fs.OffsetPageEnd(f.attr.Unstable.Size) + var buserr error + if required.End > pgend { + buserr = &memmap.BusError{io.EOF} + required.End = pgend + } + if optional.End > pgend { + optional.End = pgend + } + + mem := f.platform.Memory() + cerr := f.data.Fill(ctx, required, optional, mem, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { + // Newly-allocated pages are zeroed, so we don't need to do anything. + return dsts.NumBytes(), nil + }) + + var ts []memmap.Translation + var translatedEnd uint64 + for seg := f.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { + segMR := seg.Range().Intersect(optional) + ts = append(ts, memmap.Translation{ + Source: segMR, + File: mem, + Offset: seg.FileRangeOf(segMR).Start, + }) + translatedEnd = segMR.End + } + + // Don't return the error returned by f.data.Fill if it occurred outside of + // required. + if translatedEnd < required.End && cerr != nil { + return ts, cerr + } + return ts, buserr +} + +// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. 
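WriteFromBlocks allocates backing memory in whole pages: pgMR is the smallest page-aligned range covering the write, so a gap is always filled with complete pages even when the write touches only part of one. The rounding on its own, with the page size written out as a constant for illustration:

package pagealign

// pageSize matches usermem.PageSize on the supported platforms.
const pageSize = 4096

// pageAlignedRange returns the smallest page-aligned range covering
// [offset, end), mirroring the pgMR computation above. As in the original,
// the round-up cannot overflow because end originates from an int64.
func pageAlignedRange(offset, end uint64) (start, rend uint64) {
	start = offset &^ (pageSize - 1)
	rend = (end + pageSize - 1) &^ (pageSize - 1)
	return start, rend
}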
+func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error { + return nil +} diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go new file mode 100644 index 000000000..1cc7ae491 --- /dev/null +++ b/pkg/sentry/fs/tmpfs/tmpfs.go @@ -0,0 +1,204 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tmpfs is a filesystem implementation backed by memory. +package tmpfs + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/pipe" + "gvisor.googlesource.com/gvisor/pkg/sentry/platform" + "gvisor.googlesource.com/gvisor/pkg/sentry/usage" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" +) + +var fsInfo = fs.Info{ + Type: linux.TMPFS_MAGIC, + + // TODO: allow configuring a tmpfs size and enforce it. + TotalBlocks: 0, + FreeBlocks: 0, +} + +// rename implements fs.InodeOperations.Rename for tmpfs nodes. +func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + op, ok := oldParent.InodeOperations.(*Dir) + if !ok { + return ramfs.ErrCrossDevice + } + np, ok := newParent.InodeOperations.(*Dir) + if !ok { + return ramfs.ErrCrossDevice + } + return ramfs.Rename(ctx, &op.Dir, oldName, &np.Dir, newName) +} + +// Dir is a directory. +type Dir struct { + ramfs.Dir + + // platform is used to allocate storage for tmpfs Files. + platform platform.Platform +} + +// NewDir returns a new directory. +func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource, platform platform.Platform) *fs.Inode { + d := &Dir{platform: platform} + d.InitDir(ctx, contents, owner, perms) + + // Manually set the CreateOps. + d.CreateOps = d.newCreateOps() + + return fs.NewInode(d, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} + +// afterLoad is invoked by stateify. +func (d *Dir) afterLoad() { + // Per NewDir, manually set the CreateOps. + d.Dir.CreateOps = d.newCreateOps() +} + +// newCreateOps builds the custom CreateOps for this Dir. +func (d *Dir) newCreateOps() *ramfs.CreateOps { + return &ramfs.CreateOps{ + NewDir: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { + return NewDir(ctx, nil, fs.FileOwnerFromContext(ctx), perms, dir.MountSource, d.platform), nil + }, + NewFile: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { + uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.FileOwnerFromContext(ctx), + Perms: perms, + // Always start unlinked. 
+ Links: 0, + }) + iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, d.platform) + return fs.NewInode(iops, dir.MountSource, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.RegularFile, + }), nil + }, + NewSymlink: func(ctx context.Context, dir *fs.Inode, target string) (*fs.Inode, error) { + return NewSymlink(ctx, target, fs.FileOwnerFromContext(ctx), dir.MountSource), nil + }, + NewBoundEndpoint: func(ctx context.Context, dir *fs.Inode, socket unix.BoundEndpoint, perms fs.FilePermissions) (*fs.Inode, error) { + return NewSocket(ctx, socket, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil + }, + NewFifo: func(ctx context.Context, dir *fs.Inode, perms fs.FilePermissions) (*fs.Inode, error) { + return NewFifo(ctx, fs.FileOwnerFromContext(ctx), perms, dir.MountSource), nil + }, + } +} + +// Rename implements fs.InodeOperations.Rename. +func (d *Dir) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS implments fs.InodeOperations.StatFS. +func (*Dir) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +// Symlink is a symlink. +type Symlink struct { + ramfs.Symlink +} + +// NewSymlink returns a new symlink with the provided permissions. +func NewSymlink(ctx context.Context, target string, owner fs.FileOwner, msrc *fs.MountSource) *fs.Inode { + s := &Symlink{} + s.InitSymlink(ctx, owner, target) + return fs.NewInode(s, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Symlink, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (s *Symlink) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (s *Symlink) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +// Socket is a socket. +type Socket struct { + ramfs.Socket +} + +// NewSocket returns a new socket with the provided permissions. +func NewSocket(ctx context.Context, socket unix.BoundEndpoint, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { + s := &Socket{} + s.InitSocket(ctx, socket, owner, perms) + return fs.NewInode(s, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Socket, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (s *Socket) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (s *Socket) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} + +// Fifo is a tmpfs named pipe. +type Fifo struct { + ramfs.Entry +} + +// NewFifo creates a new named pipe. 
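The constructors in this package compose: NewDir builds directories, NewInMemoryFile plus fs.NewInode builds regular files, and CreateOps wires the same recipes into path-based creation. A hypothetical helper, written as if it lived in package tmpfs, assembling a one-file tree the same way file_test.go does; newTree itself is illustrative and not part of this change:

// newTree is illustrative only and assumes the package tmpfs context above.
func newTree(ctx context.Context, msrc *fs.MountSource, p platform.Platform) *fs.Inode {
	uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
		Owner: fs.FileOwnerFromContext(ctx),
		Perms: fs.FilePermsFromMode(0644),
	})
	iops := NewInMemoryFile(ctx, usage.Tmpfs, uattr, p)
	file := fs.NewInode(iops, msrc, fs.StableAttr{
		DeviceID:  tmpfsDevice.DeviceID(),
		InodeID:   tmpfsDevice.NextIno(),
		BlockSize: usermem.PageSize,
		Type:      fs.RegularFile,
	})
	return NewDir(ctx, map[string]*fs.Inode{"hello": file},
		fs.RootOwner, fs.FilePermsFromMode(0755), msrc, p)
}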
+func NewFifo(ctx context.Context, owner fs.FileOwner, perms fs.FilePermissions, msrc *fs.MountSource) *fs.Inode { + f := &Fifo{} + f.InitEntry(ctx, owner, perms) + iops := pipe.NewInodeOperations(f, pipe.NewPipe(ctx, true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)) + return fs.NewInode(iops, msrc, fs.StableAttr{ + DeviceID: tmpfsDevice.DeviceID(), + InodeID: tmpfsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Pipe, + }) +} + +// Rename implements fs.InodeOperations.Rename. +func (f *Fifo) Rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string) error { + return rename(ctx, oldParent, oldName, newParent, newName) +} + +// StatFS returns the tmpfs info. +func (*Fifo) StatFS(context.Context) (fs.Info, error) { + return fsInfo, nil +} diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD new file mode 100644 index 000000000..90b350410 --- /dev/null +++ b/pkg/sentry/fs/tty/BUILD @@ -0,0 +1,63 @@ +package(licenses = ["notice"]) # Apache 2.0 + +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") +load("//tools/go_stateify:defs.bzl", "go_stateify") + +go_stateify( + name = "tty_state", + srcs = [ + "dir.go", + "fs.go", + "inode.go", + "line_discipline.go", + "master.go", + "slave.go", + "terminal.go", + ], + out = "tty_state.go", + package = "tty", +) + +go_library( + name = "tty", + srcs = [ + "dir.go", + "fs.go", + "inode.go", + "line_discipline.go", + "master.go", + "slave.go", + "terminal.go", + "tty_state.go", + ], + importpath = "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty", + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/refs", + "//pkg/sentry/arch", + "//pkg/sentry/context", + "//pkg/sentry/device", + "//pkg/sentry/fs", + "//pkg/sentry/fs/fsutil", + "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", + "//pkg/sentry/usermem", + "//pkg/state", + "//pkg/syserror", + "//pkg/tcpip/transport/unix", + "//pkg/waiter", + ], +) + +go_test( + name = "tty_test", + size = "small", + srcs = ["tty_test.go"], + embed = [":tty"], + deps = [ + "//pkg/abi/linux", + "//pkg/sentry/context/contexttest", + "//pkg/sentry/usermem", + ], +) diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go new file mode 100644 index 000000000..2c5b2aed6 --- /dev/null +++ b/pkg/sentry/fs/tty/dir.go @@ -0,0 +1,398 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tty provide pseudoterminals via a devpts filesystem. 
+package tty + +import ( + "fmt" + "math" + "strconv" + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/tcpip/transport/unix" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// dirInodeOperations is the root of a devpts mount. +// +// This indirectly manages all terminals within the mount. +// +// New Terminals are created by masterInodeOperations.GetFile, which registers +// the slave Inode in the this directory for discovery via Lookup/Readdir. The +// slave inode is unregistered when the master file is Released, as the slave +// is no longer discoverable at that point. +// +// References on the underlying Terminal are held by masterFileOperations and +// slaveInodeOperations. +// +// masterInodeOperations and slaveInodeOperations hold a pointer to +// dirInodeOperations, which is reference counted by the refcount their +// corresponding Dirents hold on their parent (this directory). +// +// dirInodeOperations implements fs.InodeOperations. +type dirInodeOperations struct { + fsutil.DeprecatedFileOperations + fsutil.InodeNotSocket + fsutil.InodeNotRenameable + fsutil.InodeNotSymlink + fsutil.InodeNoExtendedAttributes + fsutil.NoMappable + fsutil.NoopWriteOut + + // msrc is the super block this directory is on. + // + // TODO: Plumb this through instead of storing it here. + msrc *fs.MountSource + + // mu protects the fields below. + mu sync.Mutex `state:"nosave"` + + // attr contains the UnstableAttrs. + attr fsutil.InMemoryAttributes + + // master is the master PTY inode. + master *fs.Inode + + // slaves contains the slave inodes reachable from the directory. + // + // A new slave is added by allocateTerminal and is removed by + // masterFileOperations.Release. + // + // A reference is held on every slave in the map. + slaves map[uint32]*fs.Inode + + // dentryMap is a SortedDentryMap used to implement Readdir containing + // the master and all entries in slaves. + dentryMap *fs.SortedDentryMap + + // next is the next pty index to use. + // + // TODO: reuse indices when ptys are closed. + next uint32 +} + +var _ fs.InodeOperations = (*dirInodeOperations)(nil) + +// newDir creates a new dir with a ptmx file and no terminals. +func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode { + d := &dirInodeOperations{ + attr: fsutil.InMemoryAttributes{ + Unstable: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: fs.RootOwner, + Perms: fs.FilePermsFromMode(0555), + }), + }, + msrc: m, + slaves: make(map[uint32]*fs.Inode), + dentryMap: fs.NewSortedDentryMap(nil), + } + // Linux devpts uses a default mode of 0000 for ptmx which can be + // changed with the ptmxmode mount option. However, that default is not + // useful here (since we'd *always* need the mount option, so it is + // accessible by default). + d.master = newMasterInode(ctx, d, fs.RootOwner, fs.FilePermsFromMode(0666)) + d.dentryMap.Add("ptmx", fs.DentAttr{ + Type: d.master.StableAttr.Type, + InodeID: d.master.StableAttr.InodeID, + }) + + return fs.NewInode(d, m, fs.StableAttr{ + DeviceID: ptsDevice.DeviceID(), + // N.B. Linux always uses inode id 1 for the directory. See + // fs/devpts/inode.c:devpts_fill_super. 
+ // + // TODO: Since ptsDevice must be shared between + // different mounts, we must not assign fixed numbers. + InodeID: ptsDevice.NextIno(), + BlockSize: usermem.PageSize, + Type: fs.Directory, + }) +} + +// Release implements fs.InodeOperations.Release. +func (d *dirInodeOperations) Release(ctx context.Context) { + d.master.DecRef() + if len(d.slaves) != 0 { + panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d)) + } +} + +// Lookup implements fs.InodeOperations.Lookup. +func (d *dirInodeOperations) Lookup(ctx context.Context, dir *fs.Inode, name string) (*fs.Dirent, error) { + d.mu.Lock() + defer d.mu.Unlock() + + // Master? + if name == "ptmx" { + d.master.IncRef() + return fs.NewDirent(d.master, name), nil + } + + // Slave number? + n, err := strconv.ParseUint(name, 10, 32) + if err != nil { + // Not found. + return nil, syserror.ENOENT + } + + s, ok := d.slaves[uint32(n)] + if !ok { + return nil, syserror.ENOENT + } + + s.IncRef() + return fs.NewDirent(s, name), nil +} + +// Create implements fs.InodeOperations.Create. +// +// Creation is never allowed. +func (d *dirInodeOperations) Create(ctx context.Context, dir *fs.Inode, name string, flags fs.FileFlags, perm fs.FilePermissions) (*fs.File, error) { + return nil, syserror.EACCES +} + +// CreateDirectory implements fs.InodeOperations.CreateDirectory. +// +// Creation is never allowed. +func (d *dirInodeOperations) CreateDirectory(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syserror.EACCES +} + +// CreateLink implements fs.InodeOperations.CreateLink. +// +// Creation is never allowed. +func (d *dirInodeOperations) CreateLink(ctx context.Context, dir *fs.Inode, oldname, newname string) error { + return syserror.EACCES +} + +// CreateHardLink implements fs.InodeOperations.CreateHardLink. +// +// Creation is never allowed. +func (d *dirInodeOperations) CreateHardLink(ctx context.Context, dir *fs.Inode, target *fs.Inode, name string) error { + return syserror.EACCES +} + +// CreateFifo implements fs.InodeOperations.CreateFifo. +// +// Creation is never allowed. +func (d *dirInodeOperations) CreateFifo(ctx context.Context, dir *fs.Inode, name string, perm fs.FilePermissions) error { + return syserror.EACCES +} + +// Remove implements fs.InodeOperations.Remove. +// +// Removal is never allowed. +func (d *dirInodeOperations) Remove(ctx context.Context, dir *fs.Inode, name string) error { + return syserror.EPERM +} + +// RemoveDirectory implements fs.InodeOperations.RemoveDirectory. +// +// Removal is never allowed. +func (d *dirInodeOperations) RemoveDirectory(ctx context.Context, dir *fs.Inode, name string) error { + return syserror.EPERM +} + +// Bind implements fs.InodeOperations.Bind. +func (d *dirInodeOperations) Bind(ctx context.Context, dir *fs.Inode, name string, data unix.BoundEndpoint, perm fs.FilePermissions) error { + return syserror.EPERM +} + +// GetFile implements fs.InodeOperations.GetFile. +func (d *dirInodeOperations) GetFile(ctx context.Context, dirent *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, dirent, flags, &dirFileOperations{di: d}), nil +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (d *dirInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.Unstable, nil +} + +// Check implements fs.InodeOperations.Check. 
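Lookup implements the devpts naming scheme directly: the literal name "ptmx" resolves to the master inode, and any name that parses as a 32-bit decimal number is a candidate slave index. The resolution rule reduced to a standalone sketch:

package ptsname

import "strconv"

// resolveName classifies a devpts directory entry name.
func resolveName(name string) (isMaster bool, index uint32, ok bool) {
	if name == "ptmx" {
		return true, 0, true
	}
	n, err := strconv.ParseUint(name, 10, 32)
	if err != nil {
		return false, 0, false // not a pty name at all, so ENOENT.
	}
	return false, uint32(n), true // still ENOENT if no such slave exists.
}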
+func (d *dirInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions. +func (d *dirInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetPermissions(ctx, p) +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (d *dirInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetOwner(ctx, owner) +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (d *dirInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + d.mu.Lock() + defer d.mu.Unlock() + return d.attr.SetTimestamps(ctx, ts) +} + +// Truncate implements fs.InodeOperations.Truncate. +func (d *dirInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return syserror.EINVAL +} + +// AddLink implements fs.InodeOperations.AddLink. +func (d *dirInodeOperations) AddLink() {} + +// DropLink implements fs.InodeOperations.DropLink. +func (d *dirInodeOperations) DropLink() {} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (d *dirInodeOperations) NotifyStatusChange(ctx context.Context) { + d.mu.Lock() + defer d.mu.Unlock() + + d.attr.TouchStatusChangeTime(ctx) +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (d *dirInodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (d *dirInodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + return fs.Info{ + Type: linux.DEVPTS_SUPER_MAGIC, + }, nil +} + +// allocateTerminal creates a new Terminal and installs a pts node for it. +// +// The caller must call DecRef when done with the returned Terminal. +func (d *dirInodeOperations) allocateTerminal(ctx context.Context) (*Terminal, error) { + d.mu.Lock() + defer d.mu.Unlock() + + n := d.next + if n == math.MaxUint32 { + return nil, syserror.ENOMEM + } + + if _, ok := d.slaves[n]; ok { + panic(fmt.Sprintf("pty index collision; index %d already exists", n)) + } + + t := newTerminal(ctx, d, n) + d.next++ + + // The reference returned by newTerminal is returned to the caller. + // Take another for the slave inode. + t.IncRef() + + // Create a pts node. The owner is based on the context that opens + // ptmx. + creds := auth.CredentialsFromContext(ctx) + uid, gid := creds.EffectiveKUID, creds.EffectiveKGID + slave := newSlaveInode(ctx, d, t, fs.FileOwner{uid, gid}, fs.FilePermsFromMode(0666)) + + d.slaves[n] = slave + d.dentryMap.Add(strconv.FormatUint(uint64(n), 10), fs.DentAttr{ + Type: slave.StableAttr.Type, + InodeID: slave.StableAttr.InodeID, + }) + + return t, nil +} + +// masterClose is called when the master end of t is closed. +func (d *dirInodeOperations) masterClose(t *Terminal) { + d.mu.Lock() + defer d.mu.Unlock() + + // The slave end disappears from the directory when the master end is + // closed, even if the slave end is open elsewhere. + // + // N.B. since we're using a backdoor method to remove a directory entry + // we won't properly fire inotify events like Linux would. 
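allocateTerminal hands out pty indices monotonically and never reuses them (the TODO notes the missing reuse), failing with ENOMEM once the 32-bit space is exhausted; the collision panic can therefore only fire if that invariant is broken. The allocation policy reduced to its core, with a plain error standing in for syserror.ENOMEM:

package ptyalloc

import (
	"errors"
	"math"
)

// errExhausted stands in for syserror.ENOMEM in the real code.
var errExhausted = errors.New("pty index space exhausted")

// allocator hands out indices the way dirInodeOperations.next is used above.
type allocator struct {
	next uint32
}

func (a *allocator) allocate() (uint32, error) {
	if a.next == math.MaxUint32 {
		return 0, errExhausted
	}
	n := a.next
	a.next++
	return n, nil
}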
+ s, ok := d.slaves[t.n] + if !ok { + panic(fmt.Sprintf("Terminal %+v doesn't exist in %+v?", t, d)) + } + + s.DecRef() + delete(d.slaves, t.n) + d.dentryMap.Remove(strconv.FormatUint(uint64(t.n), 10)) +} + +// dirFileOperations are the fs.FileOperations for the directory. +// +// This is nearly identical to fsutil.DirFileOperations, except that it takes +// df.di.mu in IterateDir. +type dirFileOperations struct { + waiter.AlwaysReady `state:"nosave"` + fsutil.NoopRelease `state:"nosave"` + fsutil.GenericSeek `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + fsutil.NoIoctl `state:"nosave"` + + // di is the inode operations. + di *dirInodeOperations + + // dirCursor contains the name of the last directory entry that was + // serialized. + dirCursor string +} + +var _ fs.FileOperations = (*dirFileOperations)(nil) + +// IterateDir implements DirIterator.IterateDir. +func (df *dirFileOperations) IterateDir(ctx context.Context, dirCtx *fs.DirCtx, offset int) (int, error) { + df.di.mu.Lock() + defer df.di.mu.Unlock() + + n, err := fs.GenericReaddir(dirCtx, df.di.dentryMap) + return offset + n, err +} + +// Readdir implements FileOperations.Readdir. +func (df *dirFileOperations) Readdir(ctx context.Context, file *fs.File, serializer fs.DentrySerializer) (int64, error) { + root := fs.RootFromContext(ctx) + defer root.DecRef() + dirCtx := &fs.DirCtx{ + Serializer: serializer, + DirCursor: &df.dirCursor, + } + return fs.DirentReaddir(ctx, file.Dirent, df, root, dirCtx, file.Offset()) +} + +// Read implements FileOperations.Read +func (df *dirFileOperations) Read(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} + +// Write implements FileOperations.Write. +func (df *dirFileOperations) Write(context.Context, *fs.File, usermem.IOSequence, int64) (int64, error) { + return 0, syserror.EISDIR +} diff --git a/pkg/sentry/fs/tty/fs.go b/pkg/sentry/fs/tty/fs.go new file mode 100644 index 000000000..f5e7a3162 --- /dev/null +++ b/pkg/sentry/fs/tty/fs.go @@ -0,0 +1,95 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/device" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// ptsDevice is the pseudo-filesystem device. +var ptsDevice = device.NewAnonDevice() + +// filesystem is a devpts filesystem. +// +// This devpts is always in the new "multi-instance" mode. i.e., it contains a +// ptmx device tied to this mount. +type filesystem struct{} + +func init() { + fs.RegisterFilesystem(&filesystem{}) +} + +// Name matches drivers/devpts/indoe.c:devpts_fs_type.name. +func (*filesystem) Name() string { + return "devpts" +} + +// AllowUserMount allows users to mount(2) this file system. 
+func (*filesystem) AllowUserMount() bool { + // TODO: Users may mount this once the terminals are in a + // usable state. + return false +} + +// Flags returns that there is nothing special about this file system. +func (*filesystem) Flags() fs.FilesystemFlags { + return 0 +} + +// MountSource returns a devpts root that can be positioned in the vfs. +func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSourceFlags, data string) (*fs.Inode, error) { + // device is always ignored. + + // No options are supported. + if data != "" { + return nil, syserror.EINVAL + } + + return newDir(ctx, fs.NewMountSource(&superOperations{}, f, flags)), nil +} + +// superOperations implements fs.MountSourceOperations, preventing caching. +type superOperations struct{} + +// Revalidate implements fs.DirentOperations.Revalidate. +// +// It always returns true, forcing a Lookup for all entries. +// +// Slave entries are dropped from dir when their master is closed, so an +// existing slave Dirent in the tree is not sufficient to guarantee that it +// still exists on the filesystem. +func (superOperations) Revalidate(*fs.Dirent) bool { + return true +} + +// Keep implements fs.DirentOperations.Keep. +// +// Keep returns false because Revalidate would force a lookup on cached entries +// anyways. +func (superOperations) Keep(*fs.Dirent) bool { + return false +} + +// ResetInodeMappings implements MountSourceOperations.ResetInodeMappings. +func (superOperations) ResetInodeMappings() {} + +// SaveInodeMapping implements MountSourceOperations.SaveInodeMapping. +func (superOperations) SaveInodeMapping(*fs.Inode, string) {} + +// Destroy implements MountSourceOperations.Destroy. +func (superOperations) Destroy() {} diff --git a/pkg/sentry/fs/tty/inode.go b/pkg/sentry/fs/tty/inode.go new file mode 100644 index 000000000..04b9a7727 --- /dev/null +++ b/pkg/sentry/fs/tty/inode.go @@ -0,0 +1,143 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "sync" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time" + "gvisor.googlesource.com/gvisor/pkg/syserror" +) + +// inodeOperations are the base fs.InodeOperations for master and slave Inodes. +// +// inodeOperations does not implement: +// +// * fs.InodeOperations.Release +// * fs.InodeOperations.GetFile +type inodeOperations struct { + fsutil.DeprecatedFileOperations `state:"nosave"` + fsutil.InodeNoExtendedAttributes `state:"nosave"` + fsutil.InodeNotDirectory `state:"nosave"` + fsutil.InodeNotRenameable `state:"nosave"` + fsutil.InodeNotSocket `state:"nosave"` + fsutil.InodeNotSymlink `state:"nosave"` + fsutil.NoMappable `state:"nosave"` + fsutil.NoopWriteOut `state:"nosave"` + + // mu protects the fields below. 
+ mu sync.Mutex `state:"nosave"` + + // uattr is the inode's UnstableAttr. + uattr fs.UnstableAttr +} + +// UnstableAttr implements fs.InodeOperations.UnstableAttr. +func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { + i.mu.Lock() + defer i.mu.Unlock() + return i.uattr, nil +} + +// Check implements fs.InodeOperations.Check. +func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { + return fs.ContextCanAccessFile(ctx, inode, p) +} + +// SetPermissions implements fs.InodeOperations.SetPermissions +func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { + i.mu.Lock() + defer i.mu.Unlock() + i.uattr.Perms = p + i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) + return true +} + +// SetOwner implements fs.InodeOperations.SetOwner. +func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { + i.mu.Lock() + defer i.mu.Unlock() + if owner.UID.Ok() { + i.uattr.Owner.UID = owner.UID + } + if owner.GID.Ok() { + i.uattr.Owner.GID = owner.GID + } + return nil +} + +// SetTimestamps implements fs.InodeOperations.SetTimestamps. +func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { + if ts.ATimeOmit && ts.MTimeOmit { + return nil + } + + i.mu.Lock() + defer i.mu.Unlock() + + now := ktime.NowFromContext(ctx) + if !ts.ATimeOmit { + if ts.ATime.IsZero() { + i.uattr.AccessTime = now + } else { + i.uattr.AccessTime = ts.ATime + } + } + if !ts.MTimeOmit { + if ts.MTime.IsZero() { + i.uattr.ModificationTime = now + } else { + i.uattr.ModificationTime = ts.MTime + } + } + i.uattr.StatusChangeTime = now + return nil +} + +// Truncate implements fs.InodeOperations.Truncate. +func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error { + return syserror.EINVAL +} + +// AddLink implements fs.InodeOperations.AddLink. +func (i *inodeOperations) AddLink() { +} + +// DropLink implements fs.InodeOperations.DropLink. +func (i *inodeOperations) DropLink() { +} + +// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. +func (i *inodeOperations) NotifyStatusChange(ctx context.Context) { + i.mu.Lock() + defer i.mu.Unlock() + i.uattr.StatusChangeTime = ktime.NowFromContext(ctx) +} + +// IsVirtual implements fs.InodeOperations.IsVirtual. +func (i *inodeOperations) IsVirtual() bool { + return true +} + +// StatFS implements fs.InodeOperations.StatFS. +func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) { + return fs.Info{ + Type: linux.DEVPTS_SUPER_MAGIC, + }, nil +} diff --git a/pkg/sentry/fs/tty/line_discipline.go b/pkg/sentry/fs/tty/line_discipline.go new file mode 100644 index 000000000..fde4e7941 --- /dev/null +++ b/pkg/sentry/fs/tty/line_discipline.go @@ -0,0 +1,342 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tty + +import ( + "bytes" + "sync" + "unicode/utf8" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +const ( + spacesPerTab = 8 +) + +// lineDiscipline dictates how input and output are handled between the +// pseudoterminal (pty) master and slave. It can be configured to alter I/O, +// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man +// pages are good resources for how to affect the line discipline: +// +// * termios(3) +// * tty_ioctl(4) +// +// This file corresponds most closely to drivers/tty/n_tty.c. +// +// lineDiscipline has a simple structure but supports a multitude of options +// (see the above man pages). It consists of two queues of bytes: one from the +// terminal master to slave (the input queue) and one from slave to master (the +// output queue). When bytes are written to one end of the pty, the line +// discipline reads the bytes, modifies them or takes special action if +// required, and enqueues them to be read by the other end of the pty: +// +// input from terminal +-------------+ input to process (e.g. bash) +// +------------------------>| input queue |---------------------------+ +// | +-------------+ | +// | | +// | v +// masterFD slaveFD +// ^ | +// | | +// | output to terminal +--------------+ output from process | +// +------------------------| output queue |<--------------------------+ +// +--------------+ +// +// Lock order: +// inMu +// outMu +// termiosMu +type lineDiscipline struct { + // inMu protects inQueue. + inMu sync.Mutex `state:"nosave"` + + // inQueue is the input queue of the terminal. + inQueue queue + + // outMu protects outQueue. + outMu sync.Mutex `state:"nosave"` + + // outQueue is the output queue of the terminal. + outQueue queue + + // termiosMu protects termios. + termiosMu sync.Mutex `state:"nosave"` + + // termios is the terminal configuration used by the lineDiscipline. + termios linux.KernelTermios + + // column is the location in a row of the cursor. This is important for + // handling certain special characters like backspace. + column int +} + +// getTermios gets the linux.Termios for the tty. +func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + // We must copy a Termios struct, not KernelTermios. + t := l.termios.ToTermios() + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err +} + +// setTermios sets a linux.Termios for the tty. +func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + l.termiosMu.Lock() + defer l.termiosMu.Unlock() + // We must copy a Termios struct, not KernelTermios. 
+ var t linux.Termios + _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{ + AddressSpaceActive: true, + }) + l.termios.FromTermios(t) + return 0, err +} + +func (l *lineDiscipline) masterReadiness() waiter.EventMask { + l.inMu.Lock() + defer l.inMu.Unlock() + l.outMu.Lock() + defer l.outMu.Unlock() + return l.inQueue.writeReadiness() | l.outQueue.readReadiness() +} + +func (l *lineDiscipline) slaveReadiness() waiter.EventMask { + l.inMu.Lock() + defer l.inMu.Unlock() + l.outMu.Lock() + defer l.outMu.Unlock() + return l.outQueue.writeReadiness() | l.inQueue.readReadiness() +} + +// queue represents one of the input or output queues between a pty master and +// slave. +type queue struct { + waiter.Queue `state:"nosave"` + buf bytes.Buffer `state:".([]byte)"` +} + +// saveBuf is invoked by stateify. +func (q *queue) saveBuf() []byte { + return append([]byte(nil), q.buf.Bytes()...) +} + +// loadBuf is invoked by stateify. +func (q *queue) loadBuf(b []byte) { + q.buf.Write(b) +} + +// readReadiness returns whether q is ready to be read from. +// +// Preconditions: q's mutex must be held. +func (q *queue) readReadiness() waiter.EventMask { + ready := waiter.EventMask(0) + if q.buf.Len() > 0 { + ready |= waiter.EventIn + } + return ready +} + +// writeReadiness returns whether q is ready to be written to. +func (q *queue) writeReadiness() waiter.EventMask { + return waiter.EventOut +} + +func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.inMu.Lock() + defer l.inMu.Unlock() + return l.queueRead(ctx, dst, &l.inQueue) +} + +func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.inMu.Lock() + defer l.inMu.Unlock() + return l.queueWrite(ctx, src, &l.inQueue, false) +} + +func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { + l.outMu.Lock() + defer l.outMu.Unlock() + return l.queueRead(ctx, dst, &l.outQueue) +} + +func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { + l.outMu.Lock() + defer l.outMu.Unlock() + return l.queueWrite(ctx, src, &l.outQueue, true) +} + +// queueRead reads from q to userspace. +// +// Preconditions: q's lock must be held. +func (l *lineDiscipline) queueRead(ctx context.Context, dst usermem.IOSequence, q *queue) (int64, error) { + // Copy bytes out to user-space. queueRead doesn't have to do any + // processing or other extra work -- that's all taken care of when + // writing to a queue. + n, err := q.buf.WriteTo(dst.Writer(ctx)) + + // If state changed, notify any waiters. If nothing was available to + // read, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventOut) + } else if err == nil { + return 0, syserror.ErrWouldBlock + } + return int64(n), err +} + +// queueWrite writes to q from userspace. `output` is whether the queue being +// written to should be subject to output processing (i.e. whether it is the +// output queue). +// +// Precondition: q's lock must be held. +func (l *lineDiscipline) queueWrite(ctx context.Context, src usermem.IOSequence, q *queue, output bool) (int64, error) { + // TODO: Use CopyInTo/safemem to avoid extra copying. + // Get the bytes to write from user-space. + b := make([]byte, src.NumBytes()) + n, err := src.CopyIn(ctx, b) + if err != nil { + return 0, err + } + b = b[:n] + + // If state changed, notify any waiters. 
If we were unable to write + // anything, let the caller know we could block. + if n > 0 { + q.Notify(waiter.EventIn) + } else { + return 0, syserror.ErrWouldBlock + } + + // Optionally perform line discipline transformations depending on + // whether we're writing to the input queue or output queue. + var buf *bytes.Buffer + l.termiosMu.Lock() + if output { + buf = l.transformOutput(b) + } else { + buf = l.transformInput(b) + } + l.termiosMu.Unlock() + + // Enqueue buf at the end of the queue. + buf.WriteTo(&q.buf) + return int64(n), err +} + +// transformOutput does ouput processing for one end of the pty. See +// drivers/tty/n_tty.c:do_output_char for an analagous kernel function. +// +// Precondition: l.termiosMu must be held. +func (l *lineDiscipline) transformOutput(buf []byte) *bytes.Buffer { + if !l.termios.OEnabled(linux.OPOST) { + return bytes.NewBuffer(buf) + } + + var ret bytes.Buffer + for len(buf) > 0 { + c := l.removeRune(&buf) + switch c { + case '\n': + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + if l.termios.OEnabled(linux.ONLCR) { + ret.Write([]byte{'\r', '\n'}) + continue + } + case '\r': + if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { + continue + } + if l.termios.OEnabled(linux.OCRNL) { + c = '\n' + if l.termios.OEnabled(linux.ONLRET) { + l.column = 0 + } + break + } + l.column = 0 + case '\t': + spaces := spacesPerTab - l.column%spacesPerTab + if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { + l.column += spaces + ret.Write(bytes.Repeat([]byte{' '}, 8)) + continue + } + l.column += spaces + case '\b': + if l.column > 0 { + l.column-- + } + default: + l.column++ + } + ret.WriteRune(c) + } + return &ret +} + +// transformInput does input processing for one end of the pty. Characters +// read are transformed according to flags set in the termios struct. See +// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel +// function. +// +// Precondition: l.termiosMu must be held. +func (l *lineDiscipline) transformInput(buf []byte) *bytes.Buffer { + var ret bytes.Buffer + for len(buf) > 0 { + c := l.removeRune(&buf) + switch c { + case '\r': + if l.termios.IEnabled(linux.IGNCR) { + continue + } + if l.termios.IEnabled(linux.ICRNL) { + c = '\n' + } + case '\n': + if l.termios.IEnabled(linux.INLCR) { + c = '\r' + } + } + ret.WriteRune(c) + } + return &ret +} + +// removeRune removes and returns the first rune from the byte array. The +// buffer's length is updated accordingly. +func (l *lineDiscipline) removeRune(b *[]byte) rune { + var c rune + var size int + // If UTF-8 support is enabled, runes might be multiple bytes. + if l.termios.IEnabled(linux.IUTF8) { + c, size = utf8.DecodeRune(*b) + } else { + c = rune((*b)[0]) + size = 1 + } + *b = (*b)[size:] + return c +} diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go new file mode 100644 index 000000000..3c47ee517 --- /dev/null +++ b/pkg/sentry/fs/tty/master.go @@ -0,0 +1,173 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// masterInodeOperations are the fs.InodeOperations for the master end of the +// Terminal (ptmx file). +type masterInodeOperations struct { + inodeOperations + + // d is the containing dir. + d *dirInodeOperations +} + +var _ fs.InodeOperations = (*masterInodeOperations)(nil) + +// newMasterInode creates an Inode for the master end of a terminal. +func newMasterInode(ctx context.Context, d *dirInodeOperations, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &masterInodeOperations{ + inodeOperations: inodeOperations{ + uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + Links: 1, + // Size and Blocks are always 0. + }), + }, + d: d, + } + + return fs.NewInode(iops, d.msrc, fs.StableAttr{ + DeviceID: ptsDevice.DeviceID(), + // N.B. Linux always uses inode id 2 for ptmx. See + // fs/devpts/inode.c:mknod_ptmx. + // + // TODO: Since ptsDevice must be shared between + // different mounts, we must not assign fixed numbers. + InodeID: ptsDevice.NextIno(), + Type: fs.CharacterDevice, + // See fs/devpts/inode.c:devpts_fill_super. + BlockSize: 1024, + // The PTY master effectively has two different major/minor + // device numbers. + // + // This one is returned by stat for both opened and unopened + // instances of this inode. + // + // When the inode is opened (GetFile), a new device number is + // allocated based on major UNIX98_PTY_MASTER_MAJOR and the tty + // index as minor number. However, this device number is only + // accessible via ioctl(TIOCGDEV) and /proc/TID/stat. + DeviceFileMajor: linux.TTYAUX_MAJOR, + DeviceFileMinor: linux.PTMX_MINOR, + }) +} + +// Release implements fs.InodeOperations.Release. +func (mi *masterInodeOperations) Release(ctx context.Context) { +} + +// GetFile implements fs.InodeOperations.GetFile. +// +// It allocates a new terminal. +func (mi *masterInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + t, err := mi.d.allocateTerminal(ctx) + if err != nil { + return nil, err + } + + return fs.NewFile(ctx, d, flags, &masterFileOperations{ + d: mi.d, + t: t, + }), nil +} + +// masterFileOperations are the fs.FileOperations for the master end of a terminal. +type masterFileOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + // d is the containing dir. + d *dirInodeOperations + + // t is the connected Terminal. + t *Terminal +} + +var _ fs.FileOperations = (*masterFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (mf *masterFileOperations) Release() { + mf.d.masterClose(mf.t) + mf.t.DecRef() +} + +// EventRegister implements waiter.Waitable.EventRegister. 
+func (mf *masterFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + mf.t.ld.inQueue.EventRegister(e, mask) + mf.t.ld.outQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (mf *masterFileOperations) EventUnregister(e *waiter.Entry) { + mf.t.ld.inQueue.EventUnregister(e) + mf.t.ld.outQueue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (mf *masterFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return mf.t.ld.masterReadiness() +} + +// Read implements fs.FileOperations.Read. +func (mf *masterFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return mf.t.ld.outputQueueRead(ctx, dst) +} + +// Write implements fs.FileOperations.Write. +func (mf *masterFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return mf.t.ld.inputQueueWrite(ctx, src) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (mf *masterFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Uint() { + case linux.TCGETS: + // N.B. TCGETS on the master actually returns the configuration + // of the slave end. + return mf.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + // N.B. TCSETS on the master actually affects the configuration + // of the slave end. + return mf.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO: This should drain the output queue first. + return mf.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mf.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + case linux.TIOCSPTLCK: + // TODO: Implement pty locking. For now just pretend we do. + return 0, nil + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go new file mode 100644 index 000000000..9178071a4 --- /dev/null +++ b/pkg/sentry/fs/tty/slave.go @@ -0,0 +1,151 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/arch" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" + "gvisor.googlesource.com/gvisor/pkg/syserror" + "gvisor.googlesource.com/gvisor/pkg/waiter" +) + +// slaveInodeOperations are the fs.InodeOperations for the slave end of the +// Terminal (pts file). +type slaveInodeOperations struct { + inodeOperations + + // d is the containing dir. + d *dirInodeOperations + + // t is the connected Terminal. + t *Terminal +} + +var _ fs.InodeOperations = (*slaveInodeOperations)(nil) + +// newSlaveInode creates an fs.Inode for the slave end of a terminal. 
+// +// newSlaveInode takes ownership of t. +func newSlaveInode(ctx context.Context, d *dirInodeOperations, t *Terminal, owner fs.FileOwner, p fs.FilePermissions) *fs.Inode { + iops := &slaveInodeOperations{ + inodeOperations: inodeOperations{ + uattr: fs.WithCurrentTime(ctx, fs.UnstableAttr{ + Owner: owner, + Perms: p, + Links: 1, + // Size and Blocks are always 0. + }), + }, + d: d, + t: t, + } + + return fs.NewInode(iops, d.msrc, fs.StableAttr{ + DeviceID: ptsDevice.DeviceID(), + // N.B. Linux always uses inode id = tty index + 3. See + // fs/devpts/inode.c:devpts_pty_new. + // + // TODO: Since ptsDevice must be shared between + // different mounts, we must not assign fixed numbers. + InodeID: ptsDevice.NextIno(), + Type: fs.CharacterDevice, + // See fs/devpts/inode.c:devpts_fill_super. + BlockSize: 1024, + DeviceFileMajor: linux.UNIX98_PTY_SLAVE_MAJOR, + DeviceFileMinor: t.n, + }) +} + +// Release implements fs.InodeOperations.Release. +func (si *slaveInodeOperations) Release(ctx context.Context) { + si.t.DecRef() +} + +// GetFile implements fs.InodeOperations.GetFile. +// +// This may race with destruction of the terminal. If the terminal is gone, it +// returns ENOENT. +func (si *slaveInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { + return fs.NewFile(ctx, d, flags, &slaveFileOperations{si: si}), nil +} + +// slaveFileOperations are the fs.FileOperations for the slave end of a terminal. +type slaveFileOperations struct { + fsutil.PipeSeek `state:"nosave"` + fsutil.NotDirReaddir `state:"nosave"` + fsutil.NoFsync `state:"nosave"` + fsutil.NoopFlush `state:"nosave"` + fsutil.NoMMap `state:"nosave"` + + // si is the inode operations. + si *slaveInodeOperations +} + +var _ fs.FileOperations = (*slaveFileOperations)(nil) + +// Release implements fs.FileOperations.Release. +func (sf *slaveFileOperations) Release() { +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (sf *slaveFileOperations) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + sf.si.t.ld.outQueue.EventRegister(e, mask) + sf.si.t.ld.inQueue.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (sf *slaveFileOperations) EventUnregister(e *waiter.Entry) { + sf.si.t.ld.outQueue.EventUnregister(e) + sf.si.t.ld.inQueue.EventUnregister(e) +} + +// Readiness implements waiter.Waitable.Readiness. +func (sf *slaveFileOperations) Readiness(mask waiter.EventMask) waiter.EventMask { + return sf.si.t.ld.slaveReadiness() +} + +// Read implements fs.FileOperations.Read. +func (sf *slaveFileOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { + return sf.si.t.ld.inputQueueRead(ctx, dst) +} + +// Write implements fs.FileOperations.Write. +func (sf *slaveFileOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { + return sf.si.t.ld.outputQueueWrite(ctx, src) +} + +// Ioctl implements fs.FileOperations.Ioctl. +func (sf *slaveFileOperations) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { + switch args[1].Uint() { + case linux.TCGETS: + return sf.si.t.ld.getTermios(ctx, io, args) + case linux.TCSETS: + return sf.si.t.ld.setTermios(ctx, io, args) + case linux.TCSETSW: + // TODO: This should drain the output queue first. 
+ return sf.si.t.ld.setTermios(ctx, io, args) + case linux.TIOCGPTN: + _, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sf.si.t.n), usermem.IOOpts{ + AddressSpaceActive: true, + }) + return 0, err + default: + return 0, syserror.ENOTTY + } +} diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go new file mode 100644 index 000000000..6ae713a32 --- /dev/null +++ b/pkg/sentry/fs/tty/terminal.go @@ -0,0 +1,44 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/refs" + "gvisor.googlesource.com/gvisor/pkg/sentry/context" +) + +// Terminal is a pseudoterminal. +type Terminal struct { + refs.AtomicRefCount + + // n is the terminal index. + n uint32 + + // d is the containing directory. + d *dirInodeOperations + + // ld is the line discipline of the terminal. + ld lineDiscipline +} + +func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal { + termios := linux.DefaultSlaveTermios + return &Terminal{ + d: d, + n: n, + ld: lineDiscipline{termios: termios}, + } +} diff --git a/pkg/sentry/fs/tty/tty_test.go b/pkg/sentry/fs/tty/tty_test.go new file mode 100644 index 000000000..0c7560ed7 --- /dev/null +++ b/pkg/sentry/fs/tty/tty_test.go @@ -0,0 +1,56 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tty + +import ( + "testing" + + "gvisor.googlesource.com/gvisor/pkg/abi/linux" + "gvisor.googlesource.com/gvisor/pkg/sentry/context/contexttest" + "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" +) + +func TestSimpleMasterToSlave(t *testing.T) { + ld := lineDiscipline{termios: linux.DefaultSlaveTermios} + ctx := contexttest.Context(t) + inBytes := []byte("hello, tty\n") + src := usermem.BytesIOSequence(inBytes) + outBytes := make([]byte, 32) + dst := usermem.BytesIOSequence(outBytes) + + // Write to the input queue. + nw, err := ld.inputQueueWrite(ctx, src) + if err != nil { + t.Fatalf("error writing to input queue: %v", err) + } + if nw != int64(len(inBytes)) { + t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes)) + } + + // Read from the input queue. 
+ nr, err := ld.inputQueueRead(ctx, dst)
+ if err != nil {
+ t.Fatalf("error reading from input queue: %v", err)
+ }
+ if nr != int64(len(inBytes)) {
+ t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes))
+ }
+
+ outStr := string(outBytes[:nr])
+ inStr := string(inBytes)
+ if outStr != inStr {
+ t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr)
+ }
+}
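
TestSimpleMasterToSlave above only exercises the raw input path, where the bytes need no transformation. The output path additionally runs transformOutput, so ONLCR processing ('\n' expanded to '\r\n') can be checked the same way by writing on the slave side and reading on the master side. The following is a minimal companion sketch, not part of the patch above: the test name is ours, it reuses the imports of tty_test.go, and it assumes that linux.DefaultSlaveTermios enables OPOST and ONLCR, as the Linux defaults do.

func TestOutputProcessingONLCR(t *testing.T) {
	// Assumes DefaultSlaveTermios sets OPOST|ONLCR, mirroring Linux defaults.
	ld := lineDiscipline{termios: linux.DefaultSlaveTermios}
	ctx := contexttest.Context(t)

	inBytes := []byte("hi\n")
	src := usermem.BytesIOSequence(inBytes)
	outBytes := make([]byte, 32)
	dst := usermem.BytesIOSequence(outBytes)

	// The slave end writes to the output queue; transformOutput runs
	// while the bytes are enqueued.
	nw, err := ld.outputQueueWrite(ctx, src)
	if err != nil {
		t.Fatalf("error writing to output queue: %v", err)
	}
	if nw != int64(len(inBytes)) {
		t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes))
	}

	// The master end reads the already-processed bytes.
	nr, err := ld.outputQueueRead(ctx, dst)
	if err != nil {
		t.Fatalf("error reading from output queue: %v", err)
	}

	// With OPOST|ONLCR set, the single '\n' comes back as '\r', '\n'.
	if got, want := string(outBytes[:nr]), "hi\r\n"; got != want {
		t.Fatalf("master read %q, want %q", got, want)
	}
}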
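For reference, the pieces above map onto the standard userspace sequence for allocating a pty: open the master, read the slave index with TIOCGPTN (handled in masterFileOperations.Ioctl), unlock it with TIOCSPTLCK (accepted but currently a no-op here), and open the slave by its decimal name (resolved by dirInodeOperations.Lookup). A rough sketch of that sequence from the application side, assuming devpts is mounted at /dev/pts with the master reachable as /dev/ptmx and using golang.org/x/sys/unix; none of this is part of the package above.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Opening the master allocates a new Terminal
	// (masterInodeOperations.GetFile above).
	ptmx, err := unix.Open("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(ptmx)

	// TIOCGPTN returns the terminal index n.
	n, err := unix.IoctlGetInt(ptmx, unix.TIOCGPTN)
	if err != nil {
		panic(err)
	}

	// TIOCSPTLCK "unlocks" the slave; the implementation above simply
	// accepts it.
	if err := unix.IoctlSetPointerInt(ptmx, unix.TIOCSPTLCK, 0); err != nil {
		panic(err)
	}

	// The slave is named by its decimal index and looked up via
	// strconv.ParseUint in dirInodeOperations.Lookup.
	pts, err := unix.Open(fmt.Sprintf("/dev/pts/%d", n), unix.O_RDWR|unix.O_NOCTTY, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(pts)

	// Bytes written to the master appear on the slave after input
	// processing, and vice versa after output processing.
	if _, err := unix.Write(ptmx, []byte("hello\n")); err != nil {
		panic(err)
	}
}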