Diffstat (limited to 'pkg/sentry')
-rw-r--r--  pkg/sentry/control/proc.go | 17
-rw-r--r--  pkg/sentry/fs/attr.go | 44
-rw-r--r--  pkg/sentry/fs/context.go | 24
-rw-r--r--  pkg/sentry/fs/ext/BUILD | 54
-rw-r--r--  pkg/sentry/fs/ext/ext.go | 97
-rw-r--r--  pkg/sentry/fs/ext/ext_test.go | 407
-rw-r--r--  pkg/sentry/fs/ext/filesystem.go | 137
-rw-r--r--  pkg/sentry/fs/ext/inode.go | 209
-rw-r--r--  pkg/sentry/fs/fdpipe/pipe.go | 2
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached.go | 111
-rw-r--r--  pkg/sentry/fs/fsutil/inode_cached_test.go | 10
-rw-r--r--  pkg/sentry/fs/gofer/fs.go | 22
-rw-r--r--  pkg/sentry/fs/gofer/session.go | 27
-rw-r--r--  pkg/sentry/fs/host/inode.go | 6
-rw-r--r--  pkg/sentry/fs/host/socket.go | 8
-rw-r--r--  pkg/sentry/fs/host/socket_iovec.go | 18
-rw-r--r--  pkg/sentry/fs/host/socket_unsafe.go | 9
-rw-r--r--  pkg/sentry/fs/mounts.go | 16
-rw-r--r--  pkg/sentry/fs/proc/net.go | 4
-rw-r--r--  pkg/sentry/fs/ramfs/dir.go | 23
-rw-r--r--  pkg/sentry/fs/tmpfs/tmpfs.go | 6
-rw-r--r--  pkg/sentry/fs/tty/BUILD | 1
-rw-r--r--  pkg/sentry/fs/tty/dir.go | 3
-rw-r--r--  pkg/sentry/fs/tty/master.go | 17
-rw-r--r--  pkg/sentry/fs/tty/slave.go | 13
-rw-r--r--  pkg/sentry/fs/tty/terminal.go | 92
-rw-r--r--  pkg/sentry/fsimpl/ext/BUILD | 86
-rw-r--r--  pkg/sentry/fsimpl/ext/README.md | 117
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/README.md (renamed from pkg/sentry/fs/ext/assets/README.md) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/bigfile.txt (renamed from pkg/sentry/fs/ext/assets/bigfile.txt) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/file.txt (renamed from pkg/sentry/fs/ext/assets/file.txt) | 0
l---------  pkg/sentry/fsimpl/ext/assets/symlink.txt (renamed from pkg/sentry/fs/ext/assets/symlink.txt) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/tiny.ext2 (renamed from pkg/sentry/fs/ext/assets/tiny.ext2) | bin 65536 -> 65536 bytes
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/tiny.ext3 (renamed from pkg/sentry/fs/ext/assets/tiny.ext3) | bin 65536 -> 65536 bytes
-rw-r--r--  pkg/sentry/fsimpl/ext/assets/tiny.ext4 (renamed from pkg/sentry/fs/ext/assets/tiny.ext4) | bin 65536 -> 65536 bytes
-rw-r--r--  pkg/sentry/fsimpl/ext/benchmark/BUILD | 16
-rw-r--r--  pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go | 193
-rwxr-xr-x  pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh | 72
-rw-r--r--  pkg/sentry/fsimpl/ext/block_map_file.go | 200
-rw-r--r--  pkg/sentry/fsimpl/ext/block_map_test.go | 157
-rw-r--r--  pkg/sentry/fsimpl/ext/dentry.go (renamed from pkg/sentry/fs/ext/dentry.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/directory.go | 308
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/BUILD (renamed from pkg/sentry/fs/ext/disklayout/BUILD) | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group.go (renamed from pkg/sentry/fs/ext/disklayout/block_group.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_32.go (renamed from pkg/sentry/fs/ext/disklayout/block_group_32.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_64.go (renamed from pkg/sentry/fs/ext/disklayout/block_group_64.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/block_group_test.go (renamed from pkg/sentry/fs/ext/disklayout/block_group_test.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent.go (renamed from pkg/sentry/fs/ext/disklayout/dirent.go) | 3
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_new.go (renamed from pkg/sentry/fs/ext/disklayout/dirent_new.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_old.go (renamed from pkg/sentry/fs/ext/disklayout/dirent_old.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/dirent_test.go (renamed from pkg/sentry/fs/ext/disklayout/dirent_test.go) | 6
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/disklayout.go (renamed from pkg/sentry/fs/ext/disklayout/disklayout.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/extent.go (renamed from pkg/sentry/fs/ext/disklayout/extent.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/extent_test.go (renamed from pkg/sentry/fs/ext/disklayout/extent_test.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode.go (renamed from pkg/sentry/fs/ext/disklayout/inode.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_new.go (renamed from pkg/sentry/fs/ext/disklayout/inode_new.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_old.go (renamed from pkg/sentry/fs/ext/disklayout/inode_old.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/inode_test.go (renamed from pkg/sentry/fs/ext/disklayout/inode_test.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock.go (renamed from pkg/sentry/fs/ext/disklayout/superblock.go) | 2
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_32.go (renamed from pkg/sentry/fs/ext/disklayout/superblock_32.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_64.go (renamed from pkg/sentry/fs/ext/disklayout/superblock_64.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_old.go (renamed from pkg/sentry/fs/ext/disklayout/superblock_old.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/superblock_test.go (renamed from pkg/sentry/fs/ext/disklayout/superblock_test.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/disklayout/test_utils.go (renamed from pkg/sentry/fs/ext/disklayout/test_utils.go) | 0
-rw-r--r--  pkg/sentry/fsimpl/ext/ext.go | 135
-rw-r--r--  pkg/sentry/fsimpl/ext/ext_test.go | 917
-rw-r--r--  pkg/sentry/fsimpl/ext/extent_file.go | 237
-rw-r--r--  pkg/sentry/fsimpl/ext/extent_test.go (renamed from pkg/sentry/fs/ext/extent_test.go) | 136
-rw-r--r--  pkg/sentry/fsimpl/ext/file_description.go | 86
-rw-r--r--  pkg/sentry/fsimpl/ext/filesystem.go | 443
-rw-r--r--  pkg/sentry/fsimpl/ext/inode.go | 219
-rw-r--r--  pkg/sentry/fsimpl/ext/regular_file.go | 159
-rw-r--r--  pkg/sentry/fsimpl/ext/symlink.go | 111
-rw-r--r--  pkg/sentry/fsimpl/ext/utils.go (renamed from pkg/sentry/fs/ext/utils.go) | 35
-rw-r--r--  pkg/sentry/fsimpl/memfs/BUILD | 4
-rw-r--r--  pkg/sentry/fsimpl/memfs/directory.go | 55
-rw-r--r--  pkg/sentry/fsimpl/memfs/filesystem.go | 68
-rw-r--r--  pkg/sentry/fsimpl/memfs/memfs.go | 93
-rw-r--r--  pkg/sentry/fsimpl/memfs/regular_file.go | 7
-rw-r--r--  pkg/sentry/fsimpl/memfs/symlink.go | 4
-rw-r--r--  pkg/sentry/fsimpl/proc/BUILD | 49
-rw-r--r--  pkg/sentry/fsimpl/proc/filesystems.go | 25
-rw-r--r--  pkg/sentry/fsimpl/proc/loadavg.go | 40
-rw-r--r--  pkg/sentry/fsimpl/proc/meminfo.go | 77
-rw-r--r--  pkg/sentry/fsimpl/proc/mounts.go | 33
-rw-r--r--  pkg/sentry/fsimpl/proc/net.go | 338
-rw-r--r--  pkg/sentry/fsimpl/proc/net_test.go | 78
-rw-r--r--  pkg/sentry/fsimpl/proc/proc.go | 16
-rw-r--r--  pkg/sentry/fsimpl/proc/stat.go | 127
-rw-r--r--  pkg/sentry/fsimpl/proc/sys.go | 51
-rw-r--r--  pkg/sentry/fsimpl/proc/task.go | 261
-rw-r--r--  pkg/sentry/fsimpl/proc/version.go | 68
-rw-r--r--  pkg/sentry/inet/inet.go | 52
-rw-r--r--  pkg/sentry/inet/test_stack.go | 10
-rw-r--r--  pkg/sentry/kernel/BUILD | 1
-rw-r--r--  pkg/sentry/kernel/kernel.go | 157
-rw-r--r--  pkg/sentry/kernel/sessions.go | 12
-rw-r--r--  pkg/sentry/kernel/task_block.go | 12
-rw-r--r--  pkg/sentry/kernel/task_context.go | 11
-rw-r--r--  pkg/sentry/kernel/task_start.go | 3
-rw-r--r--  pkg/sentry/kernel/thread_group.go | 179
-rw-r--r--  pkg/sentry/kernel/tty.go | 28
-rw-r--r--  pkg/sentry/loader/elf.go | 2
-rw-r--r--  pkg/sentry/loader/loader.go | 107
-rw-r--r--  pkg/sentry/mm/procfs.go | 92
-rw-r--r--  pkg/sentry/pgalloc/pgalloc.go | 13
-rw-r--r--  pkg/sentry/platform/ptrace/BUILD | 6
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace.go | 2
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace_amd64.go | 33
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace_arm64.go | 30
-rw-r--r--  pkg/sentry/platform/ptrace/ptrace_unsafe.go | 46
-rw-r--r--  pkg/sentry/platform/ptrace/stub_arm64.s | 106
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess.go | 15
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_amd64.go | 24
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_arm64.go | 126
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux.go | 2
-rw-r--r--  pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go (renamed from pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go) | 3
-rw-r--r--  pkg/sentry/safemem/io.go | 55
-rw-r--r--  pkg/sentry/socket/BUILD | 1
-rw-r--r--  pkg/sentry/socket/epsocket/BUILD | 2
-rw-r--r--  pkg/sentry/socket/epsocket/epsocket.go | 136
-rw-r--r--  pkg/sentry/socket/epsocket/stack.go | 57
-rw-r--r--  pkg/sentry/socket/hostinet/socket.go | 37
-rw-r--r--  pkg/sentry/socket/hostinet/socket_unsafe.go | 10
-rw-r--r--  pkg/sentry/socket/hostinet/stack.go | 93
-rw-r--r--  pkg/sentry/socket/netfilter/BUILD | 24
-rw-r--r--  pkg/sentry/socket/netfilter/netfilter.go | 286
-rw-r--r--  pkg/sentry/socket/netlink/route/protocol.go | 62
-rw-r--r--  pkg/sentry/socket/netlink/socket.go | 45
-rw-r--r--  pkg/sentry/socket/rpcinet/notifier/BUILD | 3
-rw-r--r--  pkg/sentry/socket/rpcinet/notifier/notifier.go | 3
-rw-r--r--  pkg/sentry/socket/rpcinet/socket.go | 23
-rw-r--r--  pkg/sentry/socket/rpcinet/stack.go | 31
-rw-r--r--  pkg/sentry/socket/socket.go | 40
-rw-r--r--  pkg/sentry/socket/unix/io.go | 4
-rw-r--r--  pkg/sentry/socket/unix/transport/connectioned.go | 2
-rw-r--r--  pkg/sentry/socket/unix/transport/connectionless.go | 2
-rw-r--r--  pkg/sentry/socket/unix/transport/unix.go | 36
-rw-r--r--  pkg/sentry/socket/unix/unix.go | 20
-rw-r--r--  pkg/sentry/state/BUILD | 1
-rw-r--r--  pkg/sentry/state/state.go | 5
-rw-r--r--  pkg/sentry/strace/socket.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/error.go | 4
-rw-r--r--  pkg/sentry/syscalls/linux/linux64.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/sys_epoll.go | 19
-rw-r--r--  pkg/sentry/syscalls/linux/sys_getdents.go | 24
-rw-r--r--  pkg/sentry/syscalls/linux/sys_mount.go | 12
-rw-r--r--  pkg/sentry/syscalls/linux/sys_read.go | 3
-rw-r--r--  pkg/sentry/syscalls/linux/sys_socket.go | 6
-rw-r--r--  pkg/sentry/syscalls/linux/sys_splice.go | 27
-rw-r--r--  pkg/sentry/syscalls/linux/sys_thread.go | 2
-rw-r--r--  pkg/sentry/syscalls/linux/sys_write.go | 3
-rw-r--r--  pkg/sentry/vfs/BUILD | 10
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util.go | 122
-rw-r--r--  pkg/sentry/vfs/file_description_impl_util_test.go | 141
-rw-r--r--  pkg/sentry/vfs/testutil.go | 139
156 files changed, 7417 insertions, 1561 deletions
diff --git a/pkg/sentry/control/proc.go b/pkg/sentry/control/proc.go
index 3f9772b87..c35faeb4c 100644
--- a/pkg/sentry/control/proc.go
+++ b/pkg/sentry/control/proc.go
@@ -56,15 +56,10 @@ type ExecArgs struct {
// MountNamespace is the mount namespace to execute the new process in.
// A reference on MountNamespace must be held for the lifetime of the
- // ExecArgs. If MountNamespace is nil, it will default to the kernel's
- // root MountNamespace.
+ // ExecArgs. If MountNamespace is nil, it will default to the init
+ // process's MountNamespace.
MountNamespace *fs.MountNamespace
- // Root defines the root directory for the new process. A reference on
- // Root must be held for the lifetime of the ExecArgs. If Root is nil,
- // it will default to the VFS root.
- Root *fs.Dirent
-
// WorkingDirectory defines the working directory for the new process.
WorkingDirectory string `json:"wd"`
@@ -155,7 +150,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
Envv: args.Envv,
WorkingDirectory: args.WorkingDirectory,
MountNamespace: args.MountNamespace,
- Root: args.Root,
Credentials: creds,
FDTable: fdTable,
Umask: 0022,
@@ -167,11 +161,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
ContainerID: args.ContainerID,
PIDNamespace: args.PIDNamespace,
}
- if initArgs.Root != nil {
- // initArgs must hold a reference on Root, which will be
- // donated to the new process in CreateProcess.
- initArgs.Root.IncRef()
- }
if initArgs.MountNamespace != nil {
// initArgs must hold a reference on MountNamespace, which will
// be donated to the new process in CreateProcess.
@@ -184,7 +173,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
paths := fs.GetPath(initArgs.Envv)
mns := initArgs.MountNamespace
if mns == nil {
- mns = proc.Kernel.RootMountNamespace()
+ mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
}
f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
if err != nil {
diff --git a/pkg/sentry/fs/attr.go b/pkg/sentry/fs/attr.go
index 9fc6a5bc2..4f3d6410e 100644
--- a/pkg/sentry/fs/attr.go
+++ b/pkg/sentry/fs/attr.go
@@ -111,6 +111,50 @@ func (n InodeType) LinuxType() uint32 {
}
}
+// ToDirentType converts an InodeType to a linux dirent type field.
+func ToDirentType(nodeType InodeType) uint8 {
+ switch nodeType {
+ case RegularFile, SpecialFile:
+ return linux.DT_REG
+ case Symlink:
+ return linux.DT_LNK
+ case Directory, SpecialDirectory:
+ return linux.DT_DIR
+ case Pipe:
+ return linux.DT_FIFO
+ case CharacterDevice:
+ return linux.DT_CHR
+ case BlockDevice:
+ return linux.DT_BLK
+ case Socket:
+ return linux.DT_SOCK
+ default:
+ return linux.DT_UNKNOWN
+ }
+}
+
+// ToInodeType converts a linux file type to InodeType.
+func ToInodeType(linuxFileType linux.FileMode) InodeType {
+ switch linuxFileType {
+ case linux.ModeRegular:
+ return RegularFile
+ case linux.ModeDirectory:
+ return Directory
+ case linux.ModeSymlink:
+ return Symlink
+ case linux.ModeNamedPipe:
+ return Pipe
+ case linux.ModeCharacterDevice:
+ return CharacterDevice
+ case linux.ModeBlockDevice:
+ return BlockDevice
+ case linux.ModeSocket:
+ return Socket
+ default:
+ panic(fmt.Sprintf("unknown file mode: %d", linuxFileType))
+ }
+}
+
// StableAttr contains Inode attributes that will be stable throughout the
// lifetime of the Inode.
//
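ToDirentType and ToInodeType are pure mappings between the sentry's InodeType and the type encodings Linux uses in dirents and in inode mode bits. A minimal usage sketch, assuming only the pkg/abi/linux and pkg/sentry/fs import paths already used by this file (the program itself is hypothetical):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/fs"
)

func main() {
	// InodeType -> the d_type value reported by getdents(2).
	fmt.Printf("symlink d_type = %d\n", fs.ToDirentType(fs.Symlink)) // 10 (DT_LNK)

	// Linux file-type mode bits (e.g. from an on-disk inode) -> InodeType.
	fmt.Println(fs.ToInodeType(linux.ModeDirectory) == fs.Directory) // true
}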
diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go
index 51b4c7ee1..dd427de5d 100644
--- a/pkg/sentry/fs/context.go
+++ b/pkg/sentry/fs/context.go
@@ -112,3 +112,27 @@ func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter {
}
return nil
}
+
+type rootContext struct {
+ context.Context
+ root *Dirent
+}
+
+// WithRoot returns a copy of ctx with the given root.
+func WithRoot(ctx context.Context, root *Dirent) context.Context {
+ return &rootContext{
+ Context: ctx,
+ root: root,
+ }
+}
+
+// Value implements Context.Value.
+func (rc rootContext) Value(key interface{}) interface{} {
+ switch key {
+ case CtxRoot:
+ rc.root.IncRef()
+ return rc.root
+ default:
+ return rc.Context.Value(key)
+ }
+}
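WithRoot layers a single value, CtxRoot, over an existing context, and the overriding Value method hands out a fresh reference on each lookup. A caller-side sketch, assuming the pkg/sentry/context and pkg/sentry/fs imports and that fs.RootFromContext keeps returning the CtxRoot dirent with an extra reference (the helper below is hypothetical):

func resolveUnderRoot(ctx context.Context, root *fs.Dirent) {
	// Subsequent CtxRoot lookups on ctx now yield root rather than whatever
	// root the task or kernel would otherwise supply.
	ctx = fs.WithRoot(ctx, root)
	if r := fs.RootFromContext(ctx); r != nil {
		defer r.DecRef()
		// ... resolve paths relative to r ...
	}
}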
diff --git a/pkg/sentry/fs/ext/BUILD b/pkg/sentry/fs/ext/BUILD
deleted file mode 100644
index 2c15875f5..000000000
--- a/pkg/sentry/fs/ext/BUILD
+++ /dev/null
@@ -1,54 +0,0 @@
-package(licenses = ["notice"])
-
-load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
-
-go_library(
- name = "ext",
- srcs = [
- "dentry.go",
- "ext.go",
- "filesystem.go",
- "inode.go",
- "utils.go",
- ],
- importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext",
- visibility = ["//pkg/sentry:internal"],
- deps = [
- "//pkg/abi/linux",
- "//pkg/binary",
- "//pkg/sentry/context",
- "//pkg/sentry/fs/ext/disklayout",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/vfs",
- "//pkg/syserror",
- ],
-)
-
-go_test(
- name = "ext_test",
- size = "small",
- srcs = [
- "ext_test.go",
- "extent_test.go",
- ],
- data = [
- "//pkg/sentry/fs/ext:assets/bigfile.txt",
- "//pkg/sentry/fs/ext:assets/file.txt",
- "//pkg/sentry/fs/ext:assets/tiny.ext2",
- "//pkg/sentry/fs/ext:assets/tiny.ext3",
- "//pkg/sentry/fs/ext:assets/tiny.ext4",
- ],
- embed = [":ext"],
- deps = [
- "//pkg/abi/linux",
- "//pkg/binary",
- "//pkg/sentry/context",
- "//pkg/sentry/context/contexttest",
- "//pkg/sentry/fs/ext/disklayout",
- "//pkg/sentry/kernel/auth",
- "//pkg/sentry/vfs",
- "//runsc/test/testutil",
- "@com_github_google_go-cmp//cmp:go_default_library",
- "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
- ],
-)
diff --git a/pkg/sentry/fs/ext/ext.go b/pkg/sentry/fs/ext/ext.go
deleted file mode 100644
index 10e235fb1..000000000
--- a/pkg/sentry/fs/ext/ext.go
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package ext implements readonly ext(2/3/4) filesystems.
-package ext
-
-import (
- "errors"
- "fmt"
- "io"
- "os"
-
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// filesystemType implements vfs.FilesystemType.
-type filesystemType struct{}
-
-// Compiles only if filesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*filesystemType)(nil)
-
-// getDeviceFd returns the read seeker to the underlying device.
-// Currently there are two ways of mounting an ext(2/3/4) fs:
-// 1. Specify a mount with our internal special MountType in the OCI spec.
-// 2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
- if opts.InternalData == nil {
- // User mount call.
- // TODO(b/134676337): Open the device specified by `source` and return that.
- panic("unimplemented")
- }
-
- // NewFilesystem call originated from within the sentry.
- fd, ok := opts.InternalData.(uintptr)
- if !ok {
- return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
- }
-
- // We do not close this file because that would close the underlying device
- // file descriptor (which is required for reading the fs from disk).
- // TODO(b/134676337): Use pkg/fd instead.
- deviceFile := os.NewFile(fd, source)
- if deviceFile == nil {
- return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
- }
-
- return deviceFile, nil
-}
-
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (fstype filesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- dev, err := getDeviceFd(source, opts)
- if err != nil {
- return nil, nil, err
- }
-
- fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
- fs.vfsfs.Init(&fs)
- fs.sb, err = readSuperBlock(dev)
- if err != nil {
- return nil, nil, err
- }
-
- if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
- // mount(2) specifies that EINVAL should be returned if the superblock is
- // invalid.
- return nil, nil, syserror.EINVAL
- }
-
- fs.bgs, err = readBlockGroups(dev, fs.sb)
- if err != nil {
- return nil, nil, err
- }
-
- rootInode, err := fs.getOrCreateInode(disklayout.RootDirInode)
- if err != nil {
- return nil, nil, err
- }
-
- return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
-}
diff --git a/pkg/sentry/fs/ext/ext_test.go b/pkg/sentry/fs/ext/ext_test.go
deleted file mode 100644
index ee7f7907c..000000000
--- a/pkg/sentry/fs/ext/ext_test.go
+++ /dev/null
@@ -1,407 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "fmt"
- "os"
- "path"
- "testing"
-
- "github.com/google/go-cmp/cmp"
- "gvisor.dev/gvisor/pkg/abi/linux"
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
- "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
-
- "gvisor.dev/gvisor/runsc/test/testutil"
-)
-
-const (
- assetsDir = "pkg/sentry/fs/ext/assets"
-)
-
-var (
- ext2ImagePath = path.Join(assetsDir, "tiny.ext2")
- ext3ImagePath = path.Join(assetsDir, "tiny.ext3")
- ext4ImagePath = path.Join(assetsDir, "tiny.ext4")
-)
-
-func beginning(_ uint64) uint64 {
- return 0
-}
-
-func middle(i uint64) uint64 {
- return i / 2
-}
-
-func end(i uint64) uint64 {
- return i
-}
-
-// setUp opens imagePath as an ext Filesystem and returns all necessary
-// elements required to run tests. If error is non-nil, it also returns a tear
-// down function which must be called after the test is run for clean up.
-func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *vfs.Dentry, func(), error) {
- localImagePath, err := testutil.FindFile(imagePath)
- if err != nil {
- return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
- }
-
- f, err := os.Open(localImagePath)
- if err != nil {
- return nil, nil, nil, nil, err
- }
-
- // Mount the ext4 fs and retrieve the inode structure for the file.
- mockCtx := contexttest.Context(t)
- fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
- if err != nil {
- f.Close()
- return nil, nil, nil, nil, err
- }
-
- tearDown := func() {
- if err := f.Close(); err != nil {
- t.Fatalf("tearDown failed: %v", err)
- }
- }
- return mockCtx, fs, d, tearDown, nil
-}
-
-// TestRootDir tests that the root directory inode is correctly initialized and
-// returned from setUp.
-func TestRootDir(t *testing.T) {
- type inodeProps struct {
- Mode linux.FileMode
- UID auth.KUID
- GID auth.KGID
- Size uint64
- InodeSize uint16
- Links uint16
- Flags disklayout.InodeFlags
- }
-
- type rootDirTest struct {
- name string
- image string
- wantInode inodeProps
- }
-
- tests := []rootDirTest{
- {
- name: "ext4 root dir",
- image: ext4ImagePath,
- wantInode: inodeProps{
- Mode: linux.ModeDirectory | 0755,
- Size: 0x400,
- InodeSize: 0x80,
- Links: 3,
- Flags: disklayout.InodeFlags{Extents: true},
- },
- },
- {
- name: "ext3 root dir",
- image: ext3ImagePath,
- wantInode: inodeProps{
- Mode: linux.ModeDirectory | 0755,
- Size: 0x400,
- InodeSize: 0x80,
- Links: 3,
- },
- },
- {
- name: "ext2 root dir",
- image: ext2ImagePath,
- wantInode: inodeProps{
- Mode: linux.ModeDirectory | 0755,
- Size: 0x400,
- InodeSize: 0x80,
- Links: 3,
- },
- },
- }
-
- for _, test := range tests {
- t.Run(test.name, func(t *testing.T) {
- _, _, vfsd, tearDown, err := setUp(t, test.image)
- if err != nil {
- t.Fatalf("setUp failed: %v", err)
- }
- defer tearDown()
-
- d, ok := vfsd.Impl().(*dentry)
- if !ok {
- t.Fatalf("ext dentry of incorrect type: %T", vfsd.Impl())
- }
-
- // Offload inode contents into local structs for comparison.
- gotInode := inodeProps{
- Mode: d.inode.diskInode.Mode(),
- UID: d.inode.diskInode.UID(),
- GID: d.inode.diskInode.GID(),
- Size: d.inode.diskInode.Size(),
- InodeSize: d.inode.diskInode.InodeSize(),
- Links: d.inode.diskInode.LinksCount(),
- Flags: d.inode.diskInode.Flags(),
- }
-
- if diff := cmp.Diff(gotInode, test.wantInode); diff != "" {
- t.Errorf("inode mismatch (-want +got):\n%s", diff)
- }
- })
- }
-}
-
-// TestFilesystemInit tests that the filesystem superblock and block group
-// descriptors are correctly read in and initialized.
-func TestFilesystemInit(t *testing.T) {
- // sb only contains the immutable properties of the superblock.
- type sb struct {
- InodesCount uint32
- BlocksCount uint64
- MaxMountCount uint16
- FirstDataBlock uint32
- BlockSize uint64
- BlocksPerGroup uint32
- ClusterSize uint64
- ClustersPerGroup uint32
- InodeSize uint16
- InodesPerGroup uint32
- BgDescSize uint16
- Magic uint16
- Revision disklayout.SbRevision
- CompatFeatures disklayout.CompatFeatures
- IncompatFeatures disklayout.IncompatFeatures
- RoCompatFeatures disklayout.RoCompatFeatures
- }
-
- // bg only contains the immutable properties of the block group descriptor.
- type bg struct {
- InodeTable uint64
- BlockBitmap uint64
- InodeBitmap uint64
- ExclusionBitmap uint64
- Flags disklayout.BGFlags
- }
-
- type fsInitTest struct {
- name string
- image string
- wantSb sb
- wantBgs []bg
- }
-
- tests := []fsInitTest{
- {
- name: "ext4 filesystem init",
- image: ext4ImagePath,
- wantSb: sb{
- InodesCount: 0x10,
- BlocksCount: 0x40,
- MaxMountCount: 0xffff,
- FirstDataBlock: 0x1,
- BlockSize: 0x400,
- BlocksPerGroup: 0x2000,
- ClusterSize: 0x400,
- ClustersPerGroup: 0x2000,
- InodeSize: 0x80,
- InodesPerGroup: 0x10,
- BgDescSize: 0x40,
- Magic: linux.EXT_SUPER_MAGIC,
- Revision: disklayout.DynamicRev,
- CompatFeatures: disklayout.CompatFeatures{
- ExtAttr: true,
- ResizeInode: true,
- DirIndex: true,
- },
- IncompatFeatures: disklayout.IncompatFeatures{
- DirentFileType: true,
- Extents: true,
- Is64Bit: true,
- FlexBg: true,
- },
- RoCompatFeatures: disklayout.RoCompatFeatures{
- Sparse: true,
- LargeFile: true,
- HugeFile: true,
- DirNlink: true,
- ExtraIsize: true,
- MetadataCsum: true,
- },
- },
- wantBgs: []bg{
- {
- InodeTable: 0x23,
- BlockBitmap: 0x3,
- InodeBitmap: 0x13,
- Flags: disklayout.BGFlags{
- InodeZeroed: true,
- },
- },
- },
- },
- {
- name: "ext3 filesystem init",
- image: ext3ImagePath,
- wantSb: sb{
- InodesCount: 0x10,
- BlocksCount: 0x40,
- MaxMountCount: 0xffff,
- FirstDataBlock: 0x1,
- BlockSize: 0x400,
- BlocksPerGroup: 0x2000,
- ClusterSize: 0x400,
- ClustersPerGroup: 0x2000,
- InodeSize: 0x80,
- InodesPerGroup: 0x10,
- BgDescSize: 0x20,
- Magic: linux.EXT_SUPER_MAGIC,
- Revision: disklayout.DynamicRev,
- CompatFeatures: disklayout.CompatFeatures{
- ExtAttr: true,
- ResizeInode: true,
- DirIndex: true,
- },
- IncompatFeatures: disklayout.IncompatFeatures{
- DirentFileType: true,
- },
- RoCompatFeatures: disklayout.RoCompatFeatures{
- Sparse: true,
- LargeFile: true,
- },
- },
- wantBgs: []bg{
- {
- InodeTable: 0x5,
- BlockBitmap: 0x3,
- InodeBitmap: 0x4,
- Flags: disklayout.BGFlags{
- InodeZeroed: true,
- },
- },
- },
- },
- {
- name: "ext2 filesystem init",
- image: ext2ImagePath,
- wantSb: sb{
- InodesCount: 0x10,
- BlocksCount: 0x40,
- MaxMountCount: 0xffff,
- FirstDataBlock: 0x1,
- BlockSize: 0x400,
- BlocksPerGroup: 0x2000,
- ClusterSize: 0x400,
- ClustersPerGroup: 0x2000,
- InodeSize: 0x80,
- InodesPerGroup: 0x10,
- BgDescSize: 0x20,
- Magic: linux.EXT_SUPER_MAGIC,
- Revision: disklayout.DynamicRev,
- CompatFeatures: disklayout.CompatFeatures{
- ExtAttr: true,
- ResizeInode: true,
- DirIndex: true,
- },
- IncompatFeatures: disklayout.IncompatFeatures{
- DirentFileType: true,
- },
- RoCompatFeatures: disklayout.RoCompatFeatures{
- Sparse: true,
- LargeFile: true,
- },
- },
- wantBgs: []bg{
- {
- InodeTable: 0x5,
- BlockBitmap: 0x3,
- InodeBitmap: 0x4,
- Flags: disklayout.BGFlags{
- InodeZeroed: true,
- },
- },
- },
- },
- }
-
- for _, test := range tests {
- t.Run(test.name, func(t *testing.T) {
- _, vfsfs, _, tearDown, err := setUp(t, test.image)
- if err != nil {
- t.Fatalf("setUp failed: %v", err)
- }
- defer tearDown()
-
- fs, ok := vfsfs.Impl().(*filesystem)
- if !ok {
- t.Fatalf("ext filesystem of incorrect type: %T", vfsfs.Impl())
- }
-
- // Offload superblock and block group descriptors contents into
- // local structs for comparison.
- totalFreeInodes := uint32(0)
- totalFreeBlocks := uint64(0)
- gotSb := sb{
- InodesCount: fs.sb.InodesCount(),
- BlocksCount: fs.sb.BlocksCount(),
- MaxMountCount: fs.sb.MaxMountCount(),
- FirstDataBlock: fs.sb.FirstDataBlock(),
- BlockSize: fs.sb.BlockSize(),
- BlocksPerGroup: fs.sb.BlocksPerGroup(),
- ClusterSize: fs.sb.ClusterSize(),
- ClustersPerGroup: fs.sb.ClustersPerGroup(),
- InodeSize: fs.sb.InodeSize(),
- InodesPerGroup: fs.sb.InodesPerGroup(),
- BgDescSize: fs.sb.BgDescSize(),
- Magic: fs.sb.Magic(),
- Revision: fs.sb.Revision(),
- CompatFeatures: fs.sb.CompatibleFeatures(),
- IncompatFeatures: fs.sb.IncompatibleFeatures(),
- RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(),
- }
- gotNumBgs := len(fs.bgs)
- gotBgs := make([]bg, gotNumBgs)
- for i := 0; i < gotNumBgs; i++ {
- gotBgs[i].InodeTable = fs.bgs[i].InodeTable()
- gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap()
- gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap()
- gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap()
- gotBgs[i].Flags = fs.bgs[i].Flags()
-
- totalFreeInodes += fs.bgs[i].FreeInodesCount()
- totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount())
- }
-
- if diff := cmp.Diff(gotSb, test.wantSb); diff != "" {
- t.Errorf("superblock mismatch (-want +got):\n%s", diff)
- }
-
- if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" {
- t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff)
- }
-
- if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" {
- t.Errorf("total free inodes mismatch (-want +got):\n%s", diff)
- }
-
- if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" {
- t.Errorf("total free blocks mismatch (-want +got):\n%s", diff)
- }
- })
- }
-}
diff --git a/pkg/sentry/fs/ext/filesystem.go b/pkg/sentry/fs/ext/filesystem.go
deleted file mode 100644
index 7150e75a5..000000000
--- a/pkg/sentry/fs/ext/filesystem.go
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "io"
- "sync"
-
- "gvisor.dev/gvisor/pkg/sentry/context"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
- "gvisor.dev/gvisor/pkg/sentry/vfs"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
- // TODO(b/134676337): Remove when all methods have been implemented.
- vfs.FilesystemImpl
-
- vfsfs vfs.Filesystem
-
- // mu serializes changes to the Dentry tree and the usage of the read seeker.
- mu sync.Mutex
-
- // dev is the ReadSeeker for the underlying fs device. It is protected by mu.
- //
- // The ext filesystems aim to maximize locality, i.e. place all the data
- // blocks of a file close together. On a spinning disk, locality reduces the
- // amount of movement of the head hence speeding up IO operations. On an SSD
- // there are no moving parts but locality increases the size of each transer
- // request. Hence, having mutual exclusion on the read seeker while reading a
- // file *should* help in achieving the intended performance gains.
- //
- // Note: This synchronization was not coupled with the ReadSeeker itself
- // because we want to synchronize across read/seek operations for the
- // performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
- dev io.ReadSeeker
-
- // inodeCache maps absolute inode numbers to the corresponding Inode struct.
- // Inodes should be removed from this once their reference count hits 0.
- //
- // Protected by mu because every addition and removal from this corresponds to
- // a change in the dentry tree.
- inodeCache map[uint32]*inode
-
- // sb represents the filesystem superblock. Immutable after initialization.
- sb disklayout.SuperBlock
-
- // bgs represents all the block group descriptors for the filesystem.
- // Immutable after initialization.
- bgs []disklayout.BlockGroup
-}
-
-// Compiles only if filesystem implements vfs.FilesystemImpl.
-var _ vfs.FilesystemImpl = (*filesystem)(nil)
-
-// getOrCreateInode gets the inode corresponding to the inode number passed in.
-// It creates a new one with the given inode number if one does not exist.
-//
-// Preconditions: must be holding fs.mu.
-func (fs *filesystem) getOrCreateInode(inodeNum uint32) (*inode, error) {
- if in, ok := fs.inodeCache[inodeNum]; ok {
- return in, nil
- }
-
- in, err := newInode(fs.dev, fs.sb, fs.bgs, inodeNum)
- if err != nil {
- return nil, err
- }
-
- fs.inodeCache[inodeNum] = in
- return in, nil
-}
-
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
-}
-
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
- // This is a readonly filesystem for now.
- return nil
-}
-
-// The vfs.FilesystemImpl functions below return EROFS because their respective
-// man pages say that EROFS must be returned if the path resolves to a file on
-// a read-only filesystem.
-
-// TODO(b/134676337): Implement path traversal and return EROFS only if the
-// path resolves to a Dentry within ext fs.
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
- return syserror.EROFS
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
- return syserror.EROFS
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
- return syserror.EROFS
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- return syserror.EROFS
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
- return syserror.EROFS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
- return syserror.EROFS
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
- return syserror.EROFS
-}
diff --git a/pkg/sentry/fs/ext/inode.go b/pkg/sentry/fs/ext/inode.go
deleted file mode 100644
index df1ea0bda..000000000
--- a/pkg/sentry/fs/ext/inode.go
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package ext
-
-import (
- "io"
- "sync/atomic"
-
- "gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
- "gvisor.dev/gvisor/pkg/syserror"
-)
-
-// inode represents an ext inode.
-type inode struct {
- // refs is a reference count. refs is accessed using atomic memory operations.
- refs int64
-
- // inodeNum is the inode number of this inode on disk. This is used to
- // identify inodes within the ext filesystem.
- inodeNum uint32
-
- // diskInode gives us access to the inode struct on disk. Immutable.
- diskInode disklayout.Inode
-
- // root is the root extent node. This lives in the 60 byte diskInode.Blocks().
- // Immutable. Nil if the inode does not use extents.
- root *disklayout.ExtentNode
-}
-
-// incRef increments the inode ref count.
-func (in *inode) incRef() {
- atomic.AddInt64(&in.refs, 1)
-}
-
-// tryIncRef tries to increment the ref count. Returns true if successful.
-func (in *inode) tryIncRef() bool {
- for {
- refs := atomic.LoadInt64(&in.refs)
- if refs == 0 {
- return false
- }
- if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) {
- return true
- }
- }
-}
-
-// decRef decrements the inode ref count and releases the inode resources if
-// the ref count hits 0.
-//
-// Preconditions: Must have locked fs.mu.
-func (in *inode) decRef(fs *filesystem) {
- if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
- delete(fs.inodeCache, in.inodeNum)
- } else if refs < 0 {
- panic("ext.inode.decRef() called without holding a reference")
- }
-}
-
-// newInode is the inode constructor. Reads the inode off disk. Identifies
-// inodes based on the absolute inode number on disk.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func newInode(dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
- if inodeNum == 0 {
- panic("inode number 0 on ext filesystems is not possible")
- }
-
- in := &inode{refs: 1, inodeNum: inodeNum}
- inodeRecordSize := sb.InodeSize()
- if inodeRecordSize == disklayout.OldInodeSize {
- in.diskInode = &disklayout.InodeOld{}
- } else {
- in.diskInode = &disklayout.InodeNew{}
- }
-
- // Calculate where the inode is actually placed.
- inodesPerGrp := sb.InodesPerGroup()
- blkSize := sb.BlockSize()
- inodeTableOff := bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize
- inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp))
-
- // Read it from disk and figure out which type of inode this is.
- if err := readFromDisk(dev, int64(inodeOff), in.diskInode); err != nil {
- return nil, err
- }
-
- if in.diskInode.Flags().Extents {
- in.buildExtTree(dev, blkSize)
- }
-
- return in, nil
-}
-
-// getBGNum returns the block group number that a given inode belongs to.
-func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
- return (inodeNum - 1) / inodesPerGrp
-}
-
-// getBGOff returns the offset at which the given inode lives in the block
-// group's inode table, i.e. the index of the inode in the inode table.
-func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 {
- return (inodeNum - 1) % inodesPerGrp
-}
-
-// buildExtTree builds the extent tree by reading it from disk by doing
-// running a simple DFS. It first reads the root node from the inode struct in
-// memory. Then it recursively builds the rest of the tree by reading it off
-// disk.
-//
-// Preconditions:
-// - Must hold the mutex of the filesystem containing dev.
-// - Inode flag InExtents must be set.
-func (in *inode) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
- rootNodeData := in.diskInode.Data()
-
- var rootHeader disklayout.ExtentHeader
- binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &rootHeader)
-
- // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
- if rootHeader.NumEntries > 4 {
- // read(2) specifies that EINVAL should be returned if the file is unsuitable
- // for reading.
- return syserror.EINVAL
- }
-
- rootEntries := make([]disklayout.ExtentEntryPair, rootHeader.NumEntries)
- for i, off := uint16(0), disklayout.ExtentStructsSize; i < rootHeader.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
- var curEntry disklayout.ExtentEntry
- if rootHeader.Height == 0 {
- // Leaf node.
- curEntry = &disklayout.Extent{}
- } else {
- // Internal node.
- curEntry = &disklayout.ExtentIdx{}
- }
- binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
- rootEntries[i].Entry = curEntry
- }
-
- // If this node is internal, perform DFS.
- if rootHeader.Height > 0 {
- for i := uint16(0); i < rootHeader.NumEntries; i++ {
- var err error
- if rootEntries[i].Node, err = buildExtTreeFromDisk(dev, rootEntries[i].Entry, blkSize); err != nil {
- return err
- }
- }
- }
-
- in.root = &disklayout.ExtentNode{rootHeader, rootEntries}
- return nil
-}
-
-// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
-// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
-// by the ExtentEntry.
-//
-// Preconditions: Must hold the mutex of the filesystem containing dev.
-func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
- var header disklayout.ExtentHeader
- off := entry.PhysicalBlock() * blkSize
- if err := readFromDisk(dev, int64(off), &header); err != nil {
- return nil, err
- }
-
- entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
- for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
- var curEntry disklayout.ExtentEntry
- if header.Height == 0 {
- // Leaf node.
- curEntry = &disklayout.Extent{}
- } else {
- // Internal node.
- curEntry = &disklayout.ExtentIdx{}
- }
-
- if err := readFromDisk(dev, int64(off), curEntry); err != nil {
- return nil, err
- }
- entries[i].Entry = curEntry
- }
-
- // If this node is internal, perform DFS.
- if header.Height > 0 {
- for i := uint16(0); i < header.NumEntries; i++ {
- var err error
- entries[i].Node, err = buildExtTreeFromDisk(dev, entries[i].Entry, blkSize)
- if err != nil {
- return nil, err
- }
- }
- }
-
- return &disklayout.ExtentNode{header, entries}, nil
-}
diff --git a/pkg/sentry/fs/fdpipe/pipe.go b/pkg/sentry/fs/fdpipe/pipe.go
index 5a0a67eab..669ffcb75 100644
--- a/pkg/sentry/fs/fdpipe/pipe.go
+++ b/pkg/sentry/fs/fdpipe/pipe.go
@@ -87,7 +87,7 @@ func (p *pipeOperations) init() error {
log.Warningf("pipe: cannot stat fd %d: %v", p.file.FD(), err)
return syscall.EINVAL
}
- if s.Mode&syscall.S_IFIFO != syscall.S_IFIFO {
+ if (s.Mode & syscall.S_IFMT) != syscall.S_IFIFO {
log.Warningf("pipe: cannot load fd %d as pipe, file type: %o", p.file.FD(), s.Mode)
return syscall.EINVAL
}
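The old check compared st_mode to S_IFIFO with a bare bitwise AND, which tests bit containment rather than the file type; the type field has to be isolated with S_IFMT first. A standalone sketch of the corrected pattern (the helper is hypothetical, not part of the fdpipe package):

package main

import (
	"fmt"
	"syscall"
)

// isFIFO reports whether a host stat mode describes a FIFO. Masking with
// S_IFMT isolates the file-type bits before the comparison.
func isFIFO(mode uint32) bool {
	return mode&syscall.S_IFMT == syscall.S_IFIFO
}

func main() {
	fmt.Println(isFIFO(syscall.S_IFIFO | 0600)) // true
	fmt.Println(isFIFO(syscall.S_IFREG | 0600)) // false
}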
diff --git a/pkg/sentry/fs/fsutil/inode_cached.go b/pkg/sentry/fs/fsutil/inode_cached.go
index ed62049a9..20cb9a367 100644
--- a/pkg/sentry/fs/fsutil/inode_cached.go
+++ b/pkg/sentry/fs/fsutil/inode_cached.go
@@ -66,10 +66,8 @@ type CachingInodeOperations struct {
// mfp is used to allocate memory that caches backingFile's contents.
mfp pgalloc.MemoryFileProvider
- // forcePageCache indicates the sentry page cache should be used regardless
- // of whether the platform supports host mapped I/O or not. This must not be
- // modified after inode creation.
- forcePageCache bool
+ // opts contains options. opts is immutable.
+ opts CachingInodeOperationsOptions
attrMu sync.Mutex `state:"nosave"`
@@ -116,6 +114,20 @@ type CachingInodeOperations struct {
refs frameRefSet
}
+// CachingInodeOperationsOptions configures a CachingInodeOperations.
+//
+// +stateify savable
+type CachingInodeOperationsOptions struct {
+ // If ForcePageCache is true, use the sentry page cache even if a host file
+ // descriptor is available.
+ ForcePageCache bool
+
+ // If LimitHostFDTranslation is true, apply maxFillRange() constraints to
+ // host file descriptor mappings returned by
+ // CachingInodeOperations.Translate().
+ LimitHostFDTranslation bool
+}
+
// CachedFileObject is a file that may require caching.
type CachedFileObject interface {
// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
@@ -159,7 +171,7 @@ type CachedFileObject interface {
// NewCachingInodeOperations returns a new CachingInodeOperations backed by
// a CachedFileObject and its initial unstable attributes.
-func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, forcePageCache bool) *CachingInodeOperations {
+func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
mfp := pgalloc.MemoryFileProviderFromContext(ctx)
if mfp == nil {
panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
@@ -167,7 +179,7 @@ func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject
return &CachingInodeOperations{
backingFile: backingFile,
mfp: mfp,
- forcePageCache: forcePageCache,
+ opts: opts,
attr: uattr,
hostFileMapper: NewHostFileMapper(),
}
@@ -568,21 +580,30 @@ type inodeReadWriter struct {
// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+ mem := rw.c.mfp.MemoryFile()
+ fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable()
+
// Hot path. Avoid defers.
- rw.c.dataMu.RLock()
+ var unlock func()
+ if fillCache {
+ rw.c.dataMu.Lock()
+ unlock = rw.c.dataMu.Unlock
+ } else {
+ rw.c.dataMu.RLock()
+ unlock = rw.c.dataMu.RUnlock
+ }
// Compute the range to read.
if rw.offset >= rw.c.attr.Size {
- rw.c.dataMu.RUnlock()
+ unlock()
return 0, io.EOF
}
end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
if end == rw.offset { // dsts.NumBytes() == 0?
- rw.c.dataMu.RUnlock()
+ unlock()
return 0, nil
}
- mem := rw.c.mfp.MemoryFile()
var done uint64
seg, gap := rw.c.cache.Find(uint64(rw.offset))
for rw.offset < end {
@@ -592,7 +613,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
// Get internal mappings from the cache.
ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
if err != nil {
- rw.c.dataMu.RUnlock()
+ unlock()
return done, err
}
@@ -602,7 +623,7 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
rw.offset += int64(n)
dsts = dsts.DropFirst64(n)
if err != nil {
- rw.c.dataMu.RUnlock()
+ unlock()
return done, err
}
@@ -610,27 +631,48 @@ func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
seg, gap = seg.NextNonEmpty()
case gap.Ok():
- // Read directly from the backing file.
- gapmr := gap.Range().Intersect(mr)
- dst := dsts.TakeFirst64(gapmr.Length())
- n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapmr.Start)
- done += n
- rw.offset += int64(n)
- dsts = dsts.DropFirst64(n)
- // Partial reads are fine. But we must stop reading.
- if n != dst.NumBytes() || err != nil {
- rw.c.dataMu.RUnlock()
- return done, err
+ gapMR := gap.Range().Intersect(mr)
+ if fillCache {
+ // Read into the cache, then re-enter the loop to read from the
+ // cache.
+ reqMR := memmap.MappableRange{
+ Start: uint64(usermem.Addr(gapMR.Start).RoundDown()),
+ End: fs.OffsetPageEnd(int64(gapMR.End)),
+ }
+ optMR := gap.Range()
+ err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
+ mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
+ seg, gap = rw.c.cache.Find(uint64(rw.offset))
+ if !seg.Ok() {
+ unlock()
+ return done, err
+ }
+ // err might have occurred in part of gap.Range() outside
+ // gapMR. Forget about it for now; if the error matters and
+ // persists, we'll run into it again in a later iteration of
+ // this loop.
+ } else {
+ // Read directly from the backing file.
+ dst := dsts.TakeFirst64(gapMR.Length())
+ n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start)
+ done += n
+ rw.offset += int64(n)
+ dsts = dsts.DropFirst64(n)
+ // Partial reads are fine. But we must stop reading.
+ if n != dst.NumBytes() || err != nil {
+ unlock()
+ return done, err
+ }
+
+ // Continue.
+ seg, gap = gap.NextSegment(), FileRangeGapIterator{}
}
- // Continue.
- seg, gap = gap.NextSegment(), FileRangeGapIterator{}
-
default:
break
}
}
- rw.c.dataMu.RUnlock()
+ unlock()
return done, nil
}
@@ -700,7 +742,10 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
seg, gap = seg.NextNonEmpty()
case gap.Ok() && gap.Start() < mr.End:
- // Write directly to the backing file.
+ // Write directly to the backing file. At present, we never fill
+ // the cache when writing, since doing so can convert small writes
+ // into inefficient read-modify-write cycles, and we have no
+ // mechanism for detecting or avoiding this.
gapmr := gap.Range().Intersect(mr)
src := srcs.TakeFirst64(gapmr.Length())
n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
@@ -730,7 +775,7 @@ func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error
// and memory mappings, and false if c.cache may contain data cached from
// c.backingFile.
func (c *CachingInodeOperations) useHostPageCache() bool {
- return !c.forcePageCache && c.backingFile.FD() >= 0
+ return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
}
// AddMapping implements memmap.Mappable.AddMapping.
@@ -802,11 +847,15 @@ func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.Mapp
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
// Hot path. Avoid defer.
if c.useHostPageCache() {
+ mr := optional
+ if c.opts.LimitHostFDTranslation {
+ mr = maxFillRange(required, optional)
+ }
return []memmap.Translation{
{
- Source: optional,
+ Source: mr,
File: c,
- Offset: optional.Start,
+ Offset: mr.Start,
Perms: usermem.AnyAccess,
},
}, nil
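Callers now pass an options struct where they previously passed a bare forcePageCache bool, as the gofer and host changes later in this diff show. A minimal sketch of the new call shape, assuming the fsutil, fs, and context imports used above (the wrapper function is hypothetical):

func newCachedIops(ctx context.Context, f fsutil.CachedFileObject, uattr fs.UnstableAttr) *fsutil.CachingInodeOperations {
	return fsutil.NewCachingInodeOperations(ctx, f, uattr, fsutil.CachingInodeOperationsOptions{
		// Keep using the host page cache whenever f exposes a host FD.
		ForcePageCache: false,
		// Clamp Translate() ranges for host FD mappings to maxFillRange().
		LimitHostFDTranslation: true,
	})
}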
diff --git a/pkg/sentry/fs/fsutil/inode_cached_test.go b/pkg/sentry/fs/fsutil/inode_cached_test.go
index dc19255ed..eb5730c35 100644
--- a/pkg/sentry/fs/fsutil/inode_cached_test.go
+++ b/pkg/sentry/fs/fsutil/inode_cached_test.go
@@ -61,7 +61,7 @@ func TestSetPermissions(t *testing.T) {
uattr := fs.WithCurrentTime(ctx, fs.UnstableAttr{
Perms: fs.FilePermsFromMode(0444),
})
- iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
defer iops.Release()
perms := fs.FilePermsFromMode(0777)
@@ -150,7 +150,7 @@ func TestSetTimestamps(t *testing.T) {
ModificationTime: epoch,
StatusChangeTime: epoch,
}
- iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
defer iops.Release()
if err := iops.SetTimestamps(ctx, nil, test.ts); err != nil {
@@ -188,7 +188,7 @@ func TestTruncate(t *testing.T) {
uattr := fs.UnstableAttr{
Size: 0,
}
- iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, false /*forcePageCache*/)
+ iops := NewCachingInodeOperations(ctx, noopBackingFile{}, uattr, CachingInodeOperationsOptions{})
defer iops.Release()
if err := iops.Truncate(ctx, nil, uattr.Size); err != nil {
@@ -280,7 +280,7 @@ func TestRead(t *testing.T) {
uattr := fs.UnstableAttr{
Size: int64(len(buf)),
}
- iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+ iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
defer iops.Release()
// Expect the cache to be initially empty.
@@ -336,7 +336,7 @@ func TestWrite(t *testing.T) {
uattr := fs.UnstableAttr{
Size: int64(len(buf)),
}
- iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, false /*forcePageCache*/)
+ iops := NewCachingInodeOperations(ctx, newSliceBackingFile(buf), uattr, CachingInodeOperationsOptions{})
defer iops.Release()
// Expect the cache to be initially empty.
diff --git a/pkg/sentry/fs/gofer/fs.go b/pkg/sentry/fs/gofer/fs.go
index 69999dc28..8f8ab5d29 100644
--- a/pkg/sentry/fs/gofer/fs.go
+++ b/pkg/sentry/fs/gofer/fs.go
@@ -54,6 +54,10 @@ const (
// sandbox using files backed by the gofer. If set to false, unix sockets
// cannot be bound to gofer files without an overlay on top.
privateUnixSocketKey = "privateunixsocket"
+
+ // If present, sets CachingInodeOperationsOptions.LimitHostFDTranslation to
+ // true.
+ limitHostFDTranslationKey = "limit_host_fd_translation"
)
// defaultAname is the default attach name.
@@ -134,12 +138,13 @@ func (f *filesystem) Mount(ctx context.Context, device string, flags fs.MountSou
// opts are parsed 9p mount options.
type opts struct {
- fd int
- aname string
- policy cachePolicy
- msize uint32
- version string
- privateunixsocket bool
+ fd int
+ aname string
+ policy cachePolicy
+ msize uint32
+ version string
+ privateunixsocket bool
+ limitHostFDTranslation bool
}
// options parses mount(2) data into structured options.
@@ -237,6 +242,11 @@ func options(data string) (opts, error) {
delete(options, privateUnixSocketKey)
}
+ if _, ok := options[limitHostFDTranslationKey]; ok {
+ o.limitHostFDTranslation = true
+ delete(options, limitHostFDTranslationKey)
+ }
+
// Fail to attach if the caller wanted us to do something that we
// don't support.
if len(options) > 0 {
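limit_host_fd_translation is a presence-only key in the 9p mount data: if the key appears, the option is enabled and the key is consumed. An illustrative check for that style of key (this is not the gofer's actual options() parser, and other gofer option names are elided):

package main

import (
	"fmt"
	"strings"
)

// hasPresenceKey reports whether a presence-only key such as
// "limit_host_fd_translation" appears in comma-separated mount(2) data.
func hasPresenceKey(data, key string) bool {
	for _, opt := range strings.Split(data, ",") {
		if opt == key {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(hasPresenceKey("limit_host_fd_translation", "limit_host_fd_translation")) // true
}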
diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go
index 69d08a627..50da865c1 100644
--- a/pkg/sentry/fs/gofer/session.go
+++ b/pkg/sentry/fs/gofer/session.go
@@ -117,6 +117,11 @@ type session struct {
// Flags provided to the mount.
superBlockFlags fs.MountSourceFlags `state:"wait"`
+ // limitHostFDTranslation is the value used for
+ // CachingInodeOperationsOptions.LimitHostFDTranslation for all
+ // CachingInodeOperations created by the session.
+ limitHostFDTranslation bool
+
// connID is a unique identifier for the session connection.
connID string `state:"wait"`
@@ -218,8 +223,11 @@ func newInodeOperations(ctx context.Context, s *session, file contextFile, qid p
uattr := unstable(ctx, valid, attr, s.mounter, s.client)
return sattr, &inodeOperations{
- fileState: fileState,
- cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, s.superBlockFlags.ForcePageCache),
+ fileState: fileState,
+ cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+ ForcePageCache: s.superBlockFlags.ForcePageCache,
+ LimitHostFDTranslation: s.limitHostFDTranslation,
+ }),
}
}
@@ -242,13 +250,14 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
// Construct the session.
s := session{
- connID: dev,
- msize: o.msize,
- version: o.version,
- cachePolicy: o.policy,
- aname: o.aname,
- superBlockFlags: superBlockFlags,
- mounter: mounter,
+ connID: dev,
+ msize: o.msize,
+ version: o.version,
+ cachePolicy: o.policy,
+ aname: o.aname,
+ superBlockFlags: superBlockFlags,
+ limitHostFDTranslation: o.limitHostFDTranslation,
+ mounter: mounter,
}
s.EnableLeakCheck("gofer.session")
diff --git a/pkg/sentry/fs/host/inode.go b/pkg/sentry/fs/host/inode.go
index 679d8321a..894ab01f0 100644
--- a/pkg/sentry/fs/host/inode.go
+++ b/pkg/sentry/fs/host/inode.go
@@ -200,8 +200,10 @@ func newInode(ctx context.Context, msrc *fs.MountSource, fd int, saveable bool,
// Build the fs.InodeOperations.
uattr := unstableAttr(msrc.MountSourceOperations.(*superOperations), &s)
iops := &inodeOperations{
- fileState: fileState,
- cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, msrc.Flags.ForcePageCache),
+ fileState: fileState,
+ cachingInodeOps: fsutil.NewCachingInodeOperations(ctx, fileState, uattr, fsutil.CachingInodeOperationsOptions{
+ ForcePageCache: msrc.Flags.ForcePageCache,
+ }),
}
// Return the fs.Inode.
diff --git a/pkg/sentry/fs/host/socket.go b/pkg/sentry/fs/host/socket.go
index 44c4ee5f2..2392787cb 100644
--- a/pkg/sentry/fs/host/socket.go
+++ b/pkg/sentry/fs/host/socket.go
@@ -65,7 +65,7 @@ type ConnectedEndpoint struct {
// GetSockOpt and message splitting/rejection in SendMsg, but do not
// prevent lots of small messages from filling the real send buffer
// size on the host.
- sndbuf int `state:"nosave"`
+ sndbuf int64 `state:"nosave"`
// mu protects the fields below.
mu sync.RWMutex `state:"nosave"`
@@ -107,7 +107,7 @@ func (c *ConnectedEndpoint) init() *syserr.Error {
}
c.stype = linux.SockType(stype)
- c.sndbuf = sndbuf
+ c.sndbuf = int64(sndbuf)
return nil
}
@@ -202,7 +202,7 @@ func newSocket(ctx context.Context, orgfd int, saveable bool) (*fs.File, error)
}
// Send implements transport.ConnectedEndpoint.Send.
-func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Send(data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
@@ -279,7 +279,7 @@ func (c *ConnectedEndpoint) EventUpdate() {
}
// Recv implements transport.Receiver.Recv.
-func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (c *ConnectedEndpoint) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
c.mu.RLock()
defer c.mu.RUnlock()
diff --git a/pkg/sentry/fs/host/socket_iovec.go b/pkg/sentry/fs/host/socket_iovec.go
index 05d7c79ad..af6955675 100644
--- a/pkg/sentry/fs/host/socket_iovec.go
+++ b/pkg/sentry/fs/host/socket_iovec.go
@@ -55,19 +55,19 @@ func copyFromMulti(dst []byte, src [][]byte) {
//
// If intermediate != nil, iovecs references intermediate rather than bufs and
// the caller must copy to/from bufs as necessary.
-func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovecs []syscall.Iovec, intermediate []byte, err error) {
+func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) {
var iovsRequired int
for _, b := range bufs {
- length += uintptr(len(b))
+ length += int64(len(b))
if len(b) > 0 {
iovsRequired++
}
}
stopLen := length
- if length > uintptr(maxlen) {
+ if length > maxlen {
if truncate {
- stopLen = uintptr(maxlen)
+ stopLen = maxlen
err = syserror.EAGAIN
} else {
return 0, nil, nil, syserror.EMSGSIZE
@@ -85,7 +85,7 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
}}, b, err
}
- var total uintptr
+ var total int64
iovecs = make([]syscall.Iovec, 0, iovsRequired)
for i := range bufs {
l := len(bufs[i])
@@ -93,9 +93,9 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
continue
}
- stop := l
- if total+uintptr(stop) > stopLen {
- stop = int(stopLen - total)
+ stop := int64(l)
+ if total+stop > stopLen {
+ stop = stopLen - total
}
iovecs = append(iovecs, syscall.Iovec{
@@ -103,7 +103,7 @@ func buildIovec(bufs [][]byte, maxlen int, truncate bool) (length uintptr, iovec
Len: uint64(stop),
})
- total += uintptr(stop)
+ total += stop
if total >= stopLen {
break
}
diff --git a/pkg/sentry/fs/host/socket_unsafe.go b/pkg/sentry/fs/host/socket_unsafe.go
index e57be0506..f3bbed7ea 100644
--- a/pkg/sentry/fs/host/socket_unsafe.go
+++ b/pkg/sentry/fs/host/socket_unsafe.go
@@ -23,7 +23,7 @@ import (
//
// If the total length of bufs is > maxlen, fdReadVec will do a partial read
// and err will indicate why the message was truncated.
-func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (readLen uintptr, msgLen uintptr, controlLen uint64, controlTrunc bool, err error) {
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) {
flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
if peek {
flags |= syscall.MSG_PEEK
@@ -48,11 +48,12 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re
msg.Iovlen = uint64(len(iovecs))
}
- n, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+ rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
if e != 0 {
// N.B. prioritize the syscall error over the buildIovec error.
return 0, 0, 0, false, e
}
+ n := int64(rawN)
// Copy data back to bufs.
if intermediate != nil {
@@ -72,7 +73,7 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int) (re
//
// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
// partial write and err will indicate why the message was truncated.
-func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uintptr, error) {
+func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) {
length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
if err != nil && len(iovecs) == 0 {
// No partial write to do, return error immediately.
@@ -96,5 +97,5 @@ func fdWriteVec(fd int, bufs [][]byte, maxlen int, truncate bool) (uintptr, uint
return 0, length, e
}
- return n, length, err
+ return int64(n), length, err
}
diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go
index 693ffc760..ac0398bd9 100644
--- a/pkg/sentry/fs/mounts.go
+++ b/pkg/sentry/fs/mounts.go
@@ -171,8 +171,6 @@ type MountNamespace struct {
// NewMountNamespace returns a new MountNamespace, with the provided node at the
// root, and the given cache size. A root must always be provided.
func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error) {
- creds := auth.CredentialsFromContext(ctx)
-
// Set the root dirent and id on the root mount. The reference returned from
// NewDirent will be donated to the MountNamespace constructed below.
d := NewDirent(ctx, root, "/")
@@ -181,6 +179,7 @@ func NewMountNamespace(ctx context.Context, root *Inode) (*MountNamespace, error
d: newRootMount(1, d),
}
+ creds := auth.CredentialsFromContext(ctx)
mns := MountNamespace{
userns: creds.UserNamespace,
root: d,
@@ -219,6 +218,13 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
}
}
+ if mns.root == nil {
+		// No root? This MountNamespace must have already been destroyed.
+ // This can happen when a Save is triggered while a process is
+ // exiting. There is nothing to flush.
+ return
+ }
+
// Flush root's MountSource references.
mns.root.Inode.MountSource.FlushDirentRefs()
}
@@ -249,6 +255,10 @@ func (mns *MountNamespace) destroy() {
// Drop reference on the root.
mns.root.DecRef()
+ // Ensure that root cannot be accessed via this MountNamespace any
+ // more.
+ mns.root = nil
+
// Wait for asynchronous work (queued by dropping Dirent references
// above) to complete before destroying this MountNamespace.
AsyncBarrier()
@@ -678,7 +688,7 @@ func (mns *MountNamespace) ResolveExecutablePath(ctx context.Context, wd, name s
return "", syserror.ENOENT
}
-// GetPath returns the PATH as a slice of strings given the environemnt
+// GetPath returns the PATH as a slice of strings given the environment
// variables.
func GetPath(env []string) []string {
const prefix = "PATH="
diff --git a/pkg/sentry/fs/proc/net.go b/pkg/sentry/fs/proc/net.go
index 6b839685b..9adb23608 100644
--- a/pkg/sentry/fs/proc/net.go
+++ b/pkg/sentry/fs/proc/net.go
@@ -348,7 +348,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
// Field: local_adddress.
var localAddr linux.SockAddrInet
if local, _, err := sops.GetSockName(t); err == nil {
- localAddr = local.(linux.SockAddrInet)
+ localAddr = *local.(*linux.SockAddrInet)
}
binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
fmt.Fprintf(&buf, "%08X:%04X ",
@@ -358,7 +358,7 @@ func (n *netTCP) ReadSeqFileData(ctx context.Context, h seqfile.SeqHandle) ([]se
// Field: rem_address.
var remoteAddr linux.SockAddrInet
if remote, _, err := sops.GetPeerName(t); err == nil {
- remoteAddr = remote.(linux.SockAddrInet)
+ remoteAddr = *remote.(*linux.SockAddrInet)
}
binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
fmt.Fprintf(&buf, "%08X:%04X ",
diff --git a/pkg/sentry/fs/ramfs/dir.go b/pkg/sentry/fs/ramfs/dir.go
index f3e984c24..78e082b8e 100644
--- a/pkg/sentry/fs/ramfs/dir.go
+++ b/pkg/sentry/fs/ramfs/dir.go
@@ -53,7 +53,6 @@ type Dir struct {
fsutil.InodeGenericChecker `state:"nosave"`
fsutil.InodeIsDirAllocate `state:"nosave"`
fsutil.InodeIsDirTruncate `state:"nosave"`
- fsutil.InodeNoopRelease `state:"nosave"`
fsutil.InodeNoopWriteOut `state:"nosave"`
fsutil.InodeNotMappable `state:"nosave"`
fsutil.InodeNotSocket `state:"nosave"`
@@ -84,7 +83,8 @@ type Dir struct {
var _ fs.InodeOperations = (*Dir)(nil)
-// NewDir returns a new Dir with the given contents and attributes.
+// NewDir returns a new Dir with the given contents and attributes. A reference
+// on each fs.Inode in the `contents` map will be donated to this Dir.
func NewDir(ctx context.Context, contents map[string]*fs.Inode, owner fs.FileOwner, perms fs.FilePermissions) *Dir {
d := &Dir{
InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, owner, perms, linux.RAMFS_MAGIC),
@@ -138,7 +138,7 @@ func (d *Dir) addChildLocked(ctx context.Context, name string, inode *fs.Inode)
d.NotifyModificationAndStatusChange(ctx)
}
-// AddChild adds a child to this dir.
+// AddChild adds a child to this dir, inheriting its reference.
func (d *Dir) AddChild(ctx context.Context, name string, inode *fs.Inode) {
d.mu.Lock()
defer d.mu.Unlock()
@@ -172,7 +172,9 @@ func (d *Dir) Children() ([]string, map[string]fs.DentAttr) {
return namesCopy, entriesCopy
}
-// removeChildLocked attempts to remove an entry from this directory.
+// removeChildLocked attempts to remove an entry from this directory. It
+// returns the removed fs.Inode along with its reference, which callers are
+// responsible for decrementing.
func (d *Dir) removeChildLocked(ctx context.Context, name string) (*fs.Inode, error) {
inode, ok := d.children[name]
if !ok {
@@ -253,7 +255,8 @@ func (d *Dir) RemoveDirectory(ctx context.Context, _ *fs.Inode, name string) err
return nil
}
-// Lookup loads an inode at p into a Dirent.
+// Lookup loads an inode at p into a Dirent. It returns the fs.Dirent along
+// with a reference.
func (d *Dir) Lookup(ctx context.Context, _ *fs.Inode, p string) (*fs.Dirent, error) {
if len(p) > linux.NAME_MAX {
return nil, syserror.ENAMETOOLONG
@@ -408,6 +411,16 @@ func (*Dir) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, ol
return Rename(ctx, oldParent.InodeOperations, oldName, newParent.InodeOperations, newName, replacement)
}
+// Release implements fs.InodeOperation.Release.
+func (d *Dir) Release(_ context.Context) {
+ // Drop references on all children.
+ d.mu.Lock()
+ for _, i := range d.children {
+ i.DecRef()
+ }
+ d.mu.Unlock()
+}
+
// dirFileOperations implements fs.FileOperations for a ramfs directory.
//
// +stateify savable
diff --git a/pkg/sentry/fs/tmpfs/tmpfs.go b/pkg/sentry/fs/tmpfs/tmpfs.go
index 0f4497cd6..159fb7c08 100644
--- a/pkg/sentry/fs/tmpfs/tmpfs.go
+++ b/pkg/sentry/fs/tmpfs/tmpfs.go
@@ -56,7 +56,6 @@ func rename(ctx context.Context, oldParent *fs.Inode, oldName string, newParent
type Dir struct {
fsutil.InodeGenericChecker `state:"nosave"`
fsutil.InodeIsDirTruncate `state:"nosave"`
- fsutil.InodeNoopRelease `state:"nosave"`
fsutil.InodeNoopWriteOut `state:"nosave"`
fsutil.InodeNotMappable `state:"nosave"`
fsutil.InodeNotSocket `state:"nosave"`
@@ -252,6 +251,11 @@ func (d *Dir) Allocate(ctx context.Context, node *fs.Inode, offset, length int64
return d.ramfsDir.Allocate(ctx, node, offset, length)
}
+// Release implements fs.InodeOperations.Release.
+func (d *Dir) Release(ctx context.Context) {
+ d.ramfsDir.Release(ctx)
+}
+
// Symlink is a symlink.
//
// +stateify savable
diff --git a/pkg/sentry/fs/tty/BUILD b/pkg/sentry/fs/tty/BUILD
index 5e9327aec..291164986 100644
--- a/pkg/sentry/fs/tty/BUILD
+++ b/pkg/sentry/fs/tty/BUILD
@@ -23,6 +23,7 @@ go_library(
"//pkg/sentry/device",
"//pkg/sentry/fs",
"//pkg/sentry/fs/fsutil",
+ "//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/safemem",
"//pkg/sentry/socket/unix/transport",
diff --git a/pkg/sentry/fs/tty/dir.go b/pkg/sentry/fs/tty/dir.go
index 1d128532b..2f639c823 100644
--- a/pkg/sentry/fs/tty/dir.go
+++ b/pkg/sentry/fs/tty/dir.go
@@ -129,6 +129,9 @@ func newDir(ctx context.Context, m *fs.MountSource) *fs.Inode {
// Release implements fs.InodeOperations.Release.
func (d *dirInodeOperations) Release(ctx context.Context) {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
d.master.DecRef()
if len(d.slaves) != 0 {
panic(fmt.Sprintf("devpts directory still contains active terminals: %+v", d))
diff --git a/pkg/sentry/fs/tty/master.go b/pkg/sentry/fs/tty/master.go
index 92ec1ca18..19b7557d5 100644
--- a/pkg/sentry/fs/tty/master.go
+++ b/pkg/sentry/fs/tty/master.go
@@ -172,6 +172,19 @@ func (mf *masterFileOperations) Ioctl(ctx context.Context, _ *fs.File, io userme
return 0, mf.t.ld.windowSize(ctx, io, args)
case linux.TIOCSWINSZ:
return 0, mf.t.ld.setWindowSize(ctx, io, args)
+ case linux.TIOCSCTTY:
+ // Make the given terminal the controlling terminal of the
+ // calling process.
+ return 0, mf.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+ case linux.TIOCNOTTY:
+ // Release this process's controlling terminal.
+ return 0, mf.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+ case linux.TIOCGPGRP:
+ // Get the foreground process group.
+ return mf.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+ case linux.TIOCSPGRP:
+ // Set the foreground process group.
+ return mf.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
@@ -185,8 +198,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
linux.TCSETS,
linux.TCSETSW,
linux.TCSETSF,
- linux.TIOCGPGRP,
- linux.TIOCSPGRP,
linux.TIOCGWINSZ,
linux.TIOCSWINSZ,
linux.TIOCSETD,
@@ -200,8 +211,6 @@ func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
linux.TIOCEXCL,
linux.TIOCNXCL,
linux.TIOCGEXCL,
- linux.TIOCNOTTY,
- linux.TIOCSCTTY,
linux.TIOCGSID,
linux.TIOCGETD,
linux.TIOCVHANGUP,
diff --git a/pkg/sentry/fs/tty/slave.go b/pkg/sentry/fs/tty/slave.go
index e30266404..944c4ada1 100644
--- a/pkg/sentry/fs/tty/slave.go
+++ b/pkg/sentry/fs/tty/slave.go
@@ -152,9 +152,16 @@ func (sf *slaveFileOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem
case linux.TIOCSCTTY:
// Make the given terminal the controlling terminal of the
// calling process.
- // TODO(b/129283598): Implement once we have support for job
- // control.
- return 0, nil
+ return 0, sf.si.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+ case linux.TIOCNOTTY:
+ // Release this process's controlling terminal.
+ return 0, sf.si.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+ case linux.TIOCGPGRP:
+ // Get the foreground process group.
+ return sf.si.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+ case linux.TIOCSPGRP:
+ // Set the foreground process group.
+ return sf.si.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
default:
maybeEmitUnimplementedEvent(ctx, cmd)
return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fs/tty/terminal.go b/pkg/sentry/fs/tty/terminal.go
index b7cecb2ed..ff8138820 100644
--- a/pkg/sentry/fs/tty/terminal.go
+++ b/pkg/sentry/fs/tty/terminal.go
@@ -17,7 +17,10 @@ package tty
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
)
// Terminal is a pseudoterminal.
@@ -26,23 +29,100 @@ import (
type Terminal struct {
refs.AtomicRefCount
- // n is the terminal index.
+ // n is the terminal index. It is immutable.
n uint32
- // d is the containing directory.
+ // d is the containing directory. It is immutable.
d *dirInodeOperations
- // ld is the line discipline of the terminal.
+ // ld is the line discipline of the terminal. It is immutable.
ld *lineDiscipline
+
+ // masterKTTY contains the controlling process of the master end of
+ // this terminal. This field is immutable.
+ masterKTTY *kernel.TTY
+
+ // slaveKTTY contains the controlling process of the slave end of this
+ // terminal. This field is immutable.
+ slaveKTTY *kernel.TTY
}
func newTerminal(ctx context.Context, d *dirInodeOperations, n uint32) *Terminal {
termios := linux.DefaultSlaveTermios
t := Terminal{
- d: d,
- n: n,
- ld: newLineDiscipline(termios),
+ d: d,
+ n: n,
+ ld: newLineDiscipline(termios),
+ masterKTTY: &kernel.TTY{},
+ slaveKTTY: &kernel.TTY{},
}
t.EnableLeakCheck("tty.Terminal")
return &t
}
+
+// setControllingTTY makes tm the controlling terminal of the calling thread
+// group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ panic("setControllingTTY must be called from a task context")
+ }
+
+ return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY removes tm as the controlling terminal of the calling
+// thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ panic("releaseControllingTTY must be called from a task context")
+ }
+
+ return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup gets the process group ID of tm's foreground process.
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ panic("foregroundProcessGroup must be called from a task context")
+ }
+
+ ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+ if err != nil {
+ return 0, err
+ }
+
+ // Write it out to *arg.
+ _, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
+ AddressSpaceActive: true,
+ })
+ return 0, err
+}
+
+// setForegroundProcessGroup sets tm's foreground process group.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+ task := kernel.TaskFromContext(ctx)
+ if task == nil {
+ panic("setForegroundProcessGroup must be called from a task context")
+ }
+
+ // Read in the process group ID.
+ var pgid int32
+ if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+ AddressSpaceActive: true,
+ }); err != nil {
+ return 0, err
+ }
+
+ ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+ return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+ if isMaster {
+ return tm.masterKTTY
+ }
+ return tm.slaveKTTY
+}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
new file mode 100644
index 000000000..a41101339
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -0,0 +1,86 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+go_template_instance(
+ name = "dirent_list",
+ out = "dirent_list.go",
+ package = "ext",
+ prefix = "dirent",
+ template = "//pkg/ilist:generic_list",
+ types = {
+ "Element": "*dirent",
+ "Linker": "*dirent",
+ },
+)
+
+go_library(
+ name = "ext",
+ srcs = [
+ "block_map_file.go",
+ "dentry.go",
+ "directory.go",
+ "dirent_list.go",
+ "ext.go",
+ "extent_file.go",
+ "file_description.go",
+ "filesystem.go",
+ "inode.go",
+ "regular_file.go",
+ "symlink.go",
+ "utils.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext",
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/fd",
+ "//pkg/log",
+ "//pkg/sentry/arch",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/fsimpl/ext/disklayout",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/memmap",
+ "//pkg/sentry/safemem",
+ "//pkg/sentry/syscalls/linux",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//pkg/waiter",
+ ],
+)
+
+go_test(
+ name = "ext_test",
+ size = "small",
+ srcs = [
+ "block_map_test.go",
+ "ext_test.go",
+ "extent_test.go",
+ ],
+ data = [
+ "//pkg/sentry/fsimpl/ext:assets/bigfile.txt",
+ "//pkg/sentry/fsimpl/ext:assets/file.txt",
+ "//pkg/sentry/fsimpl/ext:assets/tiny.ext2",
+ "//pkg/sentry/fsimpl/ext:assets/tiny.ext3",
+ "//pkg/sentry/fsimpl/ext:assets/tiny.ext4",
+ ],
+ embed = [":ext"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fsimpl/ext/disklayout",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ "//pkg/syserror",
+ "//runsc/test/testutil",
+ "@com_github_google_go-cmp//cmp:go_default_library",
+ "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/ext/README.md b/pkg/sentry/fsimpl/ext/README.md
new file mode 100644
index 000000000..af00cfda8
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/README.md
@@ -0,0 +1,117 @@
+## EXT(2/3/4) File System
+
+This is a filesystem driver which supports ext2, ext3 and ext4 filesystems.
+Linux has specialized drivers for each variant but none which supports all. This
+library takes advantage of ext's backward compatibility and understands the
+internal organization of on-disk structures to support all variants.
+
+This driver implementation diverges from the Linux implementations in being more
+forgiving about versioning. For instance, if a filesystem contains both extent
+based inodes and classical block map based inodes, this driver will not complain
+and will interpret both correctly, whereas Linux would treat this as an error.
+This blurs the line between the three ext fs variants.
+
+Ext2 is considered deprecated as of Red Hat Enterprise Linux 7, and ext3 has
+been superseded by ext4 owing to its large performance gains. It is therefore
+recommended to upgrade older filesystem images to ext4 using e2fsprogs.
+
+### Read Only
+
+This driver currently only allows read-only operations; many of the design
+decisions are based on this constraint. There are plans to implement write
+support (the process for which is documented in the future work section).
+
+### Performance
+
+One of the biggest wins of this driver is that it talks directly to the
+underlying block device (or whatever persistent storage is being used), instead
+of making expensive RPCs to a gofer.
+
+Another advantage is that ext fs supports fast concurrent reads. Currently the
+device is represented using an `io.ReaderAt`, which allows for concurrent reads.
+All reads are passed directly to the device driver, which serves the read
+requests in the optimal order. There is no lock contention at the filesystem
+level while reading.
+
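+As an illustrative aside (this snippet is not part of the driver), the sketch
+below shows why an `io.ReaderAt` permits concurrent reads without filesystem
+level locking: every `ReadAt` call carries its own offset and never touches
+shared seek state, so multiple goroutines can read the same device in parallel.
+
+```go
+package main
+
+import (
+	"fmt"
+	"os"
+	"sync"
+)
+
+func main() {
+	// Any readable file works here; the path is just an example.
+	f, err := os.Open("/tmp/tiny.ext4")
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+
+	var wg sync.WaitGroup
+	for i := int64(0); i < 4; i++ {
+		wg.Add(1)
+		go func(blk int64) {
+			defer wg.Done()
+			buf := make([]byte, 4096)
+			// ReadAt is stateless with respect to the file offset, so these
+			// reads need no locking at this level.
+			n, err := f.ReadAt(buf, blk*4096)
+			fmt.Printf("block %d: read %d bytes, err=%v\n", blk, n, err)
+		}(i)
+	}
+	wg.Wait()
+}
+```
+
+Inside the sentry the reader is the block device file supplied by the user, but
+the concurrency property is the same.
+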
+Reads are optimized further in the way file data is transferred to user memory.
+Ext fs copies file data from disk directly into user memory with no additional
+allocations along the way. We can only get faster by preloading file data into
+memory (see the future work section).
+
+The internal structures used to represent files, inodes and file descriptors use
+a lot of inheritance. With the level of indirection that an interface adds via
+an internal pointer, a structure can quickly become fragmented across memory. As
+this runs alongside a full-blown kernel (which is memory intensive), having a
+fragmented struct might hurt performance. Hence these internal structures,
+though interfaced, are tightly packed in memory using the same inheritance
+pattern that pkg/sentry/vfs uses. The pkg/sentry/fsimpl/ext/disklayout package
+makes an exception to this pattern for reasons documented in the package.
+
+### Security
+
+This driver also intends to help sandbox the container better by reducing the
+surface of the host kernel that the application touches. It prevents the
+application from exploiting vulnerabilities in the host filesystem driver. All
+`io.ReaderAt.ReadAt()` calls are translated to `pread(2)` calls, which are
+passed directly to the device driver in the kernel. Hence this reduces the
+attack surface.
+
+The application cannot affect any host filesystems other than the one passed
+in as a block device by the user.
+
+### Future Work
+
+#### Write
+
+To support write operations we would need to modify the block device underneath.
+Currently, the driver does not modify the device at all, not even for updating
+the access times for reads. Modifying the filesystem incorrectly can corrupt it
+and render it unreadable for other correct ext(x) drivers. Hence care must be
+taken when modifying metadata structures.
+
+Ext4 specifically is built for performance and has added a lot of complexity as
+to how metadata structures are modified. For instance, files are organized via
+an extent tree, which must be kept balanced, and file data blocks must be placed
+in the same extent as much as possible to increase locality. Such properties
+must be maintained while modifying the tree.
+
+Ext filesystems rely heavily on locality, which plays a big role in their
+performance. The block allocation algorithm in Linux does a good job of keeping
+related data together. This behavior must be preserved as much as possible,
+otherwise we might end up degrading filesystem performance over time.
+
+Ext4 also supports a wide variety of features which are specialized for varying
+use cases. Implementing all of them can get difficult very quickly.
+
+Ext(x) checksums all its metadata structures to check for corruption, so
+modification of any metadata struct must be accompanied by re-checksumming it.
+Linux filesystem drivers also order on-disk updates intelligently so as not to
+corrupt the filesystem while remaining performant. The in-memory metadata
+structures must be kept in sync with what is on disk.
+
+There is also replication of some important structures across the filesystem.
+All replicas must be updated when their original copy is updated. There is also
+provisioning for snapshotting which must be kept in mind, although it should not
+affect this implementation unless we allow users to create filesystem snapshots.
+
+Ext4 also introduced journaling (jbd2). The journal must be updated
+appropriately.
+
+#### Performance
+
+To improve performance we should implement a buffer cache, and optionally, read
+ahead for small files. While doing so we must also keep in mind the memory usage
+and have a reasonable cap on how much file data we want to hold in memory.
+
+#### Features
+
+Our current implementation will work with most ext4 filesystems for read-only
+purposes. However, the following features are not supported yet:
+
+- Journal
+- Snapshotting
+- Extended Attributes
+- Hash Tree Directories
+- Meta Block Groups
+- Multiple Mount Protection
+- Bigalloc
diff --git a/pkg/sentry/fs/ext/assets/README.md b/pkg/sentry/fsimpl/ext/assets/README.md
index 6f1e81b3a..6f1e81b3a 100644
--- a/pkg/sentry/fs/ext/assets/README.md
+++ b/pkg/sentry/fsimpl/ext/assets/README.md
diff --git a/pkg/sentry/fs/ext/assets/bigfile.txt b/pkg/sentry/fsimpl/ext/assets/bigfile.txt
index 3857cf516..3857cf516 100644
--- a/pkg/sentry/fs/ext/assets/bigfile.txt
+++ b/pkg/sentry/fsimpl/ext/assets/bigfile.txt
diff --git a/pkg/sentry/fs/ext/assets/file.txt b/pkg/sentry/fsimpl/ext/assets/file.txt
index 980a0d5f1..980a0d5f1 100644
--- a/pkg/sentry/fs/ext/assets/file.txt
+++ b/pkg/sentry/fsimpl/ext/assets/file.txt
diff --git a/pkg/sentry/fs/ext/assets/symlink.txt b/pkg/sentry/fsimpl/ext/assets/symlink.txt
index 4c330738c..4c330738c 120000
--- a/pkg/sentry/fs/ext/assets/symlink.txt
+++ b/pkg/sentry/fsimpl/ext/assets/symlink.txt
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext2 b/pkg/sentry/fsimpl/ext/assets/tiny.ext2
index 381ade9bf..381ade9bf 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext2
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext2
Binary files differ
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext3 b/pkg/sentry/fsimpl/ext/assets/tiny.ext3
index 0e97a324c..0e97a324c 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext3
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext3
Binary files differ
diff --git a/pkg/sentry/fs/ext/assets/tiny.ext4 b/pkg/sentry/fsimpl/ext/assets/tiny.ext4
index a6859736d..a6859736d 100644
--- a/pkg/sentry/fs/ext/assets/tiny.ext4
+++ b/pkg/sentry/fsimpl/ext/assets/tiny.ext4
Binary files differ
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
new file mode 100644
index 000000000..9fddb4c4c
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -0,0 +1,16 @@
+load("//tools/go_stateify:defs.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+ name = "benchmark_test",
+ size = "small",
+ srcs = ["benchmark_test.go"],
+ deps = [
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/fsimpl/ext",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/vfs",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
new file mode 100644
index 000000000..10a8083a0
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -0,0 +1,193 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// These benchmarks emulate memfs benchmarks. Before these benchmarks are run,
+// ext4 images must be created with the `make_deep_ext4.sh` script at
+// /tmp/image-{depth}.ext4 for each of the depths tested below.
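+// For example, `sudo bash make_deep_ext4.sh 64 /tmp/image-64.ext4` creates the
+// image used for the depth-64 runs.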
+package benchmark_test
+
+import (
+ "fmt"
+ "os"
+ "runtime"
+ "strings"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+var depths = []int{1, 2, 3, 8, 64, 100}
+
+const filename = "file.txt"
+
+// setUp opens imagePath as an ext Filesystem and returns all necessary
+// elements required to run tests. If error is nil, it also returns a tear
+// down function which must be called after the test is run for clean up.
+func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
+ f, err := os.Open(imagePath)
+ if err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ ctx := contexttest.Context(b)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+ if err != nil {
+ f.Close()
+ return nil, nil, nil, nil, err
+ }
+
+ root := mntns.Root()
+
+ tearDown := func() {
+ root.DecRef()
+
+ if err := f.Close(); err != nil {
+ b.Fatalf("tearDown failed: %v", err)
+ }
+ }
+ return ctx, vfsObj, &root, tearDown, nil
+}
+
+// mount mounts extfs at the path operation passed. Returns a tear down
+// function which must be called after the test is run for clean up.
+func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vfs.PathOperation) func() {
+ b.Helper()
+
+ f, err := os.Open(imagePath)
+ if err != nil {
+ b.Fatalf("could not open image at %s: %v", imagePath, err)
+ }
+
+ ctx := contexttest.Context(b)
+ creds := auth.CredentialsFromContext(ctx)
+
+ if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+		b.Fatalf("failed to mount extfs submount: %v", err)
+ }
+ return func() {
+ if err := f.Close(); err != nil {
+ b.Fatalf("tearDown failed: %v", err)
+ }
+ }
+}
+
+// BenchmarkVFS2Ext4fsStat emulates BenchmarkVFS2MemfsStat.
+func BenchmarkVFS2Ext4fsStat(b *testing.B) {
+ for _, depth := range depths {
+ b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+ ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", depth))
+ if err != nil {
+ b.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ creds := auth.CredentialsFromContext(ctx)
+ var filePathBuilder strings.Builder
+ filePathBuilder.WriteByte('/')
+ for i := 1; i <= depth; i++ {
+ filePathBuilder.WriteString(fmt.Sprintf("%d", i))
+ filePathBuilder.WriteByte('/')
+ }
+ filePathBuilder.WriteString(filename)
+ filePath := filePathBuilder.String()
+
+ runtime.GC()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: *root,
+ Start: *root,
+ Pathname: filePath,
+ FollowFinalSymlink: true,
+ }, &vfs.StatOptions{})
+ if err != nil {
+ b.Fatalf("stat(%q) failed: %v", filePath, err)
+ }
+ // Sanity check.
+ if stat.Size > 0 {
+ b.Fatalf("got wrong file size (%d)", stat.Size)
+ }
+ }
+ })
+ }
+}
+
+// BenchmarkVFS2ExtfsMountStat emulates BenchmarkVFS2MemfsMountStat.
+func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
+ for _, depth := range depths {
+ b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+ // Create root extfs with depth 1 so we can mount extfs again at /1/.
+ ctx, vfsfs, root, tearDown, err := setUp(b, fmt.Sprintf("/tmp/image-%d.ext4", 1))
+ if err != nil {
+ b.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ creds := auth.CredentialsFromContext(ctx)
+ mountPointName := "/1/"
+ pop := vfs.PathOperation{
+ Root: *root,
+ Start: *root,
+ Pathname: mountPointName,
+ }
+
+ // Save the mount point for later use.
+ mountPoint, err := vfsfs.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+ if err != nil {
+ b.Fatalf("failed to walk to mount point: %v", err)
+ }
+ defer mountPoint.DecRef()
+
+ // Create extfs submount.
+ mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop)
+ defer mountTearDown()
+
+ var filePathBuilder strings.Builder
+ filePathBuilder.WriteString(mountPointName)
+ for i := 1; i <= depth; i++ {
+ filePathBuilder.WriteString(fmt.Sprintf("%d", i))
+ filePathBuilder.WriteByte('/')
+ }
+ filePathBuilder.WriteString(filename)
+ filePath := filePathBuilder.String()
+
+ runtime.GC()
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
+ Root: *root,
+ Start: *root,
+ Pathname: filePath,
+ FollowFinalSymlink: true,
+ }, &vfs.StatOptions{})
+ if err != nil {
+ b.Fatalf("stat(%q) failed: %v", filePath, err)
+ }
+ // Sanity check. touch(1) always creates files of size 0 (empty).
+ if stat.Size > 0 {
+ b.Fatalf("got wrong file size (%d)", stat.Size)
+ }
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh
new file mode 100755
index 000000000..d0910da1f
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/benchmark/make_deep_ext4.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# Copyright 2019 The gVisor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script creates an ext4 image with $1 depth of directories and a file in
+# the innermost directory. The created file is at path /1/2/.../depth/file.txt.
+# The ext4 image is written to $2. The image is temporarily mounted at
+# /tmp/mountpoint. This script must be run with sudo privileges.
+
+# Usage:
+# sudo bash make_deep_ext4.sh {depth} {output path}
+
+# Check positional arguments.
+if [ "$#" -ne 2 ]; then
+ echo "Usage: sudo bash make_deep_ext4.sh {depth} {output path}"
+ exit 1
+fi
+
+# Make sure depth is a non-negative number.
+if ! [[ "$1" =~ ^[0-9]+$ ]]; then
+ echo "Depth must be a non-negative number."
+ exit 1
+fi
+
+# Create a 1 MB filesystem image at the requested output path.
+rm -f $2
+fallocate -l 1M $2
+if [ $? -ne 0 ]; then
+ echo "fallocate failed"
+ exit $?
+fi
+
+# Convert that blank into an ext4 image.
+mkfs.ext4 -j $2
+if [ $? -ne 0 ]; then
+ echo "mkfs.ext4 failed"
+ exit $?
+fi
+
+# Mount the image.
+MOUNTPOINT=/tmp/mountpoint
+mkdir -p $MOUNTPOINT
+mount -o loop $2 $MOUNTPOINT
+if [ $? -ne 0 ]; then
+ echo "mount failed"
+ exit $?
+fi
+
+# Create nested directories and the file.
+if [ "$1" -eq 0 ]; then
+ FILEPATH=$MOUNTPOINT/file.txt
+else
+ FILEPATH=$MOUNTPOINT/$(seq -s '/' 1 $1)/file.txt
+fi
+mkdir -p $(dirname $FILEPATH) || exit
+touch $FILEPATH
+
+# Clean up.
+umount $MOUNTPOINT
+rm -rf $MOUNTPOINT
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
new file mode 100644
index 000000000..cea89bcd9
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -0,0 +1,200 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "math"
+
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+ // numDirectBlks is the number of direct blocks in ext block map inodes.
+ numDirectBlks = 12
+)
+
+// blockMapFile is a type of regular file which uses direct/indirect block
+// addressing to store file data. This was deprecated in ext4.
+type blockMapFile struct {
+ regFile regularFile
+
+	// directBlks are the direct block numbers. The physical blocks pointed to by
+	// these hold file data. Contains file blocks 0 to 11.
+ directBlks [numDirectBlks]uint32
+
+ // indirectBlk is the physical block which contains (blkSize/4) direct block
+ // numbers (as uint32 integers).
+ indirectBlk uint32
+
+ // doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
+ // block numbers (as uint32 integers).
+ doubleIndirectBlk uint32
+
+ // tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
+ // indirect block numbers (as uint32 integers).
+ tripleIndirectBlk uint32
+
+ // coverage at (i)th index indicates the amount of file data a node at
+ // height (i) covers. Height 0 is the direct block.
+ coverage [4]uint64
+}
+
+// Compiles only if blockMapFile implements io.ReaderAt.
+var _ io.ReaderAt = (*blockMapFile)(nil)
+
+// newBlockMapFile is the blockMapFile constructor. It initializes the file to
+// physical blocks map with (at most) the first 12 (direct) blocks.
+func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
+ file := &blockMapFile{regFile: regFile}
+ file.regFile.impl = file
+
+ for i := uint(0); i < 4; i++ {
+ file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+ }
+
+ blkMap := regFile.inode.diskInode.Data()
+ binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
+ binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
+ binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
+ binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk)
+ return file, nil
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ if off < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ offset := uint64(off)
+ size := f.regFile.inode.diskInode.Size()
+ if offset >= size {
+ return 0, io.EOF
+ }
+
+ // dirBlksEnd is the file offset until which direct blocks cover file data.
+ // Direct blocks cover 0 <= file offset < dirBlksEnd.
+ dirBlksEnd := numDirectBlks * f.coverage[0]
+
+ // indirBlkEnd is the file offset until which the indirect block covers file
+ // data. The indirect block covers dirBlksEnd <= file offset < indirBlkEnd.
+ indirBlkEnd := dirBlksEnd + f.coverage[1]
+
+ // doubIndirBlkEnd is the file offset until which the double indirect block
+ // covers file data. The double indirect block covers the range
+ // indirBlkEnd <= file offset < doubIndirBlkEnd.
+ doubIndirBlkEnd := indirBlkEnd + f.coverage[2]
+
+ read := 0
+ toRead := len(dst)
+ if uint64(toRead)+offset > size {
+ toRead = int(size - offset)
+ }
+ for read < toRead {
+ var err error
+ var curR int
+
+ // Figure out which block to delegate the read to.
+ switch {
+ case offset < dirBlksEnd:
+ // Direct block.
+ curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:])
+ case offset < indirBlkEnd:
+ // Indirect block.
+ curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:])
+ case offset < doubIndirBlkEnd:
+ // Doubly indirect block.
+ curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:])
+ default:
+ // Triply indirect block.
+ curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:])
+ }
+
+ read += curR
+ offset += uint64(curR)
+ if err != nil {
+ return read, err
+ }
+ }
+
+ if read < len(dst) {
+ return read, io.EOF
+ }
+ return read, nil
+}
+
+// read is the recursive step of the ReadAt function. It relies on knowing the
+// current node's location on disk (curPhyBlk) and its height in the block map
+// tree. A height of 0 shows that the current node is actually holding file
+// data. relFileOff tells the offset from which we need to start reading under
+// the current node. It is completely relative to the current node.
+func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, dst []byte) (int, error) {
+ curPhyBlkOff := int64(curPhyBlk) * int64(f.regFile.inode.blkSize)
+ if height == 0 {
+ toRead := int(f.regFile.inode.blkSize - relFileOff)
+ if len(dst) < toRead {
+ toRead = len(dst)
+ }
+
+ n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
+ if n < toRead {
+ return n, syserror.EIO
+ }
+ return n, nil
+ }
+
+ childCov := f.coverage[height-1]
+ startIdx := relFileOff / childCov
+ endIdx := f.regFile.inode.blkSize / 4 // This is exclusive.
+ wantEndIdx := (relFileOff + uint64(len(dst))) / childCov
+ wantEndIdx++ // Make this exclusive.
+ if wantEndIdx < endIdx {
+ endIdx = wantEndIdx
+ }
+
+ read := 0
+ curChildOff := relFileOff % childCov
+ for i := startIdx; i < endIdx; i++ {
+ var childPhyBlk uint32
+ err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
+ if err != nil {
+ return read, err
+ }
+
+ n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+ read += n
+ if err != nil {
+ return read, err
+ }
+
+ curChildOff = 0
+ }
+
+ return read, nil
+}
+
+// getCoverage returns the number of bytes a node at the given height covers.
+// Height 0 is the file data block itself. Height 1 is the indirect block.
+//
+// Formula: blkSize * ((blkSize / 4)^height)
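+//
+// For example, with a 4 KiB block size each block holds 1024 block numbers, so
+// a node at height 1 covers 4096 * 1024 bytes = 4 MiB, height 2 covers 4 GiB
+// and height 3 covers 4 TiB of file data.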
+func getCoverage(blkSize uint64, height uint) uint64 {
+ return blkSize * uint64(math.Pow(float64(blkSize/4), float64(height)))
+}
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
new file mode 100644
index 000000000..213aa3919
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -0,0 +1,157 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "bytes"
+ "math/rand"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+)
+
+// These consts are for mocking the block map tree.
+const (
+ mockBMBlkSize = uint32(16)
+ mockBMDiskSize = 2500
+)
+
+// TestBlockMapReader stress tests block map reader functionality. It performs
+// random length reads from all possible positions in the block map structure.
+func TestBlockMapReader(t *testing.T) {
+ mockBMFile, want := blockMapSetUp(t)
+ n := len(want)
+
+ for from := 0; from < n; from++ {
+ got := make([]byte, n-from)
+
+ if read, err := mockBMFile.ReadAt(got, int64(from)); err != nil {
+ t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err)
+ }
+
+ if diff := cmp.Diff(got, want[from:]); diff != "" {
+ t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff)
+ }
+ }
+}
+
+// blkNumGen is a number generator which gives block numbers for building the
+// block map file on disk. It gives unique numbers in a random order which
+// facilitates creating an extremely fragmented filesystem.
+type blkNumGen struct {
+ nums []uint32
+}
+
+// newBlkNumGen is the blkNumGen constructor.
+func newBlkNumGen() *blkNumGen {
+ blkNums := &blkNumGen{}
+ lim := mockBMDiskSize / mockBMBlkSize
+ blkNums.nums = make([]uint32, lim)
+ for i := uint32(0); i < lim; i++ {
+ blkNums.nums[i] = i
+ }
+
+ rand.Shuffle(int(lim), func(i, j int) {
+ blkNums.nums[i], blkNums.nums[j] = blkNums.nums[j], blkNums.nums[i]
+ })
+ return blkNums
+}
+
+// next returns the next random block number.
+func (n *blkNumGen) next() uint32 {
+ ret := n.nums[0]
+ n.nums = n.nums[1:]
+ return ret
+}
+
+// blockMapSetUp creates a mock disk and a block map file. It initializes the
+// block map file with 12 direct blocks, 1 indirect block, 1 doubly indirect
+// block and 1 triply indirect block (basically filling it to the brim). It
+// initializes the disk to reflect the inode. Also returns the file data that
+// the inode covers and that is written to disk.
+func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
+ mockDisk := make([]byte, mockBMDiskSize)
+ regFile := regularFile{
+ inode: inode{
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: getMockBMFileFize(),
+ },
+ },
+ dev: bytes.NewReader(mockDisk),
+ blkSize: uint64(mockBMBlkSize),
+ },
+ }
+
+ var fileData []byte
+ blkNums := newBlkNumGen()
+ var data []byte
+
+ // Write the direct blocks.
+ for i := 0; i < numDirectBlks; i++ {
+ curBlkNum := blkNums.next()
+ data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+ }
+
+ // Write to indirect block.
+ indirectBlk := blkNums.next()
+ data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
+
+	// Write to the doubly indirect block.
+ doublyIndirectBlk := blkNums.next()
+ data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
+
+	// Write to the triply indirect block.
+ triplyIndirectBlk := blkNums.next()
+ data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
+ fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
+
+ copy(regFile.inode.diskInode.Data(), data)
+
+ mockFile, err := newBlockMapFile(regFile)
+ if err != nil {
+ t.Fatalf("newBlockMapFile failed: %v", err)
+ }
+ return mockFile, fileData
+}
+
+// writeFileDataToBlock writes random bytes to the block on disk.
+func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkNumGen) []byte {
+ if height == 0 {
+ start := blkNum * mockBMBlkSize
+ end := start + mockBMBlkSize
+ rand.Read(disk[start:end])
+ return disk[start:end]
+ }
+
+ var fileData []byte
+ for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
+ curBlkNum := blkNums.next()
+ copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
+ fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+ }
+ return fileData
+}
+
+// getMockBMFileFize gets the size of the mock block map file which is used for
+// testing.
+func getMockBMFileFize() uint32 {
+ return uint32(numDirectBlks*getCoverage(uint64(mockBMBlkSize), 0) + getCoverage(uint64(mockBMBlkSize), 1) + getCoverage(uint64(mockBMBlkSize), 2) + getCoverage(uint64(mockBMBlkSize), 3))
+}
diff --git a/pkg/sentry/fs/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 054fb42b6..054fb42b6 100644
--- a/pkg/sentry/fs/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
new file mode 100644
index 000000000..b51f3e18d
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -0,0 +1,308 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// directory represents a directory inode. It holds the childList in memory.
+type directory struct {
+ inode inode
+
+ // mu serializes the changes to childList.
+ // Lock Order (outermost locks must be taken first):
+ // directory.mu
+ // filesystem.mu
+ mu sync.Mutex
+
+ // childList is a list containing (1) child dirents and (2) fake dirents
+ // (with diskDirent == nil) that represent the iteration position of
+ // directoryFDs. childList is used to support directoryFD.IterDirents()
+ // efficiently. childList is protected by mu.
+ childList direntList
+
+ // childMap maps the child's filename to the dirent structure stored in
+	// childList. This adds some data replication but speeds up path
+ // traversal. For consistency, key == childMap[key].diskDirent.FileName().
+ // Immutable.
+ childMap map[string]*dirent
+}
+
+// newDirectroy is the directory constructor.
+func newDirectroy(inode inode, newDirent bool) (*directory, error) {
+ file := &directory{inode: inode, childMap: make(map[string]*dirent)}
+ file.inode.impl = file
+
+ // Initialize childList by reading dirents from the underlying file.
+ if inode.diskInode.Flags().Index {
+ // TODO(b/134676337): Support hash tree directories. Currently only the '.'
+ // and '..' entries are read in.
+
+ // Users cannot navigate this hash tree directory yet.
+ log.Warningf("hash tree directory being used which is unsupported")
+ return file, nil
+ }
+
+ // The dirents are organized in a linear array in the file data.
+ // Extract the file data and decode the dirents.
+ regFile, err := newRegularFile(inode)
+ if err != nil {
+ return nil, err
+ }
+
+ // buf is used as scratch space for reading in dirents from disk and
+ // unmarshalling them into dirent structs.
+ buf := make([]byte, disklayout.DirentSize)
+ size := inode.diskInode.Size()
+ for off, inc := uint64(0), uint64(0); off < size; off += inc {
+ toRead := size - off
+ if toRead > disklayout.DirentSize {
+ toRead = disklayout.DirentSize
+ }
+ if n, err := regFile.impl.ReadAt(buf[:toRead], int64(off)); uint64(n) < toRead {
+ return nil, err
+ }
+
+ var curDirent dirent
+ if newDirent {
+ curDirent.diskDirent = &disklayout.DirentNew{}
+ } else {
+ curDirent.diskDirent = &disklayout.DirentOld{}
+ }
+ binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+
+ if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
+ // Inode number and name length fields being set to 0 is used to indicate
+ // an unused dirent.
+ file.childList.PushBack(&curDirent)
+ file.childMap[curDirent.diskDirent.FileName()] = &curDirent
+ }
+
+ // The next dirent is placed exactly after this dirent record on disk.
+ inc = uint64(curDirent.diskDirent.RecordSize())
+ }
+
+ return file, nil
+}
+
+func (i *inode) isDir() bool {
+ _, ok := i.impl.(*directory)
+ return ok
+}
+
+// dirent is the directory.childList node.
+type dirent struct {
+ diskDirent disklayout.Dirent
+
+ // direntEntry links dirents into their parent directory.childList.
+ direntEntry
+}
+
+// directoryFD represents a directory file description. It implements
+// vfs.FileDescriptionImpl.
+type directoryFD struct {
+ fileDescription
+ vfs.DirectoryFileDescriptionDefaultImpl
+
+ // Protected by directory.mu.
+ iter *dirent
+ off int64
+}
+
+// Compiles only if directoryFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+ if fd.iter == nil {
+ return
+ }
+
+ dir := fd.inode().impl.(*directory)
+ dir.mu.Lock()
+ dir.childList.Remove(fd.iter)
+ dir.mu.Unlock()
+ fd.iter = nil
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ extfs := fd.filesystem()
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Ensure that fd.iter exists and is not linked into dir.childList.
+ var child *dirent
+ if fd.iter == nil {
+ // Start iteration at the beginning of dir.
+ child = dir.childList.Front()
+ fd.iter = &dirent{}
+ } else {
+ // Continue iteration from where we left off.
+ child = fd.iter.Next()
+ dir.childList.Remove(fd.iter)
+ }
+ for ; child != nil; child = child.Next() {
+ // Skip other directoryFD iterators.
+ if child.diskDirent != nil {
+ childType, ok := child.diskDirent.FileType()
+ if !ok {
+ // We will need to read the inode off disk. Do not increment
+ // ref count here because this inode is not being added to the
+ // dentry tree.
+ extfs.mu.Lock()
+ childInode, err := extfs.getOrCreateInodeLocked(child.diskDirent.Inode())
+ extfs.mu.Unlock()
+ if err != nil {
+ // Usage of the file description after the error is
+ // undefined. This implementation would continue reading
+ // from the next dirent.
+ fd.off++
+ dir.childList.InsertAfter(child, fd.iter)
+ return err
+ }
+ childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
+ }
+
+ if !cb.Handle(vfs.Dirent{
+ Name: child.diskDirent.FileName(),
+ Type: fs.ToDirentType(childType),
+ Ino: uint64(child.diskDirent.Inode()),
+ Off: fd.off,
+ }) {
+ dir.childList.InsertBefore(child, fd.iter)
+ return nil
+ }
+ fd.off++
+ }
+ }
+ dir.childList.PushBack(fd.iter)
+ return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ if whence != linux.SEEK_SET && whence != linux.SEEK_CUR {
+ return 0, syserror.EINVAL
+ }
+
+ dir := fd.inode().impl.(*directory)
+
+ dir.mu.Lock()
+ defer dir.mu.Unlock()
+
+ // Find resulting offset.
+ if whence == linux.SEEK_CUR {
+ offset += fd.off
+ }
+
+ if offset < 0 {
+ // lseek(2) specifies that EINVAL should be returned if the resulting offset
+ // is negative.
+ return 0, syserror.EINVAL
+ }
+
+ n := int64(len(dir.childMap))
+ realWantOff := offset
+ if realWantOff > n {
+ realWantOff = n
+ }
+ realCurOff := fd.off
+ if realCurOff > n {
+ realCurOff = n
+ }
+
+ // Ensure that fd.iter exists and is linked into dir.childList so we can
+ // intelligently seek from the optimal position.
+ if fd.iter == nil {
+ fd.iter = &dirent{}
+ dir.childList.PushFront(fd.iter)
+ }
+
+ // Guess that iterating from the current position is optimal.
+ child := fd.iter
+ diff := realWantOff - realCurOff // Shows direction and magnitude of travel.
+
+ // See if starting from the beginning or end is better.
+ abDiff := diff
+ if diff < 0 {
+ abDiff = -diff
+ }
+ if abDiff > realWantOff {
+ // Starting from the beginning is best.
+ child = dir.childList.Front()
+ diff = realWantOff
+ } else if abDiff > (n - realWantOff) {
+ // Starting from the end is best.
+ child = dir.childList.Back()
+ // (n - 1) because the last non-nil dirent represents the (n-1)th offset.
+ diff = realWantOff - (n - 1)
+ }
+
+ for child != nil {
+ // Skip other directoryFD iterators.
+ if child.diskDirent != nil {
+ if diff == 0 {
+ if child != fd.iter {
+ dir.childList.Remove(fd.iter)
+ dir.childList.InsertBefore(child, fd.iter)
+ }
+
+ fd.off = offset
+ return offset, nil
+ }
+
+ if diff < 0 {
+ diff++
+ child = child.Prev()
+ } else {
+ diff--
+ child = child.Next()
+ }
+ continue
+ }
+
+ if diff < 0 {
+ child = child.Prev()
+ } else {
+ child = child.Next()
+ }
+ }
+
+ // Reaching here indicates that the offset is beyond the end of the childList.
+ dir.childList.Remove(fd.iter)
+ dir.childList.PushBack(fd.iter)
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+	// mmap(2) specifies that EACCES should be returned for non-regular file fds.
+ return syserror.EACCES
+}
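(Not part of the patch.) Seek above starts walking from whichever of the current iterator position, the list front, or the list back is closest to the requested offset. A standalone sketch of just that distance heuristic, with invented names:

package main

import "fmt"

// pickStart mirrors the distance comparison in directoryFD.Seek: it returns
// "cur", "front" or "back" along with the signed number of dirents to travel.
func pickStart(cur, want, n int64) (string, int64) {
	diff := want - cur
	abDiff := diff
	if diff < 0 {
		abDiff = -diff
	}
	switch {
	case abDiff > want:
		// Closer to the front of the list.
		return "front", want
	case abDiff > n-want:
		// Closer to the back; the last dirent represents offset n-1.
		return "back", want - (n - 1)
	default:
		return "cur", diff
	}
}

func main() {
	fmt.Println(pickStart(2, 90, 100))  // back -9: walk 9 backwards from the end
	fmt.Println(pickStart(50, 3, 100))  // front 3
	fmt.Println(pickStart(50, 55, 100)) // cur 5
}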
diff --git a/pkg/sentry/fs/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index dde15110d..907d35b7e 100644
--- a/pkg/sentry/fs/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,7 +22,7 @@ go_library(
"superblock_old.go",
"test_utils.go",
],
- importpath = "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout",
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout",
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
diff --git a/pkg/sentry/fs/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..ad6f4fef8 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..3e16c76db 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..9a809197a 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
diff --git a/pkg/sentry/fs/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..0ef4294c0 100644
--- a/pkg/sentry/fs/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 685bf57b8..417b6cf65 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -21,6 +21,9 @@ import (
const (
// MaxFileName is the maximum length of an ext fs file's name.
MaxFileName = 255
+
+ // DirentSize is the size of ext dirent structures.
+ DirentSize = 263
)
var (
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..29ae4a5c2 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..6fff12a6e 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index cc6dff2c9..934919f8a 100644
--- a/pkg/sentry/fs/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,8 +21,6 @@ import (
// TestDirentSize tests that the dirent structs are of the correct
// size.
func TestDirentSize(t *testing.T) {
- want := uintptr(263)
-
- assertSize(t, DirentOld{}, want)
- assertSize(t, DirentNew{}, want)
+ assertSize(t, DirentOld{}, uintptr(DirentSize))
+ assertSize(t, DirentNew{}, uintptr(DirentSize))
}
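(Not part of the patch.) For reference, the new DirentSize constant is simply the sum of the serialized dirent fields; a quick arithmetic sketch:

package main

import "fmt"

func main() {
	// Both dirent layouts serialize: a 4-byte inode number, a 2-byte record
	// length, 2 bytes holding the name length (old layout) or name length plus
	// file type (new layout), and a 255-byte name buffer.
	const inodeNum, recordLen, nameLenAndType, nameBuf = 4, 2, 2, 255
	fmt.Println(inodeNum + recordLen + nameLenAndType + nameBuf) // 263
}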
diff --git a/pkg/sentry/fs/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..bdf4e2132 100644
--- a/pkg/sentry/fs/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
diff --git a/pkg/sentry/fs/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..567523d32 100644
--- a/pkg/sentry/fs/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
diff --git a/pkg/sentry/fs/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..b0fad9b71 100644
--- a/pkg/sentry/fs/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..88ae913f5 100644
--- a/pkg/sentry/fs/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..8f9f574ce 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..db25b11b6 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..dd03ee50e 100644
--- a/pkg/sentry/fs/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 7a337a5e0..8bb327006 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -221,7 +221,7 @@ func CompatFeaturesFromInt(f uint32) CompatFeatures {
// This is not exhaustive, unused features are not listed.
const (
// SbDirentFileType indicates that directory entries record the file type.
- // We should use struct ext4_dir_entry_2 for dirents then.
+ // We should use struct DirentNew for dirents then.
SbDirentFileType = 0x2
// SbRecovery indicates that the filesystem needs recovery.
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..53e515fd3 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..7c1053fb4 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..9221e0251 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
diff --git a/pkg/sentry/fs/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..463b5ba21 100644
--- a/pkg/sentry/fs/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
diff --git a/pkg/sentry/fs/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..9c63f04c0 100644
--- a/pkg/sentry/fs/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
new file mode 100644
index 000000000..f10accafc
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -0,0 +1,135 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ext implements readonly ext(2/3/4) filesystems.
+package ext
+
+import (
+ "errors"
+ "fmt"
+ "io"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/fd"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Compiles only if FilesystemType implements vfs.FilesystemType.
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// getDeviceFd returns an io.ReaderAt to the underlying device.
+// Currently there are two ways of mounting an ext(2/3/4) fs:
+// 1. Specify a mount with our internal special MountType in the OCI spec.
+// 2. Expose the device to the container and mount it from application layer.
+func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
+ if opts.InternalData == nil {
+ // User mount call.
+ // TODO(b/134676337): Open the device specified by `source` and return that.
+ panic("unimplemented")
+ }
+
+ // NewFilesystem call originated from within the sentry.
+ devFd, ok := opts.InternalData.(int)
+ if !ok {
+ return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
+ }
+
+ if devFd < 0 {
+ return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
+ }
+
+ // The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
+ // of the file descriptor and hence will not close it when it is garbage
+ // collected.
+ return fd.NewReadWriter(devFd), nil
+}
+
+// isCompatible checks if the superblock has feature sets which are compatible.
+// We only need to check the superblock incompatible feature set since we are
+// mounting readonly. We will also need to check readonly compatible feature
+// set when mounting for read/write.
+func isCompatible(sb disklayout.SuperBlock) bool {
+	// Note that these checks are limited to what a readonly, non-journaling
+	// mount requires. When mounting read/write or with a journal, this must be
+	// reevaluated.
+ incompatFeatures := sb.IncompatibleFeatures()
+ if incompatFeatures.MetaBG {
+ log.Warningf("ext fs: meta block groups are not supported")
+ return false
+ }
+ if incompatFeatures.MMP {
+ log.Warningf("ext fs: multiple mount protection is not supported")
+ return false
+ }
+ if incompatFeatures.Encrypted {
+ log.Warningf("ext fs: encrypted inodes not supported")
+ return false
+ }
+ if incompatFeatures.InlineData {
+ log.Warningf("ext fs: inline files not supported")
+ return false
+ }
+ return true
+}
+
+// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
+func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+ // TODO(b/134676337): Ensure that the user is mounting readonly. If not,
+	// EACCES should be returned according to mount(2). Filesystem independent
+ // flags (like readonly) are currently not available in pkg/sentry/vfs.
+
+ dev, err := getDeviceFd(source, opts)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
+ fs.vfsfs.Init(&fs)
+ fs.sb, err = readSuperBlock(dev)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
+ // mount(2) specifies that EINVAL should be returned if the superblock is
+ // invalid.
+ return nil, nil, syserror.EINVAL
+ }
+
+ // Refuse to mount if the filesystem is incompatible.
+ if !isCompatible(fs.sb) {
+ return nil, nil, syserror.EINVAL
+ }
+
+ fs.bgs, err = readBlockGroups(dev, fs.sb)
+ if err != nil {
+ return nil, nil, err
+ }
+
+ rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
+ if err != nil {
+ return nil, nil, err
+ }
+ rootInode.incRef()
+
+ return &fs.vfsfs, &newDentry(rootInode).vfsd, nil
+}
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
new file mode 100644
index 000000000..49b57a2d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -0,0 +1,917 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "fmt"
+ "io"
+ "os"
+ "path"
+ "sort"
+ "testing"
+
+ "github.com/google/go-cmp/cmp"
+ "github.com/google/go-cmp/cmp/cmpopts"
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+
+ "gvisor.dev/gvisor/runsc/test/testutil"
+)
+
+const (
+ assetsDir = "pkg/sentry/fsimpl/ext/assets"
+)
+
+var (
+ ext2ImagePath = path.Join(assetsDir, "tiny.ext2")
+ ext3ImagePath = path.Join(assetsDir, "tiny.ext3")
+ ext4ImagePath = path.Join(assetsDir, "tiny.ext4")
+)
+
+// setUp opens imagePath as an ext Filesystem and returns all necessary
+// elements required to run tests. If the returned error is nil, it also returns
+// a tear down function which must be called after the test is run for clean up.
+func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesystem, *vfs.VirtualDentry, func(), error) {
+ localImagePath, err := testutil.FindFile(imagePath)
+ if err != nil {
+ return nil, nil, nil, nil, fmt.Errorf("failed to open local image at path %s: %v", imagePath, err)
+ }
+
+ f, err := os.Open(localImagePath)
+ if err != nil {
+ return nil, nil, nil, nil, err
+ }
+
+ ctx := contexttest.Context(t)
+ creds := auth.CredentialsFromContext(ctx)
+
+ // Create VFS.
+ vfsObj := vfs.New()
+ vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+ if err != nil {
+ f.Close()
+ return nil, nil, nil, nil, err
+ }
+
+ root := mntns.Root()
+
+ tearDown := func() {
+ root.DecRef()
+
+ if err := f.Close(); err != nil {
+ t.Fatalf("tearDown failed: %v", err)
+ }
+ }
+ return ctx, vfsObj, &root, tearDown, nil
+}
+
+// TODO(b/134676337): Test vfs.FilesystemImpl.ReadlinkAt and
+// vfs.FilesystemImpl.StatFSAt which are not implemented in
+// vfs.VirtualFilesystem yet.
+
+// TestSeek tests vfs.FileDescriptionImpl.Seek functionality.
+func TestSeek(t *testing.T) {
+ type seekTest struct {
+ name string
+ image string
+ path string
+ }
+
+ tests := []seekTest{
+ {
+ name: "ext4 root dir seek",
+ image: ext4ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext3 root dir seek",
+ image: ext3ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext2 root dir seek",
+ image: ext2ImagePath,
+ path: "/",
+ },
+ {
+ name: "ext4 reg file seek",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext3 reg file seek",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ },
+ {
+ name: "ext2 reg file seek",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+ t.Errorf("expected seek position 0, got %d and error %v", n, err)
+ }
+
+ stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+ if err != nil {
+ t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ // We should be able to seek beyond the end of file.
+ size := int64(stat.Size)
+ if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
+ }
+
+ // Make sure negative offsets work with SEEK_CUR.
+ if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+
+ // Make sure SEEK_END works with regular files.
+ switch fd.Impl().(type) {
+ case *regularFileFD:
+ // Seek back to 0.
+ if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
+ }
+
+ // Seek forward beyond EOF.
+ if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+ t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
+ }
+
+ // EINVAL should be returned if the resulting offset is negative.
+ if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+ t.Errorf("expected error EINVAL but got %v", err)
+ }
+ }
+ })
+ }
+}
+
+// TestStatAt tests filesystem.StatAt functionality.
+func TestStatAt(t *testing.T) {
+ type statAtTest struct {
+ name string
+ image string
+ path string
+ want linux.Statx
+ }
+
+ tests := []statAtTest{
+ {
+ name: "ext4 statx small file",
+ image: ext4ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext3 statx small file",
+ image: ext3ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext2 statx small file",
+ image: ext2ImagePath,
+ path: "/file.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13,
+ },
+ },
+ {
+ name: "ext4 statx big file",
+ image: ext4ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext3 statx big file",
+ image: ext3ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext2 statx big file",
+ image: ext2ImagePath,
+ path: "/bigfile.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0644 | linux.ModeRegular,
+ Size: 13042,
+ },
+ },
+ {
+ name: "ext4 statx symlink file",
+ image: ext4ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext3 statx symlink file",
+ image: ext3ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ {
+ name: "ext2 statx symlink file",
+ image: ext2ImagePath,
+ path: "/symlink.txt",
+ want: linux.Statx{
+ Blksize: 0x400,
+ Nlink: 1,
+ UID: 0,
+ GID: 0,
+ Mode: 0777 | linux.ModeSymlink,
+ Size: 8,
+ },
+ },
+ }
+
+ // Ignore the fields that are not supported by filesystem.StatAt yet and
+ // those which are likely to change as the image does.
+ ignoredFields := map[string]bool{
+ "Attributes": true,
+ "AttributesMask": true,
+ "Atime": true,
+ "Blocks": true,
+ "Btime": true,
+ "Ctime": true,
+ "DevMajor": true,
+ "DevMinor": true,
+ "Ino": true,
+ "Mask": true,
+ "Mtime": true,
+ "RdevMajor": true,
+ "RdevMinor": true,
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ got, err := vfsfs.StatAt(ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.StatOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.StatAt failed for file %s in image %s: %v", test.path, test.image, err)
+ }
+
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ _, ok := ignoredFields[p.String()]
+ return ok
+ }, cmp.Ignore())
+ if diff := cmp.Diff(got, test.want, cmpIgnoreFields, cmpopts.IgnoreUnexported(linux.Statx{})); diff != "" {
+ t.Errorf("stat mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
+
+// TestRead tests the read functionality for vfs file descriptions.
+func TestRead(t *testing.T) {
+ type readTest struct {
+ name string
+ image string
+ absPath string
+ }
+
+ tests := []readTest{
+ {
+ name: "ext4 read small file",
+ image: ext4ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext3 read small file",
+ image: ext3ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext2 read small file",
+ image: ext2ImagePath,
+ absPath: "/file.txt",
+ },
+ {
+ name: "ext4 read big file",
+ image: ext4ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext3 read big file",
+ image: ext3ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ {
+ name: "ext2 read big file",
+ image: ext2ImagePath,
+ absPath: "/bigfile.txt",
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ // Get a local file descriptor and compare its functionality with a vfs file
+ // description for the same file.
+ localFile, err := testutil.FindFile(path.Join(assetsDir, test.absPath))
+ if err != nil {
+ t.Fatalf("testutil.FindFile failed for %s: %v", test.absPath, err)
+ }
+
+ f, err := os.Open(localFile)
+ if err != nil {
+ t.Fatalf("os.Open failed for %s: %v", localFile, err)
+ }
+ defer f.Close()
+
+ // Read the entire file by reading one byte repeatedly. Doing this stress
+ // tests the underlying file reader implementation.
+ got := make([]byte, 1)
+ want := make([]byte, 1)
+ for {
+ n, err := f.Read(want)
+ fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+
+ if diff := cmp.Diff(got, want); diff != "" {
+ t.Errorf("file data mismatch (-want +got):\n%s", diff)
+ }
+
+ // Make sure there is no more file data left after getting EOF.
+ if n == 0 || err == io.EOF {
+ if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+ t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
+ }
+
+ break
+ }
+
+ if err != nil {
+ t.Fatalf("read failed: %v", err)
+ }
+ }
+ })
+ }
+}
+
+// iterDirentsCb is a simple callback which just keeps adding the dirents to an
+// internal list. Implements vfs.IterDirentsCallback.
+type iterDirentsCb struct {
+ dirents []vfs.Dirent
+}
+
+// Compiles only if iterDirentsCb implements vfs.IterDirentsCallback.
+var _ vfs.IterDirentsCallback = (*iterDirentsCb)(nil)
+
+// newIterDirentCb is the iterDirentsCb constructor.
+func newIterDirentCb() *iterDirentsCb {
+ return &iterDirentsCb{dirents: make([]vfs.Dirent, 0)}
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+ cb.dirents = append(cb.dirents, dirent)
+ return true
+}
+
+// TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
+func TestIterDirents(t *testing.T) {
+ type iterDirentTest struct {
+ name string
+ image string
+ path string
+ want []vfs.Dirent
+ }
+
+ wantDirents := []vfs.Dirent{
+ vfs.Dirent{
+ Name: ".",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "..",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "lost+found",
+ Type: linux.DT_DIR,
+ },
+ vfs.Dirent{
+ Name: "file.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "bigfile.txt",
+ Type: linux.DT_REG,
+ },
+ vfs.Dirent{
+ Name: "symlink.txt",
+ Type: linux.DT_LNK,
+ },
+ }
+ tests := []iterDirentTest{
+ {
+ name: "ext4 root dir iteration",
+ image: ext4ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext3 root dir iteration",
+ image: ext3ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ {
+ name: "ext2 root dir iteration",
+ image: ext2ImagePath,
+ path: "/",
+ want: wantDirents,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ ctx, vfsfs, root, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fd, err := vfsfs.OpenAt(
+ ctx,
+ auth.CredentialsFromContext(ctx),
+ &vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+ &vfs.OpenOptions{},
+ )
+ if err != nil {
+ t.Fatalf("vfsfs.OpenAt failed: %v", err)
+ }
+
+ cb := &iterDirentsCb{}
+ if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+ t.Fatalf("dir fd.IterDirents() failed: %v", err)
+ }
+
+ sort.Slice(cb.dirents, func(i int, j int) bool { return cb.dirents[i].Name < cb.dirents[j].Name })
+ sort.Slice(test.want, func(i int, j int) bool { return test.want[i].Name < test.want[j].Name })
+
+ // Ignore the inode number and offset of dirents because those are likely to
+ // change as the underlying image changes.
+ cmpIgnoreFields := cmp.FilterPath(func(p cmp.Path) bool {
+ return p.String() == "Ino" || p.String() == "Off"
+ }, cmp.Ignore())
+ if diff := cmp.Diff(cb.dirents, test.want, cmpIgnoreFields); diff != "" {
+ t.Errorf("dirents mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
+
+// TestRootDir tests that the root directory inode is correctly initialized and
+// returned from setUp.
+func TestRootDir(t *testing.T) {
+ type inodeProps struct {
+ Mode linux.FileMode
+ UID auth.KUID
+ GID auth.KGID
+ Size uint64
+ InodeSize uint16
+ Links uint16
+ Flags disklayout.InodeFlags
+ }
+
+ type rootDirTest struct {
+ name string
+ image string
+ wantInode inodeProps
+ }
+
+ tests := []rootDirTest{
+ {
+ name: "ext4 root dir",
+ image: ext4ImagePath,
+ wantInode: inodeProps{
+ Mode: linux.ModeDirectory | 0755,
+ Size: 0x400,
+ InodeSize: 0x80,
+ Links: 3,
+ Flags: disklayout.InodeFlags{Extents: true},
+ },
+ },
+ {
+ name: "ext3 root dir",
+ image: ext3ImagePath,
+ wantInode: inodeProps{
+ Mode: linux.ModeDirectory | 0755,
+ Size: 0x400,
+ InodeSize: 0x80,
+ Links: 3,
+ },
+ },
+ {
+ name: "ext2 root dir",
+ image: ext2ImagePath,
+ wantInode: inodeProps{
+ Mode: linux.ModeDirectory | 0755,
+ Size: 0x400,
+ InodeSize: 0x80,
+ Links: 3,
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ _, _, vd, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ d, ok := vd.Dentry().Impl().(*dentry)
+ if !ok {
+ t.Fatalf("ext dentry of incorrect type: %T", vd.Dentry().Impl())
+ }
+
+ // Offload inode contents into local structs for comparison.
+ gotInode := inodeProps{
+ Mode: d.inode.diskInode.Mode(),
+ UID: d.inode.diskInode.UID(),
+ GID: d.inode.diskInode.GID(),
+ Size: d.inode.diskInode.Size(),
+ InodeSize: d.inode.diskInode.InodeSize(),
+ Links: d.inode.diskInode.LinksCount(),
+ Flags: d.inode.diskInode.Flags(),
+ }
+
+ if diff := cmp.Diff(gotInode, test.wantInode); diff != "" {
+ t.Errorf("inode mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
+
+// TestFilesystemInit tests that the filesystem superblock and block group
+// descriptors are correctly read in and initialized.
+func TestFilesystemInit(t *testing.T) {
+ // sb only contains the immutable properties of the superblock.
+ type sb struct {
+ InodesCount uint32
+ BlocksCount uint64
+ MaxMountCount uint16
+ FirstDataBlock uint32
+ BlockSize uint64
+ BlocksPerGroup uint32
+ ClusterSize uint64
+ ClustersPerGroup uint32
+ InodeSize uint16
+ InodesPerGroup uint32
+ BgDescSize uint16
+ Magic uint16
+ Revision disklayout.SbRevision
+ CompatFeatures disklayout.CompatFeatures
+ IncompatFeatures disklayout.IncompatFeatures
+ RoCompatFeatures disklayout.RoCompatFeatures
+ }
+
+ // bg only contains the immutable properties of the block group descriptor.
+ type bg struct {
+ InodeTable uint64
+ BlockBitmap uint64
+ InodeBitmap uint64
+ ExclusionBitmap uint64
+ Flags disklayout.BGFlags
+ }
+
+ type fsInitTest struct {
+ name string
+ image string
+ wantSb sb
+ wantBgs []bg
+ }
+
+ tests := []fsInitTest{
+ {
+ name: "ext4 filesystem init",
+ image: ext4ImagePath,
+ wantSb: sb{
+ InodesCount: 0x10,
+ BlocksCount: 0x40,
+ MaxMountCount: 0xffff,
+ FirstDataBlock: 0x1,
+ BlockSize: 0x400,
+ BlocksPerGroup: 0x2000,
+ ClusterSize: 0x400,
+ ClustersPerGroup: 0x2000,
+ InodeSize: 0x80,
+ InodesPerGroup: 0x10,
+ BgDescSize: 0x40,
+ Magic: linux.EXT_SUPER_MAGIC,
+ Revision: disklayout.DynamicRev,
+ CompatFeatures: disklayout.CompatFeatures{
+ ExtAttr: true,
+ ResizeInode: true,
+ DirIndex: true,
+ },
+ IncompatFeatures: disklayout.IncompatFeatures{
+ DirentFileType: true,
+ Extents: true,
+ Is64Bit: true,
+ FlexBg: true,
+ },
+ RoCompatFeatures: disklayout.RoCompatFeatures{
+ Sparse: true,
+ LargeFile: true,
+ HugeFile: true,
+ DirNlink: true,
+ ExtraIsize: true,
+ MetadataCsum: true,
+ },
+ },
+ wantBgs: []bg{
+ {
+ InodeTable: 0x23,
+ BlockBitmap: 0x3,
+ InodeBitmap: 0x13,
+ Flags: disklayout.BGFlags{
+ InodeZeroed: true,
+ },
+ },
+ },
+ },
+ {
+ name: "ext3 filesystem init",
+ image: ext3ImagePath,
+ wantSb: sb{
+ InodesCount: 0x10,
+ BlocksCount: 0x40,
+ MaxMountCount: 0xffff,
+ FirstDataBlock: 0x1,
+ BlockSize: 0x400,
+ BlocksPerGroup: 0x2000,
+ ClusterSize: 0x400,
+ ClustersPerGroup: 0x2000,
+ InodeSize: 0x80,
+ InodesPerGroup: 0x10,
+ BgDescSize: 0x20,
+ Magic: linux.EXT_SUPER_MAGIC,
+ Revision: disklayout.DynamicRev,
+ CompatFeatures: disklayout.CompatFeatures{
+ ExtAttr: true,
+ ResizeInode: true,
+ DirIndex: true,
+ },
+ IncompatFeatures: disklayout.IncompatFeatures{
+ DirentFileType: true,
+ },
+ RoCompatFeatures: disklayout.RoCompatFeatures{
+ Sparse: true,
+ LargeFile: true,
+ },
+ },
+ wantBgs: []bg{
+ {
+ InodeTable: 0x5,
+ BlockBitmap: 0x3,
+ InodeBitmap: 0x4,
+ Flags: disklayout.BGFlags{
+ InodeZeroed: true,
+ },
+ },
+ },
+ },
+ {
+ name: "ext2 filesystem init",
+ image: ext2ImagePath,
+ wantSb: sb{
+ InodesCount: 0x10,
+ BlocksCount: 0x40,
+ MaxMountCount: 0xffff,
+ FirstDataBlock: 0x1,
+ BlockSize: 0x400,
+ BlocksPerGroup: 0x2000,
+ ClusterSize: 0x400,
+ ClustersPerGroup: 0x2000,
+ InodeSize: 0x80,
+ InodesPerGroup: 0x10,
+ BgDescSize: 0x20,
+ Magic: linux.EXT_SUPER_MAGIC,
+ Revision: disklayout.DynamicRev,
+ CompatFeatures: disklayout.CompatFeatures{
+ ExtAttr: true,
+ ResizeInode: true,
+ DirIndex: true,
+ },
+ IncompatFeatures: disklayout.IncompatFeatures{
+ DirentFileType: true,
+ },
+ RoCompatFeatures: disklayout.RoCompatFeatures{
+ Sparse: true,
+ LargeFile: true,
+ },
+ },
+ wantBgs: []bg{
+ {
+ InodeTable: 0x5,
+ BlockBitmap: 0x3,
+ InodeBitmap: 0x4,
+ Flags: disklayout.BGFlags{
+ InodeZeroed: true,
+ },
+ },
+ },
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ _, _, vd, tearDown, err := setUp(t, test.image)
+ if err != nil {
+ t.Fatalf("setUp failed: %v", err)
+ }
+ defer tearDown()
+
+ fs, ok := vd.Mount().Filesystem().Impl().(*filesystem)
+ if !ok {
+ t.Fatalf("ext filesystem of incorrect type: %T", vd.Mount().Filesystem().Impl())
+ }
+
+ // Offload superblock and block group descriptors contents into
+ // local structs for comparison.
+ totalFreeInodes := uint32(0)
+ totalFreeBlocks := uint64(0)
+ gotSb := sb{
+ InodesCount: fs.sb.InodesCount(),
+ BlocksCount: fs.sb.BlocksCount(),
+ MaxMountCount: fs.sb.MaxMountCount(),
+ FirstDataBlock: fs.sb.FirstDataBlock(),
+ BlockSize: fs.sb.BlockSize(),
+ BlocksPerGroup: fs.sb.BlocksPerGroup(),
+ ClusterSize: fs.sb.ClusterSize(),
+ ClustersPerGroup: fs.sb.ClustersPerGroup(),
+ InodeSize: fs.sb.InodeSize(),
+ InodesPerGroup: fs.sb.InodesPerGroup(),
+ BgDescSize: fs.sb.BgDescSize(),
+ Magic: fs.sb.Magic(),
+ Revision: fs.sb.Revision(),
+ CompatFeatures: fs.sb.CompatibleFeatures(),
+ IncompatFeatures: fs.sb.IncompatibleFeatures(),
+ RoCompatFeatures: fs.sb.ReadOnlyCompatibleFeatures(),
+ }
+ gotNumBgs := len(fs.bgs)
+ gotBgs := make([]bg, gotNumBgs)
+ for i := 0; i < gotNumBgs; i++ {
+ gotBgs[i].InodeTable = fs.bgs[i].InodeTable()
+ gotBgs[i].BlockBitmap = fs.bgs[i].BlockBitmap()
+ gotBgs[i].InodeBitmap = fs.bgs[i].InodeBitmap()
+ gotBgs[i].ExclusionBitmap = fs.bgs[i].ExclusionBitmap()
+ gotBgs[i].Flags = fs.bgs[i].Flags()
+
+ totalFreeInodes += fs.bgs[i].FreeInodesCount()
+ totalFreeBlocks += uint64(fs.bgs[i].FreeBlocksCount())
+ }
+
+ if diff := cmp.Diff(gotSb, test.wantSb); diff != "" {
+ t.Errorf("superblock mismatch (-want +got):\n%s", diff)
+ }
+
+ if diff := cmp.Diff(gotBgs, test.wantBgs); diff != "" {
+ t.Errorf("block group descriptors mismatch (-want +got):\n%s", diff)
+ }
+
+ if diff := cmp.Diff(totalFreeInodes, fs.sb.FreeInodesCount()); diff != "" {
+ t.Errorf("total free inodes mismatch (-want +got):\n%s", diff)
+ }
+
+ if diff := cmp.Diff(totalFreeBlocks, fs.sb.FreeBlocksCount()); diff != "" {
+ t.Errorf("total free blocks mismatch (-want +got):\n%s", diff)
+ }
+ })
+ }
+}
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
new file mode 100644
index 000000000..38b68a2d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -0,0 +1,237 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "sort"
+
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// extentFile is a type of regular file which uses extents to store file data.
+type extentFile struct {
+ regFile regularFile
+
+ // root is the root extent node. This lives in the 60 byte diskInode.Data().
+ // Immutable.
+ root disklayout.ExtentNode
+}
+
+// Compiles only if extentFile implements io.ReaderAt.
+var _ io.ReaderAt = (*extentFile)(nil)
+
+// newExtentFile is the extent file constructor. It reads the entire extent
+// tree into memory.
+// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
+func newExtentFile(regFile regularFile) (*extentFile, error) {
+ file := &extentFile{regFile: regFile}
+ file.regFile.impl = file
+ err := file.buildExtTree()
+ if err != nil {
+ return nil, err
+ }
+ return file, nil
+}
+
+// buildExtTree builds the extent tree by reading it from disk, performing a
+// simple DFS. It first reads the root node from the inode struct in memory.
+// Then it recursively builds the rest of the tree by reading it off disk.
+//
+// Precondition: inode flag InExtents must be set.
+func (f *extentFile) buildExtTree() error {
+ rootNodeData := f.regFile.inode.diskInode.Data()
+
+ binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+
+ // Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
+ if f.root.Header.NumEntries > 4 {
+ // read(2) specifies that EINVAL should be returned if the file is unsuitable
+ // for reading.
+ return syserror.EINVAL
+ }
+
+ f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
+ for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ var curEntry disklayout.ExtentEntry
+ if f.root.Header.Height == 0 {
+ // Leaf node.
+ curEntry = &disklayout.Extent{}
+ } else {
+ // Internal node.
+ curEntry = &disklayout.ExtentIdx{}
+ }
+ binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+ f.root.Entries[i].Entry = curEntry
+ }
+
+ // If this node is internal, perform DFS.
+ if f.root.Header.Height > 0 {
+ for i := uint16(0); i < f.root.Header.NumEntries; i++ {
+ var err error
+ if f.root.Entries[i].Node, err = f.buildExtTreeFromDisk(f.root.Entries[i].Entry); err != nil {
+ return err
+ }
+ }
+ }
+
+ return nil
+}
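(Not part of the patch.) The NumEntries > 4 check above follows from the sizes involved: the root node lives in the inode's 60-byte Data() area, and each extent struct (header, extent, or index) occupies 12 bytes on disk, 12 being the assumed value of disklayout.ExtentStructsSize. A quick arithmetic sketch:

package main

import "fmt"

func main() {
	const inodeDataSize = 60    // bytes available in diskInode.Data()
	const extentStructSize = 12 // assumed size of header/extent/index structs
	// One header, plus however many entries fit in the remaining space.
	fmt.Println((inodeDataSize - extentStructSize) / extentStructSize) // 4
}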
+
+// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
+// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
+// by the ExtentEntry.
+func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) {
+ var header disklayout.ExtentHeader
+ off := entry.PhysicalBlock() * f.regFile.inode.blkSize
+ err := readFromDisk(f.regFile.inode.dev, int64(off), &header)
+ if err != nil {
+ return nil, err
+ }
+
+ entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
+ for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+ var curEntry disklayout.ExtentEntry
+ if header.Height == 0 {
+ // Leaf node.
+ curEntry = &disklayout.Extent{}
+ } else {
+ // Internal node.
+ curEntry = &disklayout.ExtentIdx{}
+ }
+
+ err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry)
+ if err != nil {
+ return nil, err
+ }
+ entries[i].Entry = curEntry
+ }
+
+ // If this node is internal, perform DFS.
+ if header.Height > 0 {
+ for i := uint16(0); i < header.NumEntries; i++ {
+ var err error
+ entries[i].Node, err = f.buildExtTreeFromDisk(entries[i].Entry)
+ if err != nil {
+ return nil, err
+ }
+ }
+ }
+
+	return &disklayout.ExtentNode{Header: header, Entries: entries}, nil
+}
+
+// ReadAt implements io.ReaderAt.ReadAt.
+func (f *extentFile) ReadAt(dst []byte, off int64) (int, error) {
+ if len(dst) == 0 {
+ return 0, nil
+ }
+
+ if off < 0 {
+ return 0, syserror.EINVAL
+ }
+
+ if uint64(off) >= f.regFile.inode.diskInode.Size() {
+ return 0, io.EOF
+ }
+
+ n, err := f.read(&f.root, uint64(off), dst)
+ if n < len(dst) && err == nil {
+ err = io.EOF
+ }
+ return n, err
+}
+
+// read is the recursive step of extentFile.ReadAt which traverses the extent
+// tree from the node passed and reads file data.
+func (f *extentFile) read(node *disklayout.ExtentNode, off uint64, dst []byte) (int, error) {
+	// Perform a binary search for the entry covering the file block at offset
+	// off. A highly fragmented filesystem can have up to 340 entries, so linear
+	// search should be avoided. Find the first entry which does not cover the
+	// file block we want and subtract 1 to get the desired index.
+ fileBlk := uint32(off / f.regFile.inode.blkSize)
+ n := len(node.Entries)
+ found := sort.Search(n, func(i int) bool {
+ return node.Entries[i].Entry.FileBlock() > fileBlk
+ }) - 1
+
+ // We should be in this recursive step only if the data we want exists under
+ // the current node.
+ if found < 0 {
+ panic("searching for a file block in an extent entry which does not cover it")
+ }
+
+ read := 0
+ toRead := len(dst)
+ var curR int
+ var err error
+ for i := found; i < n && read < toRead; i++ {
+ if node.Header.Height == 0 {
+ curR, err = f.readFromExtent(node.Entries[i].Entry.(*disklayout.Extent), off, dst[read:])
+ } else {
+ curR, err = f.read(node.Entries[i].Node, off, dst[read:])
+ }
+
+ read += curR
+ off += uint64(curR)
+ if err != nil {
+ return read, err
+ }
+ }
+
+ return read, nil
+}
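(Not part of the patch.) A tiny standalone sketch of the sort.Search idiom used above, with made-up FileBlock values: the first index whose start lies past the wanted block, minus one, is the covering entry.

package main

import (
	"fmt"
	"sort"
)

func main() {
	starts := []uint32{0, 8, 16, 32} // sorted FileBlock() values of the entries
	fileBlk := uint32(20)
	// First index whose start is past fileBlk, minus one.
	found := sort.Search(len(starts), func(i int) bool { return starts[i] > fileBlk }) - 1
	fmt.Println(found) // 2: the entry starting at block 16 covers block 20
}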
+
+// readFromExtent reads file data from the extent. It takes advantage of the
+// sequential nature of extents and reads file data from multiple blocks in one
+// call.
+//
+// A non-nil error indicates that this is a partial read and there is probably
+// more to read from this extent. The caller should propagate the error upward
+// and not move to the next extent in the tree.
+//
+// A subsequent call to extentFile.ReadAt should continue reading from where we
+// left off as expected.
+func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byte) (int, error) {
+ curFileBlk := uint32(off / f.regFile.inode.blkSize)
+ exFirstFileBlk := ex.FileBlock()
+ exLastFileBlk := exFirstFileBlk + uint32(ex.Length) // This is exclusive.
+
+ // We should be in this recursive step only if the data we want exists under
+ // the current extent.
+ if curFileBlk < exFirstFileBlk || exLastFileBlk <= curFileBlk {
+ panic("searching for a file block in an extent which does not cover it")
+ }
+
+ curPhyBlk := uint64(curFileBlk-exFirstFileBlk) + ex.PhysicalBlock()
+ readStart := curPhyBlk*f.regFile.inode.blkSize + (off % f.regFile.inode.blkSize)
+
+ endPhyBlk := ex.PhysicalBlock() + uint64(ex.Length)
+ extentEnd := endPhyBlk * f.regFile.inode.blkSize // This is exclusive.
+
+ toRead := int(extentEnd - readStart)
+ if len(dst) < toRead {
+ toRead = len(dst)
+ }
+
+ n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart))
+ if n < toRead {
+ return n, syserror.EIO
+ }
+ return n, nil
+}
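(Not part of the patch.) A worked example of the block arithmetic readFromExtent performs, under made-up numbers: a 1 KiB block size and an extent mapping file blocks [10, 15) to physical blocks [100, 105):

package main

import "fmt"

func main() {
	const blkSize = uint64(1024)
	exFileBlk, exPhyBlk, exLen := uint64(10), uint64(100), uint64(5)

	off := uint64(10*1024 + 100) // 100 bytes into file block 10
	curFileBlk := off / blkSize  // 10

	curPhyBlk := (curFileBlk - exFileBlk) + exPhyBlk // 100
	readStart := curPhyBlk*blkSize + off%blkSize     // 102500
	extentEnd := (exPhyBlk + exLen) * blkSize        // 107520, exclusive

	fmt.Println(readStart, extentEnd, extentEnd-readStart) // up to 5020 bytes readable here
}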
diff --git a/pkg/sentry/fs/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index b3f342c8e..42d0a484b 100644
--- a/pkg/sentry/fs/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -16,17 +16,23 @@ package ext
import (
"bytes"
+ "math/rand"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"gvisor.dev/gvisor/pkg/binary"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
)
-// TestExtentTree tests the extent tree building logic.
+const (
+ // mockExtentBlkSize is the mock block size used for testing.
+ // No block has more than 1 header + 4 entries.
+ mockExtentBlkSize = uint64(64)
+)
+
+// The tree described below looks like:
//
-// Test tree:
// 0.{Head}[Idx][Idx]
// / \
// / \
@@ -44,12 +50,8 @@ import (
//
// Please note that ext4 might not construct extent trees looking like this.
// This is purely for testing the tree traversal logic.
-func TestExtentTree(t *testing.T) {
- blkSize := uint64(64) // No block has more than 1 header + 4 entries.
- mockDisk := make([]byte, blkSize*10)
- mockInode := &inode{diskInode: &disklayout.InodeNew{}}
-
- node3 := &disklayout.ExtentNode{
+var (
+ node3 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 1,
@@ -68,7 +70,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node2 := &disklayout.ExtentNode{
+ node2 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 1,
@@ -86,7 +88,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node1 := &disklayout.ExtentNode{
+ node1 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 2,
@@ -113,7 +115,7 @@ func TestExtentTree(t *testing.T) {
},
}
- node0 := &disklayout.ExtentNode{
+ node0 = &disklayout.ExtentNode{
Header: disklayout.ExtentHeader{
Magic: disklayout.ExtentMagic,
NumEntries: 2,
@@ -137,22 +139,69 @@ func TestExtentTree(t *testing.T) {
},
},
}
+)
- writeTree(mockInode, mockDisk, node0, blkSize)
+// TestExtentReader stress tests extentFile read functionality. It performs random
+// length reads from all possible positions in the extent tree.
+func TestExtentReader(t *testing.T) {
+ mockExtentFile, want := extentTreeSetUp(t, node0)
+ n := len(want)
- r := bytes.NewReader(mockDisk)
- if err := mockInode.buildExtTree(r, blkSize); err != nil {
- t.Fatalf("inode.buildExtTree failed: %v", err)
+ for from := 0; from < n; from++ {
+ got := make([]byte, n-from)
+
+ if read, err := mockExtentFile.ReadAt(got, int64(from)); err != nil {
+ t.Fatalf("file read operation from offset %d to %d only read %d bytes: %v", from, n, read, err)
+ }
+
+ if diff := cmp.Diff(got, want[from:]); diff != "" {
+ t.Fatalf("file data from offset %d to %d mismatched (-want +got):\n%s", from, n, diff)
+ }
}
+}
+
+// TestBuildExtentTree tests the extent tree building logic.
+func TestBuildExtentTree(t *testing.T) {
+ mockExtentFile, _ := extentTreeSetUp(t, node0)
opt := cmpopts.IgnoreUnexported(disklayout.ExtentIdx{}, disklayout.ExtentHeader{})
- if diff := cmp.Diff(mockInode.root, node0, opt); diff != "" {
+ if diff := cmp.Diff(&mockExtentFile.root, node0, opt); diff != "" {
t.Errorf("extent tree mismatch (-want +got):\n%s", diff)
}
}
-// writeTree writes the tree represented by `root` to the inode and disk passed.
-func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint64) {
+// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
+// tree. It also constructs a mock extent file with the same tree built in it.
+// It also writes random file data to disk and returns it.
+func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []byte) {
+ t.Helper()
+
+ mockDisk := make([]byte, mockExtentBlkSize*10)
+ mockExtentFile := &extentFile{
+ regFile: regularFile{
+ inode: inode{
+ diskInode: &disklayout.InodeNew{
+ InodeOld: disklayout.InodeOld{
+ SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
+ },
+ },
+ blkSize: mockExtentBlkSize,
+ dev: bytes.NewReader(mockDisk),
+ },
+ },
+ }
+
+ fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
+
+ if err := mockExtentFile.buildExtTree(); err != nil {
+ t.Fatalf("inode.buildExtTree failed: %v", err)
+ }
+ return mockExtentFile, fileData
+}
+
+// writeTree writes the tree represented by `root` to the inode and disk. It
+// also writes random file data on disk.
+func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
for _, ep := range root.Entries {
rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
@@ -160,26 +209,57 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, blkSize uint
copy(in.diskInode.Data(), rootData)
- if root.Header.Height > 0 {
- for _, ep := range root.Entries {
- writeTreeToDisk(disk, ep, blkSize)
+ var fileData []byte
+ for _, ep := range root.Entries {
+ if root.Header.Height == 0 {
+ fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...)
+ } else {
+ fileData = append(fileData, writeTreeToDisk(disk, ep)...)
}
}
+ return fileData
}
// writeTreeToDisk is the recursive step for writeTree which writes the tree
-// on the disk only.
-func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair, blkSize uint64) {
+// on the disk only. Also writes random file data on disk.
+func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
for _, ep := range curNode.Node.Entries {
nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
}
- copy(disk[curNode.Entry.PhysicalBlock()*blkSize:], nodeData)
+ copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
+
+ var fileData []byte
+ for _, ep := range curNode.Node.Entries {
+ if curNode.Node.Header.Height == 0 {
+ fileData = append(fileData, writeFileDataToExtent(disk, ep.Entry.(*disklayout.Extent))...)
+ } else {
+ fileData = append(fileData, writeTreeToDisk(disk, ep)...)
+ }
+ }
+ return fileData
+}
+
+// writeFileDataToExtent writes random bytes to the blocks on disk that the
+// passed extent points to.
+func writeFileDataToExtent(disk []byte, ex *disklayout.Extent) []byte {
+ phyExStartBlk := ex.PhysicalBlock()
+ phyExStartOff := phyExStartBlk * mockExtentBlkSize
+ phyExEndOff := phyExStartOff + uint64(ex.Length)*mockExtentBlkSize
+ rand.Read(disk[phyExStartOff:phyExEndOff])
+ return disk[phyExStartOff:phyExEndOff]
+}
- if curNode.Node.Header.Height > 0 {
- for _, ep := range curNode.Node.Entries {
- writeTreeToDisk(disk, ep, blkSize)
+// getNumPhyBlks returns the number of physical blocks covered under the node.
+func getNumPhyBlks(node *disklayout.ExtentNode) uint32 {
+ var res uint32
+ for _, ep := range node.Entries {
+ if node.Header.Height == 0 {
+ res += uint32(ep.Entry.(*disklayout.Extent).Length)
+ } else {
+ res += getNumPhyBlks(ep.Node)
}
}
+ return res
}
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
new file mode 100644
index 000000000..a0065343b
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -0,0 +1,86 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// fileDescription is embedded by ext implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+ vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
+
+ // flags is the same as vfs.OpenOptions.Flags which are passed to
+ // vfs.FilesystemImpl.OpenAt.
+ // TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
+ // fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
+ // Only close(2), fstat(2), fstatfs(2) should work.
+ flags uint32
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *fileDescription) OnClose() error { return nil }
+
+// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
+func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+ return fd.flags, nil
+}
+
+// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
+func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+ // None of the flags settable by fcntl(F_SETFL) are supported, so this is a
+ // no-op.
+ return nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+ var stat linux.Statx
+ fd.inode().statTo(&stat)
+ return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+ if opts.Stat.Mask == 0 {
+ return nil
+ }
+ return syserror.EPERM
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+ var stat linux.Statfs
+ fd.filesystem().statTo(&stat)
+ return stat, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *fileDescription) Sync(ctx context.Context) error {
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
new file mode 100644
index 000000000..2d15e8aaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -0,0 +1,443 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "errors"
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+var (
+	// errResolveDirent indicates that the vfs.ResolvingPath.Component() does
+	// not exist in the dentry tree but does exist on disk, so it has to be read
+	// in using the in-memory dirent and added to the dentry tree. Usually
+	// indicates the need to lock filesystem.mu for writing.
+ errResolveDirent = errors.New("resolve path component using dirent")
+)
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+ vfsfs vfs.Filesystem
+
+ // mu serializes changes to the Dentry tree.
+ mu sync.RWMutex
+
+ // dev represents the underlying fs device. It does not require protection
+ // because io.ReaderAt permits concurrent read calls to it. It translates to
+ // the pread syscall which passes on the read request directly to the device
+ // driver. Device drivers are intelligent in serving multiple concurrent read
+ // requests in the optimal order (taking locality into consideration).
+ dev io.ReaderAt
+
+ // inodeCache maps absolute inode numbers to the corresponding Inode struct.
+ // Inodes should be removed from this once their reference count hits 0.
+ //
+ // Protected by mu because most additions (see IterDirents) and all removals
+	// from this correspond to a change in the dentry tree.
+ inodeCache map[uint32]*inode
+
+ // sb represents the filesystem superblock. Immutable after initialization.
+ sb disklayout.SuperBlock
+
+ // bgs represents all the block group descriptors for the filesystem.
+ // Immutable after initialization.
+ bgs []disklayout.BlockGroup
+}
+
+// Compiles only if filesystem implements vfs.FilesystemImpl.
+var _ vfs.FilesystemImpl = (*filesystem)(nil)
+
+// stepLocked resolves rp.Component() in parent directory vfsd. The write
+// parameter passed tells if the caller has acquired filesystem.mu for writing
+// or not. If set to true, an existing inode on disk can be added to the dentry
+// tree if not present already.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+// - inode == vfsd.Impl().(*Dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, nil, err
+ }
+
+ for {
+ nextVFSD, err := rp.ResolveComponent(vfsd)
+ if err != nil {
+ return nil, nil, err
+ }
+ if nextVFSD == nil {
+ // Since the Dentry tree is not the sole source of truth for extfs, if it's
+ // not in the Dentry tree, it might need to be pulled from disk.
+ childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+ if !ok {
+ // The underlying inode does not exist on disk.
+ return nil, nil, syserror.ENOENT
+ }
+
+ if !write {
+ // filesystem.mu must be held for writing to add to the dentry tree.
+ return nil, nil, errResolveDirent
+ }
+
+ // Create and add the component's dirent to the dentry tree.
+ fs := rp.Mount().Filesystem().Impl().(*filesystem)
+ childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
+ if err != nil {
+ return nil, nil, err
+ }
+ // incRef because this is being added to the dentry tree.
+ childInode.incRef()
+ child := newDentry(childInode)
+ vfsd.InsertChild(&child.vfsd, rp.Component())
+
+ // Continue as usual now that nextVFSD is not nil.
+ nextVFSD = &child.vfsd
+ }
+ nextInode := nextVFSD.Impl().(*dentry).inode
+ if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
+ if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil {
+ return nil, nil, err
+ }
+ continue
+ }
+ rp.Advance()
+ return nextVFSD, nextInode, nil
+ }
+}
+
+// walkLocked resolves rp to an existing file. The write parameter
+// passed tells if the caller has acquired filesystem.mu for writing or not.
+// If set to true, additions can be made to the dentry tree while walking.
+// If errResolveDirent is returned, the walk needs to be continued with an
+// upgraded filesystem.mu.
+//
+// walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Done() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if rp.MustBeDir() && !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walkParentLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp. The write parameter indicates whether the
+// caller holds filesystem.mu for writing; if true, additions can be made to
+// the dentry tree while walking. If errResolveDirent is returned, the walk
+// must be retried with filesystem.mu held for writing.
+//
+// walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
+//
+// Preconditions:
+// - filesystem.mu must be locked (for writing if write param is true).
+// - !rp.Done().
+func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+ vfsd := rp.Start()
+ inode := vfsd.Impl().(*dentry).inode
+ for !rp.Final() {
+ var err error
+ vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+ if err != nil {
+ return nil, nil, err
+ }
+ }
+ if !inode.isDir() {
+ return nil, nil, syserror.ENOTDIR
+ }
+ return vfsd, inode, nil
+}
+
+// walk resolves rp to an existing file. If parent is true, it resolves rp up
+// to the parent of the last component, which must be an existing directory;
+// otherwise it resolves rp entirely. It attempts to resolve the path as far
+// as it can with a read lock and upgrades the lock only if needed.
+func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+ var (
+ vfsd *vfs.Dentry
+ inode *inode
+ err error
+ )
+
+ // Try walking in the hope that all dentries have already been pulled in
+ // from disk. This reduces contention (allows concurrent walks).
+ fs.mu.RLock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, false)
+ } else {
+ vfsd, inode, err = walkLocked(rp, false)
+ }
+ fs.mu.RUnlock()
+
+ if err == errResolveDirent {
+ // Retake the lock for writing and continue walking. Releasing and
+ // re-acquiring the lock mid-walk is fine because this is a read-only
+ // filesystem.
+ fs.mu.Lock()
+ if parent {
+ vfsd, inode, err = walkParentLocked(rp, true)
+ } else {
+ vfsd, inode, err = walkLocked(rp, true)
+ }
+ fs.mu.Unlock()
+ }
+
+ return vfsd, inode, err
+}
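
Note: the two-phase locking in walk above is the usual optimistic pattern of attempting a lookup under the shared lock and retrying under the exclusive lock only when the read-only attempt signals that it must mutate shared state. Below is a minimal, self-contained sketch of the same pattern; cache, errNeedsWrite, and lookupLocked are hypothetical names and not part of this change.

package example

import (
	"errors"
	"sync"
)

// errNeedsWrite plays the role of errResolveDirent: the read-locked attempt
// signals that it needs the exclusive lock to make progress.
var errNeedsWrite = errors.New("needs write lock")

type cache struct {
	mu sync.RWMutex
	m  map[uint32]string
}

func newCache() *cache {
	return &cache{m: make(map[uint32]string)}
}

// lookupLocked is the analogue of walkLocked: it only mutates c.m when write
// is true.
func (c *cache) lookupLocked(key uint32, write bool) (string, error) {
	if v, ok := c.m[key]; ok {
		return v, nil
	}
	if !write {
		return "", errNeedsWrite
	}
	v := "loaded from disk" // stand-in for the slow path
	c.m[key] = v
	return v, nil
}

// lookup mirrors filesystem.walk: optimistic read lock first, then retry
// under the write lock only if the fast path could not complete.
func (c *cache) lookup(key uint32) (string, error) {
	c.mu.RLock()
	v, err := c.lookupLocked(key, false)
	c.mu.RUnlock()
	if err == errNeedsWrite {
		c.mu.Lock()
		v, err = c.lookupLocked(key, true)
		c.mu.Unlock()
	}
	return v, err
}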
+
+// getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
+// It creates a new one with the given inode number if one does not exist.
+// The caller must increment the ref count if adding this to the dentry tree.
+//
+// Precondition: must be holding fs.mu for writing.
+func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
+ if in, ok := fs.inodeCache[inodeNum]; ok {
+ return in, nil
+ }
+
+ in, err := newInode(fs, inodeNum)
+ if err != nil {
+ return nil, err
+ }
+
+ fs.inodeCache[inodeNum] = in
+ return in, nil
+}
+
+// statTo writes the statfs fields to the output parameter.
+func (fs *filesystem) statTo(stat *linux.Statfs) {
+ stat.Type = uint64(fs.sb.Magic())
+ stat.BlockSize = int64(fs.sb.BlockSize())
+ stat.Blocks = fs.sb.BlocksCount()
+ stat.BlocksFree = fs.sb.FreeBlocksCount()
+ stat.BlocksAvailable = fs.sb.FreeBlocksCount()
+ stat.Files = uint64(fs.sb.InodesCount())
+ stat.FilesFree = uint64(fs.sb.FreeInodesCount())
+ stat.NameLength = disklayout.MaxFileName
+ stat.FragmentSize = int64(fs.sb.BlockSize())
+ // TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ if opts.CheckSearchable {
+ if !inode.isDir() {
+ return nil, syserror.ENOTDIR
+ }
+ if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+ return nil, err
+ }
+ }
+
+ inode.incRef()
+ return vfsd, nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+ vfsd, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return nil, err
+ }
+
+ // EROFS is returned if write access is needed.
+ if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
+ return nil, syserror.EROFS
+ }
+ return inode.open(rp, vfsd, opts.Flags)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return "", err
+ }
+ symlink, ok := inode.impl.(*symlink)
+ if !ok {
+ return "", syserror.EINVAL
+ }
+ return symlink.target, nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return linux.Statx{}, err
+ }
+ var stat linux.Statx
+ inode.statTo(&stat)
+ return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+ if _, _, err := fs.walk(rp, false); err != nil {
+ return linux.Statfs{}, err
+ }
+
+ var stat linux.Statfs
+ fs.statTo(&stat)
+ return stat, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release() {}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+ // This is a readonly filesystem for now.
+ return nil
+}
+
+// The vfs.FilesystemImpl functions below return EROFS because their respective
+// man pages say that EROFS must be returned if the path resolves to a file on
+// this read-only filesystem.
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ if _, _, err := fs.walk(rp, true); err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+ if rp.Done() {
+ return syserror.ENOENT
+ }
+
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if !inode.isDir() {
+ return syserror.ENOTDIR
+ }
+
+ return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+ _, _, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+ if rp.Done() {
+ return syserror.EEXIST
+ }
+
+ _, _, err := fs.walk(rp, true)
+ if err != nil {
+ return err
+ }
+
+ return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+ _, inode, err := fs.walk(rp, false)
+ if err != nil {
+ return err
+ }
+
+ if inode.isDir() {
+ return syserror.EISDIR
+ }
+
+ return syserror.EROFS
+}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
new file mode 100644
index 000000000..e6c847a71
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -0,0 +1,219 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "fmt"
+ "io"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// inode represents an ext inode.
+//
+// inode uses the same inheritance pattern that pkg/sentry/vfs structures use.
+// This has been done to increase memory locality.
+//
+// Implementations:
+// inode --
+// |-- dir
+// |-- symlink
+// |-- regular--
+// |-- extent file
+// |-- block map file
+type inode struct {
+ // refs is a reference count. refs is accessed using atomic memory operations.
+ refs int64
+
+ // inodeNum is the inode number of this inode on disk. This is used to
+ // identify inodes within the ext filesystem.
+ inodeNum uint32
+
+ // dev represents the underlying device. Same as filesystem.dev.
+ dev io.ReaderAt
+
+ // blkSize is the fs data block size. Same as filesystem.sb.BlockSize().
+ blkSize uint64
+
+ // diskInode gives us access to the inode struct on disk. Immutable.
+ diskInode disklayout.Inode
+
+ // impl is the specific inode implementation; immutable. Implementations
+ // must have inode as their first field to preserve the memory locality
+ // mentioned above.
+ impl interface{}
+}
+
+// incRef increments the inode ref count.
+func (in *inode) incRef() {
+ atomic.AddInt64(&in.refs, 1)
+}
+
+// tryIncRef tries to increment the ref count. Returns true if successful.
+func (in *inode) tryIncRef() bool {
+ for {
+ refs := atomic.LoadInt64(&in.refs)
+ if refs == 0 {
+ return false
+ }
+ if atomic.CompareAndSwapInt64(&in.refs, refs, refs+1) {
+ return true
+ }
+ }
+}
+
+// decRef decrements the inode ref count and releases the inode resources if
+// the ref count hits 0.
+//
+// Precondition: Must have locked fs.mu.
+func (in *inode) decRef(fs *filesystem) {
+ if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
+ delete(fs.inodeCache, in.inodeNum)
+ } else if refs < 0 {
+ panic("ext.inode.decRef() called without holding a reference")
+ }
+}
+
+// newInode is the inode constructor. Reads the inode off disk. Identifies
+// inodes based on the absolute inode number on disk.
+func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
+ if inodeNum == 0 {
+ panic("inode number 0 on ext filesystems is not possible")
+ }
+
+ inodeRecordSize := fs.sb.InodeSize()
+ var diskInode disklayout.Inode
+ if inodeRecordSize == disklayout.OldInodeSize {
+ diskInode = &disklayout.InodeOld{}
+ } else {
+ diskInode = &disklayout.InodeNew{}
+ }
+
+ // Calculate where the inode is actually placed.
+ inodesPerGrp := fs.sb.InodesPerGroup()
+ blkSize := fs.sb.BlockSize()
+ inodeTableOff := fs.bgs[getBGNum(inodeNum, inodesPerGrp)].InodeTable() * blkSize
+ inodeOff := inodeTableOff + uint64(uint32(inodeRecordSize)*getBGOff(inodeNum, inodesPerGrp))
+
+ if err := readFromDisk(fs.dev, int64(inodeOff), diskInode); err != nil {
+ return nil, err
+ }
+
+ // Build the inode based on its type.
+ inode := inode{
+ inodeNum: inodeNum,
+ dev: fs.dev,
+ blkSize: blkSize,
+ diskInode: diskInode,
+ }
+
+ switch diskInode.Mode().FileType() {
+ case linux.ModeSymlink:
+ f, err := newSymlink(inode)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
+ case linux.ModeRegular:
+ f, err := newRegularFile(inode)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
+ case linux.ModeDirectory:
+ f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+ if err != nil {
+ return nil, err
+ }
+ return &f.inode, nil
+ default:
+ // TODO(b/134676337): Return appropriate errors for sockets, pipes and devices.
+ return nil, syserror.EINVAL
+ }
+}
+
+// open creates and returns a file description for the dentry passed in.
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+ ats := vfs.AccessTypesForOpenFlags(flags)
+ if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
+ return nil, err
+ }
+ switch in.impl.(type) {
+ case *regularFile:
+ var fd regularFileFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ case *directory:
+ // Can't open directories writably. This check is not necessary for a read
+ // only filesystem but will be required when write is implemented.
+ if ats&vfs.MayWrite != 0 {
+ return nil, syserror.EISDIR
+ }
+ var fd directoryFD
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ fd.flags = flags
+ return &fd.vfsfd, nil
+ case *symlink:
+ if flags&linux.O_PATH == 0 {
+ // Can't open symlinks without O_PATH.
+ return nil, syserror.ELOOP
+ }
+ var fd symlinkFD
+ fd.flags = flags
+ fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+ return &fd.vfsfd, nil
+ default:
+ panic(fmt.Sprintf("unknown inode type: %T", in.impl))
+ }
+}
+
+func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+ return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+}
+
+// statTo writes the statx fields to the output parameter.
+func (in *inode) statTo(stat *linux.Statx) {
+ stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+ linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+ linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME
+ stat.Blksize = uint32(in.blkSize)
+ stat.Mode = uint16(in.diskInode.Mode())
+ stat.Nlink = uint32(in.diskInode.LinksCount())
+ stat.UID = uint32(in.diskInode.UID())
+ stat.GID = uint32(in.diskInode.GID())
+ stat.Ino = uint64(in.inodeNum)
+ stat.Size = in.diskInode.Size()
+ stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
+ stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
+ stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+ // TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
+ // (including metadata blocks) required to represent this file.
+}
+
+// getBGNum returns the block group number that a given inode belongs to.
+func getBGNum(inodeNum uint32, inodesPerGrp uint32) uint32 {
+ return (inodeNum - 1) / inodesPerGrp
+}
+
+// getBGOff returns the offset at which the given inode lives in the block
+// group's inode table, i.e. the index of the inode in the inode table.
+func getBGOff(inodeNum uint32, inodesPerGrp uint32) uint32 {
+ return (inodeNum - 1) % inodesPerGrp
+}
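
To make the placement arithmetic in newInode, getBGNum and getBGOff concrete, here is a small self-contained sketch that reproduces the computation with made-up numbers; all figures (and the locateInode name) are hypothetical and only for illustration.

package main

import "fmt"

// locateInode mirrors newInode: inode numbers are 1-based, so inode N is the
// (N-1)th inode overall, living in block group (N-1)/inodesPerGrp at slot
// (N-1)%inodesPerGrp of that group's inode table. In the real code,
// inodeTableBlk comes from fs.bgs[bg].InodeTable().
func locateInode(inodeNum, inodesPerGrp, inodeSize uint32, inodeTableBlk, blkSize uint64) (bg uint32, off uint64) {
	bg = (inodeNum - 1) / inodesPerGrp
	slot := (inodeNum - 1) % inodesPerGrp
	off = inodeTableBlk*blkSize + uint64(slot)*uint64(inodeSize)
	return bg, off
}

func main() {
	// Hypothetical image: 1 KiB blocks, 256-byte inode records, 2048 inodes
	// per group, and the relevant group's inode table starting at block 1056.
	// Inode 2050 is the second inode of group 1, so it sits one record (256
	// bytes) into that table: 1056*1024 + 1*256 = 1081600.
	bg, off := locateInode(2050, 2048, 256, 1056, 1024)
	fmt.Printf("block group %d, byte offset %d\n", bg, off) // block group 1, byte offset 1081600
}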
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
new file mode 100644
index 000000000..ffc76ba5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "io"
+ "sync"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/safemem"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// regularFile represents a regular file's inode. This too follows the
+// inheritance pattern prevalent in the vfs layer described in
+// pkg/sentry/vfs/README.md.
+type regularFile struct {
+ inode inode
+
+ // impl is the file reader; immutable. Implementations must have regularFile
+ // as their first field to preserve memory locality.
+ // io.ReaderAt is stricter than io.Reader in that a partial read is always
+ // accompanied by an error: if a read spans past the end of the file, the
+ // in-range portion is read and io.EOF is returned.
+ impl io.ReaderAt
+}
+
+// newRegularFile is the regularFile constructor. It figures out what kind of
+// file this is and initializes the fileReader.
+func newRegularFile(inode inode) (*regularFile, error) {
+ regFile := regularFile{
+ inode: inode,
+ }
+
+ inodeFlags := inode.diskInode.Flags()
+
+ if inodeFlags.Extents {
+ file, err := newExtentFile(regFile)
+ if err != nil {
+ return nil, err
+ }
+
+ file.regFile.inode.impl = &file.regFile
+ return &file.regFile, nil
+ }
+
+ file, err := newBlockMapFile(regFile)
+ if err != nil {
+ return nil, err
+ }
+ file.regFile.inode.impl = &file.regFile
+ return &file.regFile, nil
+}
+
+func (in *inode) isRegular() bool {
+ _, ok := in.impl.(*regularFile)
+ return ok
+}
+
+// regularFileFD represents a file description for a regular file. It
+// implements vfs.FileDescriptionImpl.
+type regularFileFD struct {
+ fileDescription
+
+ // off is the file offset. off is accessed using atomic memory operations.
+ off int64
+
+ // offMu serializes operations that may mutate off.
+ offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ safeReader := safemem.FromIOReaderAt{
+ ReaderAt: fd.inode().impl.(*regularFile).impl,
+ Offset: offset,
+ }
+
+ // This copies data from disk directly into usermem without intermediate
+ // allocations (as long as dst can be converted into a BlockSeq that does
+ // not require safe copying).
+ return dst.CopyOutFrom(ctx, safeReader)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ n, err := fd.PRead(ctx, dst, fd.off, opts)
+ fd.offMu.Lock()
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ // write(2) specifies that EBADF must be returned if the fd is not open for
+ // writing.
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ n, err := fd.PWrite(ctx, src, fd.off, opts)
+ fd.offMu.Lock()
+ fd.off += n
+ fd.offMu.Unlock()
+ return n, err
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *regularFileFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.offMu.Lock()
+ defer fd.offMu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as specified.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ case linux.SEEK_END:
+ offset += int64(fd.inode().diskInode.Size())
+ default:
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ fd.off = offset
+ return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ // TODO(b/134676337): Implement mmap(2).
+ return syserror.ENODEV
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
new file mode 100644
index 000000000..e06548a98
--- /dev/null
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -0,0 +1,111 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ext
+
+import (
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/memmap"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// symlink represents a symlink inode.
+type symlink struct {
+ inode inode
+ target string // immutable
+}
+
+// newSymlink is the symlink constructor. It reads out the symlink target from
+// the inode (however it might have been stored).
+func newSymlink(inode inode) (*symlink, error) {
+ var file *symlink
+ var link []byte
+
+ // If the symlink target is less than 60 bytes, it is stored inline in
+ // inode.Data(). Otherwise extents or block maps are used to store the link.
+ size := inode.diskInode.Size()
+ if size < 60 {
+ link = inode.diskInode.Data()[:size]
+ } else {
+ // Create a regular file out of this inode and read out the target.
+ regFile, err := newRegularFile(inode)
+ if err != nil {
+ return nil, err
+ }
+
+ link = make([]byte, size)
+ if n, err := regFile.impl.ReadAt(link, 0); uint64(n) < size {
+ return nil, err
+ }
+ }
+
+ file = &symlink{inode: inode, target: string(link)}
+ file.inode.impl = file
+ return file, nil
+}
+
+func (in *inode) isSymlink() bool {
+ _, ok := in.impl.(*symlink)
+ return ok
+}
+
+// symlinkFD represents a symlink file description and implements
+// vfs.FileDescriptionImpl. It may only be created when the open options
+// contain O_PATH; for this reason most of its methods return EBADF.
+type symlinkFD struct {
+ fileDescription
+}
+
+// Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
+var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *symlinkFD) Release() {}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *symlinkFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *symlinkFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *symlinkFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *symlinkFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+ return syserror.ENOTDIR
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *symlinkFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ return 0, syserror.EBADF
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *symlinkFD) ConfigureMMap(ctx context.Context, opts memmap.MMapOpts) error {
+ return syserror.EBADF
+}
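
For context, a hypothetical application-side view of why symlinkFD exists at all: on Linux the symlink itself can only be opened with O_PATH plus O_NOFOLLOW, and the resulting descriptor supports little beyond fstat and *at-style lookups, which is why most methods above return EBADF. The snippet below uses golang.org/x/sys/unix, the path is made up, and none of this is part of the change.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Open the symlink itself rather than its target. With O_NOFOLLOW and
	// without O_PATH, this open would fail with ELOOP, matching inode.open.
	fd, err := unix.Open("/mnt/ext/symlink.txt", unix.O_PATH|unix.O_NOFOLLOW, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// fstat(2) is one of the few operations permitted on an O_PATH fd.
	var st unix.Stat_t
	if err := unix.Fstat(fd, &st); err != nil {
		panic(err)
	}
	fmt.Printf("symlink inode %d, mode %#o\n", st.Ino, st.Mode)
}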
diff --git a/pkg/sentry/fs/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index 3472c5fa8..d8b728f8c 100644
--- a/pkg/sentry/fs/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -15,38 +15,30 @@
package ext
import (
- "encoding/binary"
"io"
- "gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
-//
-// All disk reads should use this helper so we avoid reading from stale
-// previously used offsets. This function forces the offset parameter.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
- if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
- return syserror.EIO
- }
-
- if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
+func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
+ n := binary.Size(v)
+ buf := make([]byte, n)
+ if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
+ binary.Unmarshal(buf, binary.LittleEndian, v)
return nil
}
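
For readers unfamiliar with gVisor's pkg/binary, the helper above is roughly equivalent to the following standard-library sketch; readFromDiskStd is a hypothetical name and this is illustrative only, not part of the change.

package example

import (
	"encoding/binary"
	"fmt"
	"io"
)

// readFromDiskStd reads binary.Size(v) bytes from dev at the absolute offset
// abOff and decodes them as little-endian into v, which must point to a
// fixed-size value (encoding/binary's constraint).
func readFromDiskStd(dev io.ReaderAt, abOff int64, v interface{}) error {
	size := binary.Size(v)
	if size < 0 {
		return fmt.Errorf("%T is not a fixed-size value", v)
	}
	sr := io.NewSectionReader(dev, abOff, int64(size))
	return binary.Read(sr, binary.LittleEndian, v)
}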
// readSuperBlock reads the SuperBlock from block group 0 in the underlying
// device. There are three versions of the superblock. This function identifies
// and returns the correct version.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
+func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
return nil, err
@@ -76,19 +68,12 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
blocksPerGroup := uint64(sb.BlocksPerGroup())
// Round up the result. float64 can compromise precision so do it manually.
- bgCount := blocksCount / blocksPerGroup
- if blocksCount%blocksPerGroup != 0 {
- bgCount++
- }
-
- return bgCount
+ return (blocksCount + blocksPerGroup - 1) / blocksPerGroup
}
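
As a quick check of the rounding with hypothetical numbers: for 100 blocks and 32 blocks per group, (100 + 32 - 1) / 32 = 131 / 32 = 4 groups, whereas truncating division would give 3 and drop the partial final group.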
// readBlockGroups reads the block group descriptor table from block group 0 in
// the underlying device.
-//
-// Precondition: Must hold the mutex of the filesystem containing dev.
-func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
+func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
bgCount := blockGroupsCount(sb)
bgdSize := uint64(sb.BgDescSize())
is64Bit := sb.IncompatibleFeatures().Is64Bit
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index d5d4f68df..d2450e810 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -11,8 +11,8 @@ go_template_instance(
prefix = "dentry",
template = "//pkg/ilist:generic_list",
types = {
- "Element": "*Dentry",
- "Linker": "*Dentry",
+ "Element": "*dentry",
+ "Linker": "*dentry",
},
)
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
index b0c3ea39a..c52dc781c 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/memfs/directory.go
@@ -23,23 +23,23 @@ import (
)
type directory struct {
- inode Inode
+ inode inode
// childList is a list containing (1) child Dentries and (2) fake Dentries
// (with inode == nil) that represent the iteration position of
// directoryFDs. childList is used to support directoryFD.IterDirents()
- // efficiently. childList is protected by Filesystem.mu.
+ // efficiently. childList is protected by filesystem.mu.
childList dentryList
}
-func (fs *Filesystem) newDirectory(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode uint16) *inode {
dir := &directory{}
dir.inode.init(dir, fs, creds, mode)
dir.inode.nlink = 2 // from "." and parent directory or ".." for root
return &dir.inode
}
-func (i *Inode) isDir() bool {
+func (i *inode) isDir() bool {
_, ok := i.impl.(*directory)
return ok
}
@@ -48,8 +48,8 @@ type directoryFD struct {
fileDescription
vfs.DirectoryFileDescriptionDefaultImpl
- // Protected by Filesystem.mu.
- iter *Dentry
+ // Protected by filesystem.mu.
+ iter *dentry
off int64
}
@@ -68,7 +68,7 @@ func (fd *directoryFD) Release() {
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
fs := fd.filesystem()
- d := fd.vfsfd.VirtualDentry().Dentry()
+ vfsd := fd.vfsfd.VirtualDentry().Dentry()
fs.mu.Lock()
defer fs.mu.Unlock()
@@ -77,7 +77,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
if !cb.Handle(vfs.Dirent{
Name: ".",
Type: linux.DT_DIR,
- Ino: d.Impl().(*Dentry).inode.ino,
+ Ino: vfsd.Impl().(*dentry).inode.ino,
Off: 0,
}) {
return nil
@@ -85,7 +85,7 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
if fd.off == 1 {
- parentInode := d.ParentOrSelf().Impl().(*Dentry).inode
+ parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
if !cb.Handle(vfs.Dirent{
Name: "..",
Type: parentInode.direntType(),
@@ -97,12 +97,12 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
fd.off++
}
- dir := d.Impl().(*Dentry).inode.impl.(*directory)
- var child *Dentry
+ dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+ var child *dentry
if fd.iter == nil {
// Start iteration at the beginning of dir.
child = dir.childList.Front()
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
// Continue iteration from where we left off.
child = fd.iter.Next()
@@ -130,32 +130,41 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
- if whence != linux.SEEK_SET {
- // TODO: Linux also allows SEEK_CUR.
+ fs := fd.filesystem()
+ fs.mu.Lock()
+ defer fs.mu.Unlock()
+
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
return 0, syserror.EINVAL
}
if offset < 0 {
return 0, syserror.EINVAL
}
+ // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+ // seek even if doing so might reposition the iterator due to concurrent
+ // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+ if fd.off == offset {
+ return offset, nil
+ }
+
fd.off = offset
// Compensate for "." and "..".
- var remChildren int64
- if offset < 2 {
- remChildren = 0
- } else {
+ remChildren := int64(0)
+ if offset >= 2 {
remChildren = offset - 2
}
- fs := fd.filesystem()
dir := fd.inode().impl.(*directory)
- fs.mu.Lock()
- defer fs.mu.Unlock()
-
// Ensure that fd.iter exists and is not linked into dir.childList.
if fd.iter == nil {
- fd.iter = &Dentry{}
+ fd.iter = &dentry{}
} else {
dir.childList.Remove(fd.iter)
}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4d989eeaf..f79e2d9c8 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -28,9 +28,9 @@ import (
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *Inode) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
+// vfsd.Impl().(*dentry).inode.
+func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
if !inode.isDir() {
return nil, nil, syserror.ENOTDIR
}
@@ -47,7 +47,7 @@ afterSymlink:
// not in the Dentry tree, it doesn't exist.
return nil, nil, syserror.ENOENT
}
- nextInode := nextVFSD.Impl().(*Dentry).inode
+ nextInode := nextVFSD.Impl().(*dentry).inode
if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -64,10 +64,10 @@ afterSymlink:
// walkExistingLocked is loosely analogous to Linux's
// fs/namei.c:path_lookupat().
//
-// Preconditions: Filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked.
+func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Done() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -88,10 +88,10 @@ func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
-// Preconditions: Filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
for !rp.Final() {
var err error
vfsd, inode, err = stepLocked(rp, vfsd, inode)
@@ -108,9 +108,9 @@ func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *Inode, error) {
// checkCreateLocked checks that a file named rp.Component() may be created in
// directory parentVFSD, then returns rp.Component().
//
-// Preconditions: Filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*Dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *Inode) (string, error) {
+// Preconditions: filesystem.mu must be locked. parentInode ==
+// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
+func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
return "", err
}
@@ -144,7 +144,7 @@ func checkDeleteLocked(vfsd *vfs.Dentry) error {
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.mu.RUnlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -164,7 +164,7 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -185,7 +185,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
return err
}
defer rp.Mount().EndWrite()
- d := vd.Dentry().Impl().(*Dentry)
+ d := vd.Dentry().Impl().(*dentry)
if d.inode.isDir() {
return syserror.EPERM
}
@@ -197,7 +197,7 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -223,7 +223,7 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -246,7 +246,7 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Filter out flags that are not supported by memfs. O_DIRECTORY and
// O_NOFOLLOW have no effect here (they're handled by VFS by setting
// appropriate bits in rp), but are returned by
@@ -265,11 +265,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
mustCreate := opts.Flags&linux.O_EXCL != 0
vfsd := rp.Start()
- inode := vfsd.Impl().(*Dentry).inode
+ inode := vfsd.Impl().(*dentry).inode
fs.mu.Lock()
defer fs.mu.Unlock()
if rp.Done() {
- // FIXME: ???
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
@@ -327,7 +326,7 @@ afterTrailingSymlink:
if mustCreate {
return nil, syserror.EEXIST
}
- childInode := childVFSD.Impl().(*Dentry).inode
+ childInode := childVFSD.Impl().(*dentry).inode
if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
// TODO: symlink traversals update access time
if err := rp.HandleSymlink(symlink.target); err != nil {
@@ -340,7 +339,7 @@ afterTrailingSymlink:
return childInode.open(rp, childVFSD, opts.Flags, false)
}
-func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (i *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
ats := vfs.AccessTypesForOpenFlags(flags)
if !afterCreate {
if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
@@ -385,7 +384,7 @@ func (i *Inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afte
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -400,9 +399,8 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
if rp.Done() {
- // FIXME
return syserror.ENOENT
}
fs.mu.Lock()
@@ -424,7 +422,7 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -447,12 +445,14 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decRef()
return nil
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -462,12 +462,12 @@ func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
_, inode, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -480,7 +480,7 @@ func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
_, _, err := walkExistingLocked(rp)
fs.mu.RUnlock()
@@ -492,7 +492,7 @@ func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
if rp.Done() {
return syserror.EEXIST
}
@@ -517,7 +517,7 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := walkExistingLocked(rp)
@@ -537,6 +537,8 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
+ // Remove from parent directory's childList.
+ vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
inode.decLinksLocked()
return nil
}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index f381e1a88..45cd42b3e 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -21,10 +21,10 @@
//
// Lock order:
//
-// Filesystem.mu
+// filesystem.mu
// regularFileFD.offMu
// regularFile.mu
-// Inode.mu
+// inode.mu
package memfs
import (
@@ -42,8 +42,8 @@ import (
// FilesystemType implements vfs.FilesystemType.
type FilesystemType struct{}
-// Filesystem implements vfs.FilesystemImpl.
-type Filesystem struct {
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree.
@@ -54,44 +54,44 @@ type Filesystem struct {
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
- var fs Filesystem
+ var fs filesystem
fs.vfsfs.Init(&fs)
root := fs.newDentry(fs.newDirectory(creds, 01777))
return &fs.vfsfs, &root.vfsd, nil
}
// Release implements vfs.FilesystemImpl.Release.
-func (fs *Filesystem) Release() {
+func (fs *filesystem) Release() {
}
// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *Filesystem) Sync(ctx context.Context) error {
+func (fs *filesystem) Sync(ctx context.Context) error {
// All filesystem state is in-memory.
return nil
}
-// Dentry implements vfs.DentryImpl.
-type Dentry struct {
+// dentry implements vfs.DentryImpl.
+type dentry struct {
vfsd vfs.Dentry
- // inode is the inode represented by this Dentry. Multiple Dentries may
- // share a single non-directory Inode (with hard links). inode is
+ // inode is the inode represented by this dentry. Multiple Dentries may
+ // share a single non-directory inode (with hard links). inode is
// immutable.
- inode *Inode
+ inode *inode
- // memfs doesn't count references on Dentries; because the Dentry tree is
+ // memfs doesn't count references on dentries; because the dentry tree is
// the sole source of truth, it is by definition always consistent with the
- // state of the filesystem. However, it does count references on Inodes,
- // because Inode resources are released when all references are dropped.
+ // state of the filesystem. However, it does count references on inodes,
+ // because inode resources are released when all references are dropped.
// (memfs doesn't really have resources to release, but we implement
// reference counting because tmpfs regular files will.)
- // dentryEntry (ugh) links Dentries into their parent directory.childList.
+ // dentryEntry (ugh) links dentries into their parent directory.childList.
dentryEntry
}
-func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
- d := &Dentry{
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+ d := &dentry{
inode: inode,
}
d.vfsd.Init(d)
@@ -99,37 +99,37 @@ func (fs *Filesystem) newDentry(inode *Inode) *Dentry {
}
// IncRef implements vfs.DentryImpl.IncRef.
-func (d *Dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
d.inode.incRef()
}
// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *Dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
return d.inode.tryIncRef()
}
// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
d.inode.decRef()
}
-// Inode represents a filesystem object.
-type Inode struct {
+// inode represents a filesystem object.
+type inode struct {
// refs is a reference count. refs is accessed using atomic memory
// operations.
//
- // A reference is held on all Inodes that are reachable in the filesystem
+ // A reference is held on all inodes that are reachable in the filesystem
// tree. For non-directories (which may have multiple hard links), this
// means that a reference is dropped when nlink reaches 0. For directories,
// nlink never reaches 0 due to the "." entry; instead,
- // Filesystem.RmdirAt() drops the reference.
+ // filesystem.RmdirAt() drops the reference.
refs int64
// Inode metadata; protected by mu and accessed using atomic memory
// operations unless otherwise specified.
mu sync.RWMutex
mode uint32 // excluding file type bits, which are based on impl
- nlink uint32 // protected by Filesystem.mu instead of Inode.mu
+ nlink uint32 // protected by filesystem.mu instead of inode.mu
uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
gid uint32 // auth.KGID, but ...
ino uint64 // immutable
@@ -137,7 +137,7 @@ type Inode struct {
impl interface{} // immutable
}
-func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials, mode uint16) {
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode uint16) {
i.refs = 1
i.mode = uint32(mode)
i.uid = uint32(creds.EffectiveKUID)
@@ -147,29 +147,29 @@ func (i *Inode) init(impl interface{}, fs *Filesystem, creds *auth.Credentials,
i.impl = impl
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) incLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) incLinksLocked() {
if atomic.AddUint32(&i.nlink, 1) <= 1 {
- panic("memfs.Inode.incLinksLocked() called with no existing links")
+ panic("memfs.inode.incLinksLocked() called with no existing links")
}
}
-// Preconditions: Filesystem.mu must be locked for writing.
-func (i *Inode) decLinksLocked() {
+// Preconditions: filesystem.mu must be locked for writing.
+func (i *inode) decLinksLocked() {
if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
i.decRef()
} else if nlink == ^uint32(0) { // negative overflow
- panic("memfs.Inode.decLinksLocked() called with no existing links")
+ panic("memfs.inode.decLinksLocked() called with no existing links")
}
}
-func (i *Inode) incRef() {
+func (i *inode) incRef() {
if atomic.AddInt64(&i.refs, 1) <= 1 {
- panic("memfs.Inode.incRef() called without holding a reference")
+ panic("memfs.inode.incRef() called without holding a reference")
}
}
-func (i *Inode) tryIncRef() bool {
+func (i *inode) tryIncRef() bool {
for {
refs := atomic.LoadInt64(&i.refs)
if refs == 0 {
@@ -181,7 +181,7 @@ func (i *Inode) tryIncRef() bool {
}
}
-func (i *Inode) decRef() {
+func (i *inode) decRef() {
if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
// This is unnecessary; it's mostly to simulate what tmpfs would do.
if regfile, ok := i.impl.(*regularFile); ok {
@@ -191,18 +191,18 @@ func (i *Inode) decRef() {
regfile.mu.Unlock()
}
} else if refs < 0 {
- panic("memfs.Inode.decRef() called without holding a reference")
+ panic("memfs.inode.decRef() called without holding a reference")
}
}
-func (i *Inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
}
// Go won't inline this function, and returning linux.Statx (which is quite
// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
// output parameter.
-func (i *Inode) statTo(stat *linux.Statx) {
+func (i *inode) statTo(stat *linux.Statx) {
stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
stat.Blksize = 1 // usermem.PageSize in tmpfs
stat.Nlink = atomic.LoadUint32(&i.nlink)
@@ -241,7 +241,7 @@ func allocatedBlocksForSize(size uint64) uint64 {
return (size + 511) / 512
}
-func (i *Inode) direntType() uint8 {
+func (i *inode) direntType() uint8 {
switch i.impl.(type) {
case *regularFile:
return linux.DT_REG
@@ -258,16 +258,17 @@ func (i *Inode) direntType() uint8 {
// vfs.FileDescriptionImpl.
type fileDescription struct {
vfsfd vfs.FileDescription
+ vfs.FileDescriptionDefaultImpl
flags uint32 // status flags; immutable
}
-func (fd *fileDescription) filesystem() *Filesystem {
- return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*Filesystem)
+func (fd *fileDescription) filesystem() *filesystem {
+ return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
}
-func (fd *fileDescription) inode() *Inode {
- return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+func (fd *fileDescription) inode() *inode {
+ return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
}
// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
@@ -294,6 +295,6 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
if opts.Stat.Mask == 0 {
return nil
}
- // TODO: implement Inode.setStat
+ // TODO: implement inode.setStat
return syserror.EPERM
}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
index 4a3603cc8..55f869798 100644
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ b/pkg/sentry/fsimpl/memfs/regular_file.go
@@ -28,16 +28,16 @@ import (
)
type regularFile struct {
- inode Inode
+ inode inode
mu sync.RWMutex
data []byte
// dataLen is len(data), but accessed using atomic memory operations to
- // avoid locking in Inode.stat().
+ // avoid locking in inode.stat().
dataLen int64
}
-func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inode {
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *inode {
file := &regularFile{}
file.inode.init(file, fs, creds, mode)
file.inode.nlink = 1 // from parent directory
@@ -46,7 +46,6 @@ func (fs *Filesystem) newRegularFile(creds *auth.Credentials, mode uint16) *Inod
type regularFileFD struct {
fileDescription
- vfs.FileDescriptionDefaultImpl
// These are immutable.
readable bool
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
index e002d1727..b2ac2cbeb 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/memfs/symlink.go
@@ -19,11 +19,11 @@ import (
)
type symlink struct {
- inode Inode
+ inode inode
target string // immutable
}
-func (fs *Filesystem) newSymlink(creds *auth.Credentials, target string) *Inode {
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
link := &symlink{
target: target,
}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
new file mode 100644
index 000000000..3d8a4deaf
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -0,0 +1,49 @@
+load("//tools/go_stateify:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "proc",
+ srcs = [
+ "filesystems.go",
+ "loadavg.go",
+ "meminfo.go",
+ "mounts.go",
+ "net.go",
+ "proc.go",
+ "stat.go",
+ "sys.go",
+ "task.go",
+ "version.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/log",
+ "//pkg/sentry/context",
+ "//pkg/sentry/fs",
+ "//pkg/sentry/inet",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/limits",
+ "//pkg/sentry/mm",
+ "//pkg/sentry/socket",
+ "//pkg/sentry/socket/unix",
+ "//pkg/sentry/socket/unix/transport",
+ "//pkg/sentry/usage",
+ "//pkg/sentry/usermem",
+ "//pkg/sentry/vfs",
+ ],
+)
+
+go_test(
+ name = "proc_test",
+ size = "small",
+ srcs = ["net_test.go"],
+ embed = [":proc"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/inet",
+ ],
+)
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go
new file mode 100644
index 000000000..c36c4aff5
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystems.go
@@ -0,0 +1,25 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
+//
+// +stateify savable
+type filesystemsData struct{}
+
+// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for
+// filesystemsData. We would need to retrieve filesystem names from
+// vfs.VirtualFilesystem. This also needs a vfs replacement for
+// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
new file mode 100644
index 000000000..9135afef1
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/loadavg.go
@@ -0,0 +1,40 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// loadavgData backs /proc/loadavg.
+//
+// +stateify savable
+type loadavgData struct{}
+
+var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ // TODO(b/62345059): Include real data in fields.
+ // Column 1-3: CPU and IO utilization over the last 1, 5, and 15 minute periods.
+ // Column 4-5: currently running processes and the total number of processes.
+ // Column 6: the last process ID used.
+ fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
+ return nil
+}
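
A DynamicBytesSource such as loadavgData can be exercised in isolation by calling Generate with a plain bytes.Buffer, the same pattern net_test.go below uses for ifinet6. A minimal sketch, assuming the contexttest helper; the test name and expected string are illustrative and not part of this change:

    package proc

    import (
        "bytes"
        "testing"

        "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
    )

    func TestLoadavgPlaceholder(t *testing.T) {
        d := &loadavgData{}
        var buf bytes.Buffer
        if err := d.Generate(contexttest.Context(t), &buf); err != nil {
            t.Fatalf("Generate failed: %v", err)
        }
        // With no real scheduler data, the file is all zeroes.
        if got, want := buf.String(), "0.00 0.00 0.00 0/0 0\n"; got != want {
            t.Errorf("Generate() wrote %q, want %q", got, want)
        }
    }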
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
new file mode 100644
index 000000000..9a827cd66
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/meminfo.go
@@ -0,0 +1,77 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
+//
+// +stateify savable
+type meminfoData struct {
+ // k is the owning Kernel.
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ mf := d.k.MemoryFile()
+ mf.UpdateUsage()
+ snapshot, totalUsage := usage.MemoryAccounting.Copy()
+ totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+ anon := snapshot.Anonymous + snapshot.Tmpfs
+ file := snapshot.PageCache + snapshot.Mapped
+ // We don't actually have active/inactive LRUs, so just make up numbers.
+ activeFile := (file / 2) &^ (usermem.PageSize - 1)
+ inactiveFile := file - activeFile
+
+ fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024)
+ memFree := (totalSize - totalUsage) / 1024
+ // We use MemFree as MemAvailable because we don't swap.
+ // TODO(rahat): When reclaim is implemented the value of MemAvailable
+ // should change.
+ fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree)
+ fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree)
+ fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices
+ fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024)
+ // Emulate a system with no swap, which disables inactivation of anon pages.
+ fmt.Fprintf(buf, "SwapCache: 0 kB\n")
+ fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024)
+ fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024)
+ fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024)
+ fmt.Fprintf(buf, "Inactive(anon): 0 kB\n")
+ fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024)
+ fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
+ fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263)
+ fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263)
+ fmt.Fprintf(buf, "SwapTotal: 0 kB\n")
+ fmt.Fprintf(buf, "SwapFree: 0 kB\n")
+ fmt.Fprintf(buf, "Dirty: 0 kB\n")
+ fmt.Fprintf(buf, "Writeback: 0 kB\n")
+ fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024)
+ fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
+ fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024)
+ return nil
+}
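
The activeFile computation above rounds down to a page boundary with the bit-clear idiom x &^ (PageSize-1) before splitting the page cache into active and inactive halves. A standalone sketch of that arithmetic, assuming a 4 KiB page size; not part of the change:

    package main

    import "fmt"

    func main() {
        const pageSize = 4096
        file := uint64(10000000)                   // ~9.5 MiB of page cache + mapped memory
        activeFile := (file / 2) &^ (pageSize - 1) // half of it, rounded down to a page boundary
        inactiveFile := file - activeFile
        fmt.Println(activeFile, inactiveFile) // 4997120 5002880
    }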
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/proc/mounts.go
new file mode 100644
index 000000000..e81b1e910
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/mounts.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import "gvisor.dev/gvisor/pkg/sentry/kernel"
+
+// TODO(b/138862512): Implement mountInfoFile and mountsFile.
+
+// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
+//
+// +stateify savable
+type mountInfoFile struct {
+ t *kernel.Task
+}
+
+// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts.
+//
+// +stateify savable
+type mountsFile struct {
+ t *kernel.Task
+}
diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go
new file mode 100644
index 000000000..fd46eebf8
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/net.go
@@ -0,0 +1,338 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
+ "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+ s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+ var lines []string
+ nics := n.s.Interfaces()
+ for id, naddrs := range n.s.InterfaceAddrs() {
+ nic, ok := nics[id]
+ if !ok {
+ // NIC was added after n.s.Interfaces() was called. We'll
+ // just ignore it.
+ continue
+ }
+
+ for _, a := range naddrs {
+ // IPv6 only.
+ if a.Family != linux.AF_INET6 {
+ continue
+ }
+
+ // Fields:
+ // IPv6 address displayed in 32 hexadecimal chars without colons
+ // Netlink device number (interface index) in hexadecimal (use nic id)
+ // Prefix length in hexadecimal
+ // Scope value (use 0)
+ // Interface flags
+ // Device name
+ lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+ }
+ }
+ return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ for _, l := range n.contents() {
+ buf.WriteString(l)
+ }
+ return nil
+}
+
+// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDev struct {
+ s inet.Stack
+}
+
+var _ vfs.DynamicBytesSource = (*netDev)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ interfaces := n.s.Interfaces()
+ buf.WriteString("Inter-| Receive | Transmit\n")
+ buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n")
+
+ for _, i := range interfaces {
+ // Implements the same format as
+ // net/core/net-procfs.c:dev_seq_printf_stats.
+ var stats inet.StatDev
+ if err := n.s.Statistics(&stats, i.Name); err != nil {
+ log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+ continue
+ }
+ fmt.Fprintf(
+ buf,
+ "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+ i.Name,
+ // Received
+ stats[0], // bytes
+ stats[1], // packets
+ stats[2], // errors
+ stats[3], // dropped
+ stats[4], // fifo
+ stats[5], // frame
+ stats[6], // compressed
+ stats[7], // multicast
+ // Transmitted
+ stats[8], // bytes
+ stats[9], // packets
+ stats[10], // errors
+ stats[11], // dropped
+ stats[12], // fifo
+ stats[13], // frame
+ stats[14], // compressed
+ stats[15], // multicast
+ )
+ }
+
+ return nil
+}
+
+// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnix struct {
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netUnix)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n")
+ for _, se := range n.k.ListSockets() {
+ s := se.Sock.Get()
+ if s == nil {
+ log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
+ continue
+ }
+ sfile := s.(*fs.File)
+ if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
+ s.DecRef()
+ // Not a unix socket.
+ continue
+ }
+ sops := sfile.FileOperations.(*unix.SocketOperations)
+
+ addr, err := sops.Endpoint().GetLocalAddress()
+ if err != nil {
+ log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
+ addr.Addr = "<unknown>"
+ }
+
+ sockFlags := 0
+ if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+ if ce.Listening() {
+ // For unix domain sockets, Linux reports a single flag
+ // value, __SO_ACCEPTCON, if the socket is listening.
+ sockFlags = linux.SO_ACCEPTCON
+ }
+ }
+
+ // In the socket entry below, the value for the 'Num' field requires
+ // some consideration. Linux prints the address to the struct
+ // unix_sock representing a socket in the kernel, but may redact the
+ // value for unprivileged users depending on the kptr_restrict
+ // sysctl.
+ //
+ // One use for this field is to allow a privileged user to
+ // introspect into the kernel memory to determine information about
+ // a socket not available through procfs, such as the socket's peer.
+ //
+ // In gVisor, returning a pointer to our internal structures would
+ // be pointless, as it wouldn't match the memory layout for struct
+ // unix_sock, making introspection difficult. We could populate a
+ // struct unix_sock with the appropriate data, but even that
+ // requires consideration for which kernel version to emulate, as
+ // the definition of this struct changes over time.
+ //
+ // For now, we always redact this pointer.
+ fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
+ (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+ sfile.ReadRefs()-1, // RefCount, don't count our own ref.
+ 0, // Protocol, always 0 for UDS.
+ sockFlags, // Flags.
+ sops.Endpoint().Type(), // Type.
+ sops.State(), // State.
+ sfile.InodeID(), // Inode.
+ )
+
+ // Path
+ if len(addr.Addr) != 0 {
+ if addr.Addr[0] == 0 {
+ // Abstract path.
+ fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+ } else {
+ fmt.Fprintf(buf, " %s", string(addr.Addr))
+ }
+ }
+ fmt.Fprintf(buf, "\n")
+
+ s.DecRef()
+ }
+ return nil
+}
+
+// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCP struct {
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*netTCP)(nil)
+
+func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ t := kernel.TaskFromContext(ctx)
+ buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n")
+ for _, se := range n.k.ListSockets() {
+ s := se.Sock.Get()
+ if s == nil {
+ log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
+ continue
+ }
+ sfile := s.(*fs.File)
+ sops, ok := sfile.FileOperations.(socket.Socket)
+ if !ok {
+ panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
+ }
+ if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
+ s.DecRef()
+ // Not tcp4 sockets.
+ continue
+ }
+
+ // Linux's documentation for the fields below can be found at
+ // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+ // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+ // Note that the header doesn't contain labels for all the fields.
+
+ // Field: sl; entry number.
+ fmt.Fprintf(buf, "%4d: ", se.ID)
+
+ portBuf := make([]byte, 2)
+
+ // Field: local_address.
+ var localAddr linux.SockAddrInet
+ if local, _, err := sops.GetSockName(t); err == nil {
+ localAddr = *local.(*linux.SockAddrInet)
+ }
+ binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
+ fmt.Fprintf(buf, "%08X:%04X ",
+ binary.LittleEndian.Uint32(localAddr.Addr[:]),
+ portBuf)
+
+ // Field: rem_address.
+ var remoteAddr linux.SockAddrInet
+ if remote, _, err := sops.GetPeerName(t); err == nil {
+ remoteAddr = *remote.(*linux.SockAddrInet)
+ }
+ binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
+ fmt.Fprintf(buf, "%08X:%04X ",
+ binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
+ portBuf)
+
+ // Field: state; socket state.
+ fmt.Fprintf(buf, "%02X ", sops.State())
+
+ // Field: tx_queue, rx_queue; number of packets in the transmit and
+ // receive queue. Unimplemented.
+ fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+ // Field: tr, tm->when; timer active state and number of jiffies
+ // until timer expires. Unimplemented.
+ fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+ // Field: retrnsmt; number of unrecovered RTO timeouts.
+ // Unimplemented.
+ fmt.Fprintf(buf, "%08X ", 0)
+
+ // Field: uid.
+ uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
+ if err != nil {
+ log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
+ fmt.Fprintf(buf, "%5d ", 0)
+ } else {
+ fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
+ }
+
+ // Field: timeout; number of unanswered 0-window probes.
+ // Unimplemented.
+ fmt.Fprintf(buf, "%8d ", 0)
+
+ // Field: inode.
+ fmt.Fprintf(buf, "%8d ", sfile.InodeID())
+
+ // Field: refcount. Don't count the ref we obtain while dereferencing
+ // the weakref to this socket.
+ fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
+
+ // Field: socket struct address. Redacted for the same reason as
+ // the 'Num' field in /proc/net/unix; see netUnix.Generate.
+ fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+ // Field: retransmit timeout. Unimplemented.
+ fmt.Fprintf(buf, "%d ", 0)
+
+ // Field: predicted tick of soft clock (delayed ACK control data).
+ // Unimplemented.
+ fmt.Fprintf(buf, "%d ", 0)
+
+ // Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+ fmt.Fprintf(buf, "%d ", 0)
+
+ // Field: sending congestion window, Unimplemented.
+ fmt.Fprintf(buf, "%d ", 0)
+
+ // Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+ // Unimplemented, report as large threshold.
+ fmt.Fprintf(buf, "%d", -1)
+
+ fmt.Fprintf(buf, "\n")
+
+ s.DecRef()
+ }
+
+ return nil
+}
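
The local_address and rem_address columns follow Linux's convention of printing the IPv4 address as the hex value of its bytes read little-endian, so 127.0.0.1 renders as 0100007F. A standalone sketch using the standard library's encoding/binary for illustration (the change itself uses gVisor's pkg/binary):

    package main

    import (
        "encoding/binary"
        "fmt"
    )

    func main() {
        addr := [4]byte{127, 0, 0, 1}
        // Bytes 7F 00 00 01 read little-endian give 0x0100007F, matching /proc/net/tcp.
        fmt.Printf("%08X\n", binary.LittleEndian.Uint32(addr[:])) // 0100007F
    }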
diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/net_test.go
new file mode 100644
index 000000000..20a77a8ca
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/net_test.go
@@ -0,0 +1,78 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "reflect"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/inet"
+)
+
+func newIPv6TestStack() *inet.TestStack {
+ s := inet.NewTestStack()
+ s.SupportsIPv6Flag = true
+ return s
+}
+
+func TestIfinet6NoAddresses(t *testing.T) {
+ n := &ifinet6{s: newIPv6TestStack()}
+ var buf bytes.Buffer
+ n.Generate(contexttest.Context(t), &buf)
+ if buf.Len() > 0 {
+ t.Errorf("n.Generate() generated = %v, want = %v", buf.Bytes(), []byte{})
+ }
+}
+
+func TestIfinet6(t *testing.T) {
+ s := newIPv6TestStack()
+ s.InterfacesMap[1] = inet.Interface{Name: "eth0"}
+ s.InterfaceAddrsMap[1] = []inet.InterfaceAddr{
+ {
+ Family: linux.AF_INET6,
+ PrefixLen: 128,
+ Addr: []byte("\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"),
+ },
+ }
+ s.InterfacesMap[2] = inet.Interface{Name: "eth1"}
+ s.InterfaceAddrsMap[2] = []inet.InterfaceAddr{
+ {
+ Family: linux.AF_INET6,
+ PrefixLen: 128,
+ Addr: []byte("\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"),
+ },
+ }
+ want := map[string]struct{}{
+ "000102030405060708090a0b0c0d0e0f 01 80 00 00 eth0\n": {},
+ "101112131415161718191a1b1c1d1e1f 02 80 00 00 eth1\n": {},
+ }
+
+ n := &ifinet6{s: s}
+ contents := n.contents()
+ if len(contents) != len(want) {
+ t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
+ }
+ got := map[string]struct{}{}
+ for _, l := range contents {
+ got[l] = struct{}{}
+ }
+
+ if !reflect.DeepEqual(got, want) {
+ t.Errorf("Got n.contents() = %v, want = %v", got, want)
+ }
+}
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/proc/proc.go
new file mode 100644
index 000000000..31dec36de
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/proc.go
@@ -0,0 +1,16 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
new file mode 100644
index 000000000..720db3828
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/stat.go
@@ -0,0 +1,127 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// cpuStats contains the breakdown of CPU time for /proc/stat.
+type cpuStats struct {
+ // user is time spent in userspace tasks with non-positive niceness.
+ user uint64
+
+ // nice is time spent in userspace tasks with positive niceness.
+ nice uint64
+
+ // system is time spent in non-interrupt kernel context.
+ system uint64
+
+ // idle is time spent idle.
+ idle uint64
+
+ // ioWait is time spent waiting for IO.
+ ioWait uint64
+
+ // irq is time spent in interrupt context.
+ irq uint64
+
+ // softirq is time spent in software interrupt context.
+ softirq uint64
+
+ // steal is involuntary wait time.
+ steal uint64
+
+ // guest is time spent in guests with non-positive niceness.
+ guest uint64
+
+ // guestNice is time spent in guests with positive niceness.
+ guestNice uint64
+}
+
+// String implements fmt.Stringer.
+func (c cpuStats) String() string {
+ return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
+}
+
+// statData implements vfs.DynamicBytesSource for /proc/stat.
+//
+// +stateify savable
+type statData struct {
+ // k is the owning Kernel.
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*statData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ // TODO(b/37226836): We currently export only zero CPU stats. We could
+ // at least provide some aggregate stats.
+ var cpu cpuStats
+ fmt.Fprintf(buf, "cpu %s\n", cpu)
+
+ for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
+ fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
+ }
+
+ // The total number of interrupts is dependent on the CPUs and PCI
+ // devices on the system. See arch_probe_nr_irqs.
+ //
+ // Since we don't report real interrupt stats, just choose an arbitrary
+ // value from a representative VM.
+ const numInterrupts = 256
+
+ // The Kernel doesn't handle real interrupts, so report all zeroes.
+ // TODO(b/37226836): We could count page faults as #PF.
+ fmt.Fprintf(buf, "intr 0") // total
+ for i := 0; i < numInterrupts; i++ {
+ fmt.Fprintf(buf, " 0")
+ }
+ fmt.Fprintf(buf, "\n")
+
+ // Total number of context switches.
+ // TODO(b/37226836): Count this.
+ fmt.Fprintf(buf, "ctxt 0\n")
+
+ // CLOCK_REALTIME timestamp from boot, in seconds.
+ fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
+
+ // Total number of clones.
+ // TODO(b/37226836): Count this.
+ fmt.Fprintf(buf, "processes 0\n")
+
+ // Number of runnable tasks.
+ // TODO(b/37226836): Count this.
+ fmt.Fprintf(buf, "procs_running 0\n")
+
+ // Number of tasks waiting on IO.
+ // TODO(b/37226836): Count this.
+ fmt.Fprintf(buf, "procs_blocked 0\n")
+
+ // Number of each softirq handled.
+ fmt.Fprintf(buf, "softirq 0") // total
+ for i := 0; i < linux.NumSoftIRQ; i++ {
+ fmt.Fprintf(buf, " 0")
+ }
+ fmt.Fprintf(buf, "\n")
+ return nil
+}
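
A quick sketch of what cpuStats.String produces: with all counters zero, every "cpu" line is ten zero-valued fields. The test below is hypothetical and only restates the format above:

    package proc

    import "testing"

    func TestCPUStatsString(t *testing.T) {
        var c cpuStats // all fields zero
        if got, want := c.String(), "0 0 0 0 0 0 0 0 0 0"; got != want {
            t.Errorf("cpuStats.String() = %q, want %q", got, want)
        }
    }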
diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go
new file mode 100644
index 000000000..b88256e12
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/sys.go
@@ -0,0 +1,51 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// mmapMinAddrData implements vfs.DynamicBytesSource for
+// /proc/sys/vm/mmap_min_addr.
+//
+// +stateify savable
+type mmapMinAddrData struct {
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
+ return nil
+}
+
+// +stateify savable
+type overcommitMemory struct{}
+
+var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "0\n")
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
new file mode 100644
index 000000000..c46e05c3a
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -0,0 +1,261 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/limits"
+ "gvisor.dev/gvisor/pkg/sentry/mm"
+ "gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// mapsCommon is embedded by mapsData and smapsData.
+type mapsCommon struct {
+ t *kernel.Task
+}
+
+// mm gets the kernel task's MemoryManager. No additional reference is taken on
+// mm here. This is safe because MemoryManager.destroy is required to leave the
+// MemoryManager in a state where it's still usable as a DynamicBytesSource.
+func (md *mapsCommon) mm() *mm.MemoryManager {
+ var tmm *mm.MemoryManager
+ md.t.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ tmm = mm
+ }
+ })
+ return tmm
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+ mapsCommon
+}
+
+var _ vfs.DynamicBytesSource = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if mm := md.mm(); mm != nil {
+ mm.ReadMapsDataInto(ctx, buf)
+ }
+ return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+ mapsCommon
+}
+
+var _ vfs.DynamicBytesSource = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ if mm := sd.mm(); mm != nil {
+ mm.ReadSmapsDataInto(ctx, buf)
+ }
+ return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+ t *kernel.Task
+
+ // If tgstats is true, accumulate fault stats (not implemented) and CPU
+ // time across all tasks in t's thread group.
+ tgstats bool
+
+ // pidns is the PID namespace associated with the proc filesystem that
+ // includes the file using this taskStatData.
+ pidns *kernel.PIDNamespace
+}
+
+var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
+ fmt.Fprintf(buf, "(%s) ", s.t.Name())
+ fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
+ ppid := kernel.ThreadID(0)
+ if parent := s.t.Parent(); parent != nil {
+ ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+ }
+ fmt.Fprintf(buf, "%d ", ppid)
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
+ fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
+ fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+ fmt.Fprintf(buf, "0 " /* flags */)
+ fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+ var cputime usage.CPUStats
+ if s.tgstats {
+ cputime = s.t.ThreadGroup().CPUStats()
+ } else {
+ cputime = s.t.CPUStats()
+ }
+ fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+ cputime = s.t.ThreadGroup().JoinedChildCPUStats()
+ fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+ fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
+ fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
+
+ // itrealvalue. Since kernel 2.6.17, this field is no longer
+ // maintained, and is hard coded as 0.
+ fmt.Fprintf(buf, "0 ")
+
+ // Start time is relative to boot time, expressed in clock ticks.
+ fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
+
+ var vss, rss uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
+ })
+ fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+
+ // rsslim.
+ fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+ fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+ fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+ fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+ terminationSignal := linux.Signal(0)
+ if s.t == s.t.ThreadGroup().Leader() {
+ terminationSignal = s.t.ThreadGroup().TerminationSignal()
+ }
+ fmt.Fprintf(buf, "%d ", terminationSignal)
+ fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+ fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+ fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+ fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+ return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+ t *kernel.Task
+}
+
+var _ vfs.DynamicBytesSource = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ var vss, rss uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ }
+ })
+
+ fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
+ return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+ t *kernel.Task
+ pidns *kernel.PIDNamespace
+}
+
+var _ vfs.DynamicBytesSource = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
+ fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
+ fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
+ fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
+ ppid := kernel.ThreadID(0)
+ if parent := s.t.Parent(); parent != nil {
+ ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+ }
+ fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+ tpid := kernel.ThreadID(0)
+ if tracer := s.t.Tracer(); tracer != nil {
+ tpid = s.pidns.IDOfTask(tracer)
+ }
+ fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+ var fds int
+ var vss, rss, data uint64
+ s.t.WithMuLocked(func(t *kernel.Task) {
+ if fdTable := t.FDTable(); fdTable != nil {
+ fds = fdTable.Size()
+ }
+ if mm := t.MemoryManager(); mm != nil {
+ vss = mm.VirtualMemorySize()
+ rss = mm.ResidentSetSize()
+ data = mm.VirtualDataSize()
+ }
+ })
+ fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+ fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
+ fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+ fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+ fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
+ creds := s.t.Credentials()
+ fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+ fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+ fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+ fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+ fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
+ return nil
+}
+
+// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
+type ioUsage interface {
+ // IOUsage returns the io usage data.
+ IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+ ioUsage
+}
+
+var _ vfs.DynamicBytesSource = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ io := usage.IO{}
+ io.Accumulate(i.IOUsage())
+
+ fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
+ fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+ fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+ fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+ fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+ fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+ fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+ return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
new file mode 100644
index 000000000..e1643d4e0
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/version.go
@@ -0,0 +1,68 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+ "bytes"
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// versionData implements vfs.DynamicBytesSource for /proc/version.
+//
+// +stateify savable
+type versionData struct {
+ // k is the owning Kernel.
+ k *kernel.Kernel
+}
+
+var _ vfs.DynamicBytesSource = (*versionData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ init := v.k.GlobalInit()
+ if init == nil {
+ // Attempted to read before the init Task is created. This can
+ // only occur during startup, which should never need to read
+ // this file.
+ panic("Attempted to read version before initial Task is available")
+ }
+
+ // /proc/version takes the form:
+ //
+ // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
+ // (COMPILER_VERSION) VERSION"
+ //
+ // where:
+ // - SYSNAME, RELEASE, and VERSION are the same as returned by
+ // sys_utsname
+// - COMPILE_USER is the user that built the kernel
+ // - COMPILE_HOST is the hostname of the machine on which the kernel
+ // was built
+ // - COMPILER_VERSION is the version reported by the building compiler
+ //
+ // Since we don't really want to expose build information to
+ // applications, those fields are omitted.
+ //
+ // FIXME(mpratt): Using Version from the init task SyscallTable
+ // disregards the different version a task may have (e.g., in a uts
+ // namespace).
+ ver := init.Leader().SyscallTable().Version
+ fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
+ return nil
+}
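
The resulting /proc/version line is simply "SYSNAME version RELEASE VERSION" with the build fields dropped. A standalone sketch with made-up stand-in strings; the real values come from the init task's SyscallTable:

    package main

    import "fmt"

    func main() {
        // Stand-in values; the sentry reports whatever its syscall table declares.
        sysname, release, version := "Linux", "4.4.0", "#1 SMP"
        fmt.Printf("%s version %s %s\n", sysname, release, version) // Linux version 4.4.0 #1 SMP
    }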
diff --git a/pkg/sentry/inet/inet.go b/pkg/sentry/inet/inet.go
index 5b75a4a06..80f227dbe 100644
--- a/pkg/sentry/inet/inet.go
+++ b/pkg/sentry/inet/inet.go
@@ -52,12 +52,16 @@ type Stack interface {
// Statistics reports stack statistics.
Statistics(stat interface{}, arg string) error
+
+ // RouteTable returns the network stack's route table.
+ RouteTable() []Route
+
+ // Resume restarts the network stack after restore.
+ Resume()
}
// Interface contains information about a network interface.
type Interface struct {
- // Keep these fields sorted in the order they appear in rtnetlink(7).
-
// DeviceType is the device type, a Linux ARPHRD_* constant.
DeviceType uint16
@@ -77,8 +81,6 @@ type Interface struct {
// InterfaceAddr contains information about a network interface address.
type InterfaceAddr struct {
- // Keep these fields sorted in the order they appear in rtnetlink(7).
-
// Family is the address family, a Linux AF_* constant.
Family uint8
@@ -109,3 +111,45 @@ type TCPBufferSize struct {
// StatDev describes one line of /proc/net/dev, i.e., stats for one network
// interface.
type StatDev [16]uint64
+
+// Route contains information about a network route.
+type Route struct {
+ // Family is the address family, a Linux AF_* constant.
+ Family uint8
+
+ // DstLen is the length of the destination address.
+ DstLen uint8
+
+ // SrcLen is the length of the source address.
+ SrcLen uint8
+
+ // TOS is the Type of Service filter.
+ TOS uint8
+
+ // Table is the routing table ID.
+ Table uint8
+
+ // Protocol is the route origin, a Linux RTPROT_* constant.
+ Protocol uint8
+
+ // Scope is the distance to destination, a Linux RT_SCOPE_* constant.
+ Scope uint8
+
+ // Type is the route type, a Linux RTN_* constant.
+ Type uint8
+
+ // Flags are route flags. See rtnetlink(7) under "rtm_flags".
+ Flags uint32
+
+ // DstAddr is the route destination address (RTA_DST).
+ DstAddr []byte
+
+ // SrcAddr is the route source address (RTA_SRC).
+ SrcAddr []byte
+
+ // OutputInterface is the output interface index (RTA_OIF).
+ OutputInterface int32
+
+ // GatewayAddr is the route gateway address (RTA_GATEWAY).
+ GatewayAddr []byte
+}
diff --git a/pkg/sentry/inet/test_stack.go b/pkg/sentry/inet/test_stack.go
index 75f9e7a77..b9eed7c3a 100644
--- a/pkg/sentry/inet/test_stack.go
+++ b/pkg/sentry/inet/test_stack.go
@@ -18,6 +18,7 @@ package inet
type TestStack struct {
InterfacesMap map[int32]Interface
InterfaceAddrsMap map[int32][]InterfaceAddr
+ RouteList []Route
SupportsIPv6Flag bool
TCPRecvBufSize TCPBufferSize
TCPSendBufSize TCPBufferSize
@@ -86,3 +87,12 @@ func (s *TestStack) SetTCPSACKEnabled(enabled bool) error {
func (s *TestStack) Statistics(stat interface{}, arg string) error {
return nil
}
+
+// RouteTable implements Stack.RouteTable.
+func (s *TestStack) RouteTable() []Route {
+ return s.RouteList
+}
+
+// Resume implements Stack.Resume.
+func (s *TestStack) Resume() {
+}
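
A hedged sketch of exercising the new Route plumbing through TestStack: populate RouteList and read it back via the Stack interface. The Protocol/Scope/Type values are Linux's RTPROT_STATIC (4), RT_SCOPE_UNIVERSE (0), and RTN_UNICAST (1) written as literals, since named constants for them are not assumed to exist in pkg/abi/linux; the test itself is illustrative and not part of the change:

    package inet_test

    import (
        "testing"

        "gvisor.dev/gvisor/pkg/abi/linux"
        "gvisor.dev/gvisor/pkg/sentry/inet"
    )

    func TestRouteTable(t *testing.T) {
        s := inet.NewTestStack()
        s.RouteList = []inet.Route{{
            Family:          linux.AF_INET,
            DstLen:          0, // default route: zero-length destination prefix
            Protocol:        4, // RTPROT_STATIC
            Scope:           0, // RT_SCOPE_UNIVERSE
            Type:            1, // RTN_UNICAST
            OutputInterface: 2,
            GatewayAddr:     []byte{10, 0, 0, 1},
        }}
        if got := s.RouteTable(); len(got) != 1 || got[0].OutputInterface != 2 {
            t.Errorf("RouteTable() = %+v, want the single configured route", got)
        }
    }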
diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD
index e61d39c82..41bee9a22 100644
--- a/pkg/sentry/kernel/BUILD
+++ b/pkg/sentry/kernel/BUILD
@@ -144,6 +144,7 @@ go_library(
"threads.go",
"timekeeper.go",
"timekeeper_state.go",
+ "tty.go",
"uts_namespace.go",
"vdso.go",
"version.go",
diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go
index 4c2d48e65..8c1f79ab5 100644
--- a/pkg/sentry/kernel/kernel.go
+++ b/pkg/sentry/kernel/kernel.go
@@ -112,11 +112,6 @@ type Kernel struct {
rootIPCNamespace *IPCNamespace
rootAbstractSocketNamespace *AbstractSocketNamespace
- // mounts holds the state of the virtual filesystem. mounts is initially
- // nil, and must be set by calling Kernel.SetRootMountNamespace before
- // Kernel.CreateProcess can succeed.
- mounts *fs.MountNamespace
-
// futexes is the "root" futex.Manager, from which all others are forked.
// This is necessary to ensure that shared futexes are coherent across all
// tasks, including those created by CreateProcess.
@@ -197,6 +192,15 @@ type Kernel struct {
// caches. Not all caches use it, only the caches that use host resources use
// the limiter. It may be nil if disabled.
DirentCacheLimiter *fs.DirentCacheLimiter
+
+ // unimplementedSyscallEmitterOnce is used in the initialization of
+ // unimplementedSyscallEmitter.
+ unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`
+
+ // unimplementedSyscallEmitter is used to emit unimplemented syscall
+ // events. This is initialized lazily on the first unimplemented
+ // syscall.
+ unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
}
// InitKernelArgs holds arguments to Init.
@@ -290,7 +294,6 @@ func (k *Kernel) Init(args InitKernelArgs) error {
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
k.futexes = futex.NewManager()
k.netlinkPorts = port.New()
-
return nil
}
@@ -384,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
- // Flush all mount sources for currently mounted filesystems in the
- // root mount namespace.
- k.mounts.FlushMountSourceRefs()
-
- // Some tasks may have other mount namespaces; flush those as well.
+ // Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
@@ -497,7 +496,7 @@ func (ts *TaskSet) unregisterEpollWaiters() {
}
// LoadFrom returns a new Kernel loaded from args.
-func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
+func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
loadStart := time.Now()
k.networkStack = net
@@ -541,6 +540,11 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
log.Infof("Overall load took [%s]", time.Since(loadStart))
+ k.Timekeeper().SetClocks(clocks)
+ if net != nil {
+ net.Resume()
+ }
+
// Ensure that all pending asynchronous work is complete:
// - namedpipe opening
// - inode file opening
@@ -550,7 +554,7 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
tcpip.AsyncLoading.Wait()
- log.Infof("Overall load took [%s]", time.Since(loadStart))
+ log.Infof("Overall load took [%s] after async work", time.Since(loadStart))
// Applications may size per-cpu structures based on k.applicationCores, so
// it can't change across save/restore. When we are virtualizing CPU
@@ -565,16 +569,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
return nil
}
-// Destroy releases resources owned by k.
-//
-// Preconditions: There must be no task goroutines running in k.
-func (k *Kernel) Destroy() {
- if k.mounts != nil {
- k.mounts.DecRef()
- k.mounts = nil
- }
-}
-
// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
id := atomic.AddUint64(&k.uniqueID, 1)
@@ -586,11 +580,17 @@ func (k *Kernel) UniqueID() uint64 {
// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
- // Filename is the filename to load.
+ // Filename is the filename to load as the init binary.
//
- // If this is provided as "", then the file will be guessed via Argv[0].
+ // If this is provided as "", File will be checked, then the file will be
+ // guessed via Argv[0].
Filename string
+ // File is a passed host FD pointing to a file to load as the init binary.
+ //
+ // This is checked if and only if Filename is "".
+ File *fs.File
+
 // Argv is a list of arguments.
Argv []string
@@ -632,19 +632,12 @@ type CreateProcessArgs struct {
AbstractSocketNamespace *AbstractSocketNamespace
// MountNamespace optionally contains the mount namespace for this
- // process. If nil, the kernel's mount namespace is used.
+ // process. If nil, the init process's mount namespace is used.
//
// Anyone setting MountNamespace must donate a reference (i.e.
// increment it).
MountNamespace *fs.MountNamespace
- // Root optionally contains the dirent that serves as the root for the
- // process. If nil, the mount namespace's root is used as the process'
- // root.
- //
- // Anyone setting Root must donate a reference (i.e. increment it).
- Root *fs.Dirent
-
// ContainerID is the container that the process belongs to.
ContainerID string
}
@@ -682,16 +675,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
- if ctx.args.Root != nil {
- // Take a reference on the root dirent that will be
- // given to the caller.
- ctx.args.Root.IncRef()
- return ctx.args.Root
- }
- if ctx.k.mounts != nil {
- // MountNamespace.Root() will take a reference on the
- // root dirent for us.
- return ctx.k.mounts.Root()
+ if ctx.args.MountNamespace != nil {
+ // MountNamespace.Root() will take a reference on the root
+ // dirent for us.
+ return ctx.args.MountNamespace.Root()
}
return nil
case fs.CtxDirentCacheLimiter:
@@ -735,30 +722,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
defer k.extMu.Unlock()
log.Infof("EXEC: %v", args.Argv)
- if k.mounts == nil {
- return nil, 0, fmt.Errorf("no kernel MountNamespace")
- }
-
// Grab the mount namespace.
mounts := args.MountNamespace
if mounts == nil {
- // If no MountNamespace was configured, then use the kernel's
- // root mount namespace, with an extra reference that will be
- // donated to the task.
- mounts = k.mounts
+ mounts = k.GlobalInit().Leader().MountNamespace()
mounts.IncRef()
}
tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
- // Grab the root directory.
- root := args.Root
- if root == nil {
- // If no Root was configured, then get it from the
- // MountNamespace.
- root = mounts.Root()
- }
+ // Get the root directory from the MountNamespace.
+ root := mounts.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef()
@@ -768,15 +743,23 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
wd := root // Default.
if args.WorkingDirectory != "" {
var err error
- wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
+ wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
defer wd.DecRef()
}
- if args.Filename == "" {
- // Was anything provided?
+ // Check which file to start from.
+ switch {
+ case args.Filename != "":
+ // If a filename is given, take that.
+ // Set File to nil so we resolve the path in LoadTaskImage.
+ args.File = nil
+ case args.File != nil:
+ // If File is set, take the File provided directly.
+ default:
+ // Otherwise look at Argv and see if the first argument is a valid path.
if len(args.Argv) == 0 {
return nil, 0, fmt.Errorf("no filename or command provided")
}
@@ -788,7 +771,8 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
// Create a fresh task context.
remainingTraversals = uint(args.MaxSymlinkTraversals)
- tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.Argv, args.Envv, k.featureSet)
+
+ tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
if se != nil {
return nil, 0, errors.New(se.String())
}
@@ -1032,20 +1016,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
return k.rootAbstractSocketNamespace
}
-// RootMountNamespace returns the MountNamespace.
-func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
- k.extMu.Lock()
- defer k.extMu.Unlock()
- return k.mounts
-}
-
-// SetRootMountNamespace sets the MountNamespace.
-func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
- k.extMu.Lock()
- defer k.extMu.Unlock()
- k.mounts = mounts
-}
-
// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
@@ -1168,16 +1138,6 @@ func (k *Kernel) SupervisorContext() context.Context {
}
}
-// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
-// channel.
-func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
- t := TaskFromContext(ctx)
- eventchannel.Emit(&uspb.UnimplementedSyscall{
- Tid: int32(t.ThreadID()),
- Registers: t.Arch().StateData().Proto(),
- })
-}
-
// SocketEntry represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
@@ -1246,7 +1206,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
case fs.CtxRoot:
- return ctx.k.mounts.Root()
+ if ctx.k.globalInit != nil {
+ return ctx.k.globalInit.mounts.Root()
+ }
+ return nil
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
@@ -1272,3 +1235,23 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return nil
}
}
+
+// Rate limits for the number of unimplemented syscall events.
+const (
+ unimplementedSyscallsMaxRate = 100 // events per second
+ unimplementedSyscallBurst = 1000 // events
+)
+
+// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
+// channel.
+func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
+ k.unimplementedSyscallEmitterOnce.Do(func() {
+ k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
+ })
+
+ t := TaskFromContext(ctx)
+ k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
+ Tid: int32(t.ThreadID()),
+ Registers: t.Arch().StateData().Proto(),
+ })
+}
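
The Filename/File/Argv[0] precedence that CreateProcess now applies can be restated as a small helper. resolveExecutable below is hypothetical, written only to summarize the switch above, and is not part of the change:

    package kernel

    import (
        "fmt"

        "gvisor.dev/gvisor/pkg/sentry/fs"
    )

    // resolveExecutable mirrors CreateProcess's precedence: an explicit
    // Filename wins, then a donated open File, then Argv[0].
    func resolveExecutable(filename string, file *fs.File, argv []string) (string, *fs.File, error) {
        switch {
        case filename != "":
            return filename, nil, nil // explicit path; any passed file is ignored
        case file != nil:
            return "", file, nil // load directly from the donated open file
        case len(argv) > 0:
            return argv[0], nil, nil // fall back to argv[0] as the path to resolve
        default:
            return "", nil, fmt.Errorf("no filename or command provided")
        }
    }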
diff --git a/pkg/sentry/kernel/sessions.go b/pkg/sentry/kernel/sessions.go
index 81fcd8258..e5f297478 100644
--- a/pkg/sentry/kernel/sessions.go
+++ b/pkg/sentry/kernel/sessions.go
@@ -47,6 +47,11 @@ type Session struct {
// The id is immutable.
id SessionID
+ // foreground is the foreground process group.
+ //
+ // This is protected by TaskSet.mu.
+ foreground *ProcessGroup
+
// ProcessGroups is a list of process groups in this Session. This is
// protected by TaskSet.mu.
processGroups processGroupList
@@ -260,12 +265,14 @@ func (pg *ProcessGroup) SendSignal(info *arch.SignalInfo) error {
func (tg *ThreadGroup) CreateSession() error {
tg.pidns.owner.mu.Lock()
defer tg.pidns.owner.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
return tg.createSession()
}
// createSession creates a new session for a threadgroup.
//
-// Precondition: callers must hold TaskSet.mu for writing.
+// Precondition: callers must hold TaskSet.mu and the signal mutex for writing.
func (tg *ThreadGroup) createSession() error {
// Get the ID for this thread in the current namespace.
id := tg.pidns.tgids[tg]
@@ -346,6 +353,9 @@ func (tg *ThreadGroup) createSession() error {
ns.processGroups[ProcessGroupID(local)] = pg
}
+ // Disconnect from the controlling terminal.
+ tg.tty = nil
+
return nil
}
diff --git a/pkg/sentry/kernel/task_block.go b/pkg/sentry/kernel/task_block.go
index 2a2e6f662..dd69939f9 100644
--- a/pkg/sentry/kernel/task_block.go
+++ b/pkg/sentry/kernel/task_block.go
@@ -15,6 +15,7 @@
package kernel
import (
+ "runtime"
"time"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
@@ -121,6 +122,17 @@ func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error {
 // Deactivate our address space; we don't need it.
interrupt := t.SleepStart()
+ // If the request is not completed, but the timer has already expired,
+ // then ensure that we run through a scheduler cycle. This is because
+ // we may see applications relying on timer slack to yield the thread.
+ // For example, an application may attempt to sleep for some number of
+ // nanoseconds, expecting that this actually yields the CPU and sleeps
+ // for at least a few microseconds, e.g.:
+ // https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07
+ if len(timerChan) > 0 {
+ runtime.Gosched()
+ }
+
select {
case <-C:
t.SleepFinish(true)
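
The new check in Task.block follows a simple pattern: if the timer channel already holds a tick when the task is about to block, yield to the Go scheduler first so that even a nominally expired sleep gives up the CPU once. A simplified, standalone sketch of that pattern (not the sentry's Task.block):

    package main

    import (
        "fmt"
        "runtime"
        "time"
    )

    // sleepAtLeast blocks on a timer, yielding first if the timer has
    // already fired, so even a nominally expired sleep gives up the CPU.
    func sleepAtLeast(d time.Duration) {
        timer := time.NewTimer(d)
        defer timer.Stop()
        if len(timer.C) > 0 {
            runtime.Gosched()
        }
        <-timer.C
    }

    func main() {
        start := time.Now()
        sleepAtLeast(time.Microsecond)
        fmt.Println("slept for", time.Since(start))
    }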
diff --git a/pkg/sentry/kernel/task_context.go b/pkg/sentry/kernel/task_context.go
index 54b1676b0..8639d379f 100644
--- a/pkg/sentry/kernel/task_context.go
+++ b/pkg/sentry/kernel/task_context.go
@@ -140,15 +140,22 @@ func (t *Task) Stack() *arch.Stack {
// * wd: Working directory to lookup filename under
// * maxTraversals: maximum number of symlinks to follow
// * filename: path to binary to load
+// * file: an open fs.File object of the binary to load. If set,
+// file is loaded rather than the path in filename.
// * argv: Binary argv
// * envv: Binary envv
// * fs: Binary FeatureSet
-func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+func (k *Kernel) LoadTaskImage(ctx context.Context, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, filename string, file *fs.File, argv, envv []string, fs *cpuid.FeatureSet) (*TaskContext, *syserr.Error) {
+ // If File is not nil, we should load that instead of resolving filename.
+ if file != nil {
+ filename = file.MappedName(ctx)
+ }
+
// Prepare a new user address space to load into.
m := mm.NewMemoryManager(k, k)
defer m.DecUsers(ctx)
- os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv, envv, k.extraAuxv, k.vdso)
+ os, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv, envv, k.extraAuxv, k.vdso)
if err != nil {
return nil, err
}
diff --git a/pkg/sentry/kernel/task_start.go b/pkg/sentry/kernel/task_start.go
index d60cd62c7..ae6fc4025 100644
--- a/pkg/sentry/kernel/task_start.go
+++ b/pkg/sentry/kernel/task_start.go
@@ -172,9 +172,10 @@ func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
if parentPG := tg.parentPG(); parentPG == nil {
tg.createSession()
} else {
- // Inherit the process group.
+ // Inherit the process group and terminal.
parentPG.incRefWithParent(parentPG)
tg.processGroup = parentPG
+ tg.tty = t.parent.tg.tty
}
}
tg.tasks.PushBack(t)
diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go
index 2a97e3e8e..0eef24bfb 100644
--- a/pkg/sentry/kernel/thread_group.go
+++ b/pkg/sentry/kernel/thread_group.go
@@ -19,10 +19,13 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
+ "gvisor.dev/gvisor/pkg/syserror"
)
// A ThreadGroup is a logical grouping of tasks that has widespread
@@ -245,6 +248,12 @@ type ThreadGroup struct {
//
// mounts is immutable.
mounts *fs.MountNamespace
+
+ // tty is the thread group's controlling terminal. If nil, there is no
+ // controlling terminal.
+ //
+ // tty is protected by the signal mutex.
+ tty *TTY
}
// newThreadGroup returns a new, empty thread group in PID namespace ns. The
@@ -324,6 +333,176 @@ func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
}
}
+// SetControllingTTY sets tty as the controlling terminal of tg.
+func (tg *ThreadGroup) SetControllingTTY(tty *TTY, arg int32) error {
+ tty.mu.Lock()
+ defer tty.mu.Unlock()
+
+ // We might be asked to set the controlling terminal of multiple
+ // processes, so we lock both the TaskSet and SignalHandlers.
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+
+ // "The calling process must be a session leader and not have a
+ // controlling terminal already." - tty_ioctl(4)
+ if tg.processGroup.session.leader != tg || tg.tty != nil {
+ return syserror.EINVAL
+ }
+
+ // "If this terminal is already the controlling terminal of a different
+ // session group, then the ioctl fails with EPERM, unless the caller
+ // has the CAP_SYS_ADMIN capability and arg equals 1, in which case the
+ // terminal is stolen, and all processes that had it as controlling
+ // terminal lose it." - tty_ioctl(4)
+ if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
+ if !auth.CredentialsFromContext(tg.leader).HasCapability(linux.CAP_SYS_ADMIN) || arg != 1 {
+ return syserror.EPERM
+ }
+ // Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
+ for othertg := range tg.pidns.owner.Root.tgids {
+ // This won't deadlock by locking tg.signalHandlers
+ // because at this point:
+ // - We only lock signalHandlers if it's in the same
+ // session as the tty's controlling thread group.
+ // - We know that the calling thread group is not in
+ // the same session as the tty's controlling thread
+ // group.
+ if othertg.processGroup.session == tty.tg.processGroup.session {
+ othertg.signalHandlers.mu.Lock()
+ othertg.tty = nil
+ othertg.signalHandlers.mu.Unlock()
+ }
+ }
+ }
+
+ // Set the controlling terminal and foreground process group.
+ tg.tty = tty
+ tg.processGroup.session.foreground = tg.processGroup
+ // Set this as the controlling process of the terminal.
+ tty.tg = tg
+
+ return nil
+}
+
+// ReleaseControllingTTY gives up tty as the controlling tty of tg.
+func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
+ tty.mu.Lock()
+ defer tty.mu.Unlock()
+
+	// We might be asked to release the controlling terminal of multiple
+	// processes, so we lock both the TaskSet and SignalHandlers.
+ tg.pidns.owner.mu.RLock()
+ defer tg.pidns.owner.mu.RUnlock()
+
+ // Just below, we may re-lock signalHandlers in order to send signals.
+ // Thus we can't defer Unlock here.
+ tg.signalHandlers.mu.Lock()
+
+ if tg.tty == nil || tg.tty != tty {
+ tg.signalHandlers.mu.Unlock()
+ return syserror.ENOTTY
+ }
+
+ // "If the process was session leader, then send SIGHUP and SIGCONT to
+ // the foreground process group and all processes in the current
+ // session lose their controlling terminal." - tty_ioctl(4)
+	// Remove tty as the controlling tty for each process in the session,
+	// then send SIGHUP and SIGCONT to the foreground process group.
+
+ // If we're not the session leader, we don't have to do much.
+ if tty.tg != tg {
+ tg.tty = nil
+ tg.signalHandlers.mu.Unlock()
+ return nil
+ }
+
+ tg.signalHandlers.mu.Unlock()
+
+ // We're the session leader. SIGHUP and SIGCONT the foreground process
+ // group and remove all controlling terminals in the session.
+ var lastErr error
+ for othertg := range tg.pidns.owner.Root.tgids {
+ if othertg.processGroup.session == tg.processGroup.session {
+ othertg.signalHandlers.mu.Lock()
+ othertg.tty = nil
+ if othertg.processGroup == tg.processGroup.session.foreground {
+ if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
+ lastErr = err
+ }
+ if err := othertg.leader.sendSignalLocked(&arch.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
+ lastErr = err
+ }
+ }
+ othertg.signalHandlers.mu.Unlock()
+ }
+ }
+
+ return lastErr
+}
+
+// ForegroundProcessGroup returns the process group ID of the foreground
+// process group.
+func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
+ tty.mu.Lock()
+ defer tty.mu.Unlock()
+
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+
+ // "When fd does not refer to the controlling terminal of the calling
+ // process, -1 is returned" - tcgetpgrp(3)
+ if tg.tty != tty {
+ return -1, syserror.ENOTTY
+ }
+
+ return int32(tg.processGroup.session.foreground.id), nil
+}
+
+// SetForegroundProcessGroup sets the foreground process group of tty to pgid.
+func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) {
+ tty.mu.Lock()
+ defer tty.mu.Unlock()
+
+ tg.pidns.owner.mu.Lock()
+ defer tg.pidns.owner.mu.Unlock()
+ tg.signalHandlers.mu.Lock()
+ defer tg.signalHandlers.mu.Unlock()
+
+ // TODO(b/129283598): "If tcsetpgrp() is called by a member of a
+ // background process group in its session, and the calling process is
+ // not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all
+ // members of this background process group."
+
+ // tty must be the controlling terminal.
+ if tg.tty != tty {
+ return -1, syserror.ENOTTY
+ }
+
+ // pgid must be positive.
+ if pgid < 0 {
+ return -1, syserror.EINVAL
+ }
+
+ // pg must not be empty. Empty process groups are removed from their
+ // pid namespaces.
+ pg, ok := tg.pidns.processGroups[pgid]
+ if !ok {
+ return -1, syserror.ESRCH
+ }
+
+ // pg must be part of this process's session.
+ if tg.processGroup.session != pg.session {
+ return -1, syserror.EPERM
+ }
+
+	tg.processGroup.session.foreground = pg
+ return 0, nil
+}
+
// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
//
// +stateify savable
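
The four ThreadGroup methods added above back the TIOCSCTTY, TIOCNOTTY, TIOCGPGRP and TIOCSPGRP ioctls. A minimal sketch of how a terminal ioctl handler might dispatch to them; the handler shape and its argument plumbing are assumptions for illustration (a real handler copies the pgid to and from user memory):

    // Sketch only: dispatch controlling-terminal ioctls to the ThreadGroup
    // methods introduced above. handleTTYIoctl is hypothetical.
    func handleTTYIoctl(t *kernel.Task, tty *kernel.TTY, req uint32, arg int32) error {
        tg := t.ThreadGroup()
        switch req {
        case linux.TIOCSCTTY:
            // arg == 1 lets a CAP_SYS_ADMIN caller steal the terminal.
            return tg.SetControllingTTY(tty, arg)
        case linux.TIOCNOTTY:
            return tg.ReleaseControllingTTY(tty)
        case linux.TIOCGPGRP:
            pgid, err := tg.ForegroundProcessGroup(tty)
            _ = pgid // a real handler writes pgid back to user memory
            return err
        case linux.TIOCSPGRP:
            _, err := tg.SetForegroundProcessGroup(tty, kernel.ProcessGroupID(arg))
            return err
        default:
            return syserror.ENOTTY
        }
    }
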
diff --git a/pkg/sentry/kernel/tty.go b/pkg/sentry/kernel/tty.go
new file mode 100644
index 000000000..34f84487a
--- /dev/null
+++ b/pkg/sentry/kernel/tty.go
@@ -0,0 +1,28 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernel
+
+import "sync"
+
+// TTY defines the relationship between a thread group and its controlling
+// terminal.
+//
+// +stateify savable
+type TTY struct {
+ mu sync.Mutex `state:"nosave"`
+
+ // tg is protected by mu.
+ tg *ThreadGroup
+}
diff --git a/pkg/sentry/loader/elf.go b/pkg/sentry/loader/elf.go
index bc5b841fb..ba9c9ce12 100644
--- a/pkg/sentry/loader/elf.go
+++ b/pkg/sentry/loader/elf.go
@@ -464,7 +464,7 @@ func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f *fs.File, info el
// base address big enough to fit all segments, so we first create a
// mapping for the total size just to find a region that is big enough.
//
- // It is safe to unmap it immediately with racing with another mapping
+ // It is safe to unmap it immediately without racing with another mapping
// because we are the only one in control of the MemoryManager.
//
// Note that the vaddr of the first PT_LOAD segment is ignored when
diff --git a/pkg/sentry/loader/loader.go b/pkg/sentry/loader/loader.go
index baa12d9a0..f6f1ae762 100644
--- a/pkg/sentry/loader/loader.go
+++ b/pkg/sentry/loader/loader.go
@@ -67,8 +67,64 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
if err != nil {
return nil, nil, err
}
+
+	// openFile takes its own reference to the Dirent, so drop ours when done.
defer d.DecRef()
+ return openFile(ctx, nil, d, name)
+}
+
+// openFile performs checks on a file to be executed. If provided a *fs.File,
+// openFile takes that file's Dirent and performs checks on it. If provided a
+// *fs.Dirent and not a *fs.File, it creates a *fs.File object from the Dirent's
+// Inode and performs checks on that.
+//
+// openFile returns an *fs.File and *fs.Dirent, and the caller takes ownership
+// of both.
+//
+// "dirent" and "file" must not both be nil and point to a readable, executable, regular file.
+func openFile(ctx context.Context, file *fs.File, dirent *fs.Dirent, name string) (*fs.Dirent, *fs.File, error) {
+ // file and dirent must not be nil.
+ if dirent == nil && file == nil {
+ ctx.Infof("dirent and file cannot both be nil.")
+ return nil, nil, syserror.ENOENT
+ }
+
+ if file != nil {
+ dirent = file.Dirent
+ }
+
+ // Perform permissions checks on the file.
+ if err := checkFile(ctx, dirent, name); err != nil {
+ return nil, nil, err
+ }
+
+ if file == nil {
+ var ferr error
+ if file, ferr = dirent.Inode.GetFile(ctx, dirent, fs.FileFlags{Read: true}); ferr != nil {
+ return nil, nil, ferr
+ }
+ } else {
+		// GetFile takes a reference on the file it returns, so take an
+		// equivalent reference here since we are reusing the passed-in file.
+ file.IncRef()
+ }
+
+ // We must be able to read at arbitrary offsets.
+ if !file.Flags().Pread {
+ file.DecRef()
+ ctx.Infof("%s cannot be read at an offset: %+v", file.MappedName(ctx), file.Flags())
+ return nil, nil, syserror.EACCES
+ }
+
+ // Grab reference for caller.
+ dirent.IncRef()
+ return dirent, file, nil
+}
+
+// checkFile performs file permission checks on binaries to be executed; it is
+// called from openPath and openFile.
+func checkFile(ctx context.Context, d *fs.Dirent, name string) error {
perms := fs.PermMask{
// TODO(gvisor.dev/issue/160): Linux requires only execute
// permission, not read. However, our backing filesystems may
@@ -80,7 +136,7 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
Execute: true,
}
if err := d.Inode.CheckPermission(ctx, perms); err != nil {
- return nil, nil, err
+ return err
}
// If they claim it's a directory, then make sure.
@@ -88,31 +144,17 @@ func openPath(ctx context.Context, mm *fs.MountNamespace, root, wd *fs.Dirent, m
// N.B. we reject directories below, but we must first reject
// non-directories passed as directories.
if len(name) > 0 && name[len(name)-1] == '/' && !fs.IsDir(d.Inode.StableAttr) {
- return nil, nil, syserror.ENOTDIR
+ return syserror.ENOTDIR
}
// No exec-ing directories, pipes, etc!
if !fs.IsRegular(d.Inode.StableAttr) {
ctx.Infof("%s is not regular: %v", name, d.Inode.StableAttr)
- return nil, nil, syserror.EACCES
+ return syserror.EACCES
}
- // Create a new file.
- file, err := d.Inode.GetFile(ctx, d, fs.FileFlags{Read: true})
- if err != nil {
- return nil, nil, err
- }
+ return nil
- // We must be able to read at arbitrary offsets.
- if !file.Flags().Pread {
- file.DecRef()
- ctx.Infof("%s cannot be read at an offset: %+v", name, file.Flags())
- return nil, nil, syserror.EACCES
- }
-
- // Grab a reference for the caller.
- d.IncRef()
- return d, file, nil
}
// allocStack allocates and maps a stack in to any available part of the address space.
@@ -131,16 +173,30 @@ const (
maxLoaderAttempts = 6
)
-// loadPath resolves filename to a binary and loads it.
+// loadBinary loads the binary pointed to by "file". If file is nil, the path
+// "filename" is resolved and loaded instead.
//
// It returns:
// * loadedELF, description of the loaded binary
// * arch.Context matching the binary arch
// * fs.Dirent of the binary file
// * Possibly updated argv
-func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, fs *cpuid.FeatureSet, filename string, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
+func loadBinary(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, remainingTraversals *uint, features *cpuid.FeatureSet, filename string, passedFile *fs.File, argv []string) (loadedELF, arch.Context, *fs.Dirent, []string, error) {
for i := 0; i < maxLoaderAttempts; i++ {
- d, f, err := openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+ var (
+ d *fs.Dirent
+ f *fs.File
+ err error
+ )
+ if passedFile == nil {
+ d, f, err = openPath(ctx, mounts, root, wd, remainingTraversals, filename)
+
+ } else {
+ d, f, err = openFile(ctx, passedFile, nil, "")
+		// Set to nil in case we loop on an interpreter script.
+ passedFile = nil
+ }
+
if err != nil {
ctx.Infof("Error opening %s: %v", filename, err)
return loadedELF{}, nil, nil, nil, err
@@ -165,7 +221,7 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
switch {
case bytes.Equal(hdr[:], []byte(elfMagic)):
- loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, fs, f)
+ loaded, ac, err := loadELF(ctx, m, mounts, root, wd, remainingTraversals, features, f)
if err != nil {
ctx.Infof("Error loading ELF: %v", err)
return loadedELF{}, nil, nil, nil, err
@@ -190,7 +246,8 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
return loadedELF{}, nil, nil, nil, syserror.ELOOP
}
-// Load loads filename into a MemoryManager.
+// Load loads "file" into a MemoryManager. If file is nil, the path "filename"
+// is resolved and loaded instead.
//
// If Load returns ErrSwitchFile it should be called again with the returned
// path and argv.
@@ -198,9 +255,9 @@ func loadPath(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespac
// Preconditions:
// * The Task MemoryManager is empty.
// * Load is called on the Task goroutine.
-func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
+func Load(ctx context.Context, m *mm.MemoryManager, mounts *fs.MountNamespace, root, wd *fs.Dirent, maxTraversals *uint, fs *cpuid.FeatureSet, filename string, file *fs.File, argv, envv []string, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
// Load the binary itself.
- loaded, ac, d, argv, err := loadPath(ctx, m, mounts, root, wd, maxTraversals, fs, filename, argv)
+ loaded, ac, d, argv, err := loadBinary(ctx, m, mounts, root, wd, maxTraversals, fs, filename, file, argv)
if err != nil {
return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load %s: %v", filename, err), syserr.FromError(err).ToLinux())
}
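
Load's new *fs.File parameter lets a caller that already holds an open executable skip path resolution entirely. A minimal sketch of the two call shapes, assuming the usual kernel-side arguments (ctx, m, mounts, root, wd, maxTraversals, features, argv, envv, extraAuxv, vdso) are in scope:

    // Path-based exec: no open file, so loadBinary resolves "filename" itself.
    osys, ac, name, err := loader.Load(ctx, m, mounts, root, wd, maxTraversals,
        features, filename, nil, argv, envv, extraAuxv, vdso)

    // File-based exec: the caller already holds a readable, pread-capable
    // *fs.File; "filename" is then only used for logging and error messages.
    osys, ac, name, err = loader.Load(ctx, m, mounts, root, wd, maxTraversals,
        features, filename, file, argv, envv, extraAuxv, vdso)
    _, _, _, _ = osys, ac, name, err
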
diff --git a/pkg/sentry/mm/procfs.go b/pkg/sentry/mm/procfs.go
index a8819aa84..8c2246bb4 100644
--- a/pkg/sentry/mm/procfs.go
+++ b/pkg/sentry/mm/procfs.go
@@ -58,6 +58,34 @@ func (mm *MemoryManager) NeedsUpdate(generation int64) bool {
return true
}
+// ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to
+// implement /proc/[pid]/maps.
+func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+ // "panic: autosave error: type usermem.Addr is not registered".
+ mm.appendVMAMapsEntryLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. Everything about a
+ // vsyscall region is static, so just hard code the maps entry since we
+ // don't have a real vma backing it. The vsyscall region is at the end of
+ // the virtual address space so nothing should be mapped after it (if
+ // something is really mapped in the tiny ~10 MiB segment afterwards, we'll
+ // get the sorting on the maps file wrong at worst; but that's not possible
+ // on any current platform).
+ //
+	// Artificially adjust the seqfile handle so we only output the vsyscall entry once.
+ if start != vsyscallEnd {
+ // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+ buf.WriteString(vsyscallMapsEntry)
+ }
+}
+
// ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to
// implement /proc/[pid]/maps.
func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
@@ -151,6 +179,27 @@ func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaI
b.WriteString("\n")
}
+// ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to
+// implement /proc/[pid]/smaps.
+func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) {
+ mm.mappingMu.RLock()
+ defer mm.mappingMu.RUnlock()
+ var start usermem.Addr
+
+ for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() {
+ // FIXME(b/30793614): If we use a usermem.Addr for the handle, we get
+ // "panic: autosave error: type usermem.Addr is not registered".
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf)
+ }
+
+ // We always emulate vsyscall, so advertise it here. See
+ // ReadMapsSeqFileData for additional commentary.
+ if start != vsyscallEnd {
+ // FIXME(b/30793614): Can't get a pointer to constant vsyscallEnd.
+ buf.WriteString(vsyscallSmapsEntry)
+ }
+}
+
// ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to
// implement /proc/[pid]/smaps.
func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) {
@@ -190,7 +239,12 @@ func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfil
// Preconditions: mm.mappingMu must be locked.
func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte {
var b bytes.Buffer
- mm.appendVMAMapsEntryLocked(ctx, vseg, &b)
+ mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b)
+ return b.Bytes()
+}
+
+func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) {
+ mm.appendVMAMapsEntryLocked(ctx, vseg, b)
vma := vseg.ValuePtr()
// We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of
@@ -211,40 +265,40 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat
}
mm.activeMu.RUnlock()
- fmt.Fprintf(&b, "Size: %8d kB\n", vseg.Range().Length()/1024)
- fmt.Fprintf(&b, "Rss: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024)
+ fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024)
// Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma
// is only mapped by that pma. This avoids having to query memmap.Mappables
// for reference count information on each page. As a corollary, all pages
// are accounted as "private" whether or not the vma is private; compare
// Linux's fs/proc/task_mmu.c:smaps_account().
- fmt.Fprintf(&b, "Pss: %8d kB\n", rss/1024)
- fmt.Fprintf(&b, "Shared_Clean: %8d kB\n", 0)
- fmt.Fprintf(&b, "Shared_Dirty: %8d kB\n", 0)
+ fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0)
// Pretend that all pages are dirty if the vma is writable, and clean otherwise.
clean := rss
if vma.effectivePerms.Write {
clean = 0
}
- fmt.Fprintf(&b, "Private_Clean: %8d kB\n", clean/1024)
- fmt.Fprintf(&b, "Private_Dirty: %8d kB\n", (rss-clean)/1024)
+ fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024)
+ fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024)
// Pretend that all pages are "referenced" (recently touched).
- fmt.Fprintf(&b, "Referenced: %8d kB\n", rss/1024)
- fmt.Fprintf(&b, "Anonymous: %8d kB\n", anon/1024)
+ fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024)
+ fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024)
// Hugepages (hugetlb and THP) are not implemented.
- fmt.Fprintf(&b, "AnonHugePages: %8d kB\n", 0)
- fmt.Fprintf(&b, "Shared_Hugetlb: %8d kB\n", 0)
- fmt.Fprintf(&b, "Private_Hugetlb: %7d kB\n", 0)
+ fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0)
+ fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0)
+ fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0)
// Swap is not implemented.
- fmt.Fprintf(&b, "Swap: %8d kB\n", 0)
- fmt.Fprintf(&b, "SwapPss: %8d kB\n", 0)
- fmt.Fprintf(&b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
- fmt.Fprintf(&b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024)
+ fmt.Fprintf(b, "Swap: %8d kB\n", 0)
+ fmt.Fprintf(b, "SwapPss: %8d kB\n", 0)
+ fmt.Fprintf(b, "KernelPageSize: %8d kB\n", usermem.PageSize/1024)
+ fmt.Fprintf(b, "MMUPageSize: %8d kB\n", usermem.PageSize/1024)
locked := rss
if vma.mlockMode == memmap.MLockNone {
locked = 0
}
- fmt.Fprintf(&b, "Locked: %8d kB\n", locked/1024)
+ fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024)
b.WriteString("VmFlags: ")
if vma.realPerms.Read {
@@ -284,6 +338,4 @@ func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterat
b.WriteString("ac ")
}
b.WriteString("\n")
-
- return b.Bytes()
}
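
The new *Into variants write directly into a caller-supplied bytes.Buffer instead of assembling seqfile segments. A minimal sketch of how a fsimpl/proc-style generator might use one; the mapsData type shown here is an assumption modeled on the caller named in the comment above:

    // Sketch only: a dynamic /proc/[pid]/maps generator.
    type mapsData struct {
        mm *mm.MemoryManager
    }

    func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
        // Appends one line per vma plus the synthetic vsyscall entry.
        d.mm.ReadMapsDataInto(ctx, buf)
        return nil
    }
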
diff --git a/pkg/sentry/pgalloc/pgalloc.go b/pkg/sentry/pgalloc/pgalloc.go
index 8bd3e885d..f7f7298c4 100644
--- a/pkg/sentry/pgalloc/pgalloc.go
+++ b/pkg/sentry/pgalloc/pgalloc.go
@@ -285,7 +285,10 @@ func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
switch opts.DelayedEviction {
case DelayedEvictionDefault:
opts.DelayedEviction = DelayedEvictionEnabled
- case DelayedEvictionDisabled, DelayedEvictionEnabled, DelayedEvictionManual:
+ case DelayedEvictionDisabled, DelayedEvictionManual:
+ opts.UseHostMemcgPressure = false
+ case DelayedEvictionEnabled:
+ // ok
default:
return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
}
@@ -777,6 +780,14 @@ func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
}
}
+// ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
+// evictable memory, such that it may be advantageous to cache data in
+// evictable memory. The value returned by ShouldCacheEvictable may change
+// between calls.
+func (f *MemoryFile) ShouldCacheEvictable() bool {
+ return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
+}
+
// UpdateUsage ensures that the memory usage statistics in
// usage.MemoryAccounting are up to date.
func (f *MemoryFile) UpdateUsage() error {
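
ShouldCacheEvictable gives users of evictable memory a cheap signal for whether keeping reclaimable caches is worthwhile. A minimal sketch of a hypothetical consumer (the reclaimableCache type and its DropAll method are illustrative, not part of this change):

    // Sketch only: drop a reclaimable cache when the MemoryFile is not
    // meaningfully delaying evictions, since caching then buys little.
    func releaseIfUnhelpful(mf *pgalloc.MemoryFile, c *reclaimableCache) {
        if !mf.ShouldCacheEvictable() {
            c.DropAll()
        }
    }
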
diff --git a/pkg/sentry/platform/ptrace/BUILD b/pkg/sentry/platform/ptrace/BUILD
index 1b6c54e96..ebcc8c098 100644
--- a/pkg/sentry/platform/ptrace/BUILD
+++ b/pkg/sentry/platform/ptrace/BUILD
@@ -7,13 +7,17 @@ go_library(
srcs = [
"filters.go",
"ptrace.go",
+ "ptrace_amd64.go",
+ "ptrace_arm64.go",
"ptrace_unsafe.go",
"stub_amd64.s",
+ "stub_arm64.s",
"stub_unsafe.go",
"subprocess.go",
"subprocess_amd64.go",
+ "subprocess_arm64.go",
"subprocess_linux.go",
- "subprocess_linux_amd64_unsafe.go",
+ "subprocess_linux_unsafe.go",
"subprocess_unsafe.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/platform/ptrace",
diff --git a/pkg/sentry/platform/ptrace/ptrace.go b/pkg/sentry/platform/ptrace/ptrace.go
index 6fd30ed25..7b120a15d 100644
--- a/pkg/sentry/platform/ptrace/ptrace.go
+++ b/pkg/sentry/platform/ptrace/ptrace.go
@@ -60,7 +60,7 @@ var (
// maximum user address. This is valid only after a call to stubInit.
//
// We attempt to link the stub here, and adjust downward as needed.
- stubStart uintptr = 0x7fffffff0000
+ stubStart uintptr = stubInitAddress
// stubEnd is the first byte past the end of the stub, as with
// stubStart this is valid only after a call to stubInit.
diff --git a/pkg/sentry/platform/ptrace/ptrace_amd64.go b/pkg/sentry/platform/ptrace/ptrace_amd64.go
new file mode 100644
index 000000000..db0212538
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_amd64.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
+func fpRegSet(useXsave bool) uintptr {
+ if useXsave {
+ return linux.NT_X86_XSTATE
+ }
+ return linux.NT_PRFPREG
+}
+
+func stackPointer(r *syscall.PtraceRegs) uintptr {
+ return uintptr(r.Rsp)
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_arm64.go b/pkg/sentry/platform/ptrace/ptrace_arm64.go
new file mode 100644
index 000000000..4db28c534
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/ptrace_arm64.go
@@ -0,0 +1,30 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ptrace
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+// fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
+func fpRegSet(_ bool) uintptr {
+ return linux.NT_PRFPREG
+}
+
+func stackPointer(r *syscall.PtraceRegs) uintptr {
+ return uintptr(r.Sp)
+}
diff --git a/pkg/sentry/platform/ptrace/ptrace_unsafe.go b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
index 2706039a5..47957bb3b 100644
--- a/pkg/sentry/platform/ptrace/ptrace_unsafe.go
+++ b/pkg/sentry/platform/ptrace/ptrace_unsafe.go
@@ -18,37 +18,23 @@ import (
"syscall"
"unsafe"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
-// GETREGSET/SETREGSET register set types.
-//
-// See include/uapi/linux/elf.h.
-const (
- // _NT_PRFPREG is for x86 floating-point state without using xsave.
- _NT_PRFPREG = 0x2
-
- // _NT_X86_XSTATE is for x86 extended state using xsave.
- _NT_X86_XSTATE = 0x202
-)
-
-// fpRegSet returns the GETREGSET/SETREGSET register set type to be used.
-func fpRegSet(useXsave bool) uintptr {
- if useXsave {
- return _NT_X86_XSTATE
- }
- return _NT_PRFPREG
-}
-
-// getRegs sets the regular register set.
+// getRegs gets the general purpose register set.
func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
+ iovec := syscall.Iovec{
+ Base: (*byte)(unsafe.Pointer(regs)),
+ Len: uint64(unsafe.Sizeof(*regs)),
+ }
_, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
- syscall.PTRACE_GETREGS,
+ syscall.PTRACE_GETREGSET,
uintptr(t.tid),
- 0,
- uintptr(unsafe.Pointer(regs)),
+ linux.NT_PRSTATUS,
+ uintptr(unsafe.Pointer(&iovec)),
0, 0)
if errno != 0 {
return errno
@@ -56,14 +42,18 @@ func (t *thread) getRegs(regs *syscall.PtraceRegs) error {
return nil
}
-// setRegs sets the regular register set.
+// setRegs sets the general purpose register set.
func (t *thread) setRegs(regs *syscall.PtraceRegs) error {
+ iovec := syscall.Iovec{
+ Base: (*byte)(unsafe.Pointer(regs)),
+ Len: uint64(unsafe.Sizeof(*regs)),
+ }
_, _, errno := syscall.RawSyscall6(
syscall.SYS_PTRACE,
- syscall.PTRACE_SETREGS,
+ syscall.PTRACE_SETREGSET,
uintptr(t.tid),
- 0,
- uintptr(unsafe.Pointer(regs)),
+ linux.NT_PRSTATUS,
+ uintptr(unsafe.Pointer(&iovec)),
0, 0)
if errno != 0 {
return errno
@@ -131,7 +121,7 @@ func (t *thread) getSignalInfo(si *arch.SignalInfo) error {
//
// Precondition: the OS thread must be locked and own t.
func (t *thread) clone() (*thread, error) {
- r, ok := usermem.Addr(t.initRegs.Rsp).RoundUp()
+ r, ok := usermem.Addr(stackPointer(&t.initRegs)).RoundUp()
if !ok {
return nil, syscall.EINVAL
}
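
Switching to PTRACE_GETREGSET/PTRACE_SETREGSET with an NT_PRSTATUS iovec is what makes getRegs and setRegs arch-neutral, since arm64 has no PTRACE_GETREGS. Call sites are unchanged; a minimal in-package sketch, assuming a *thread t owned by the calling OS thread:

    var regs syscall.PtraceRegs
    if err := t.getRegs(&regs); err != nil {
        panic(fmt.Sprintf("ptrace get regs failed: %v", err))
    }
    // stackPointer hides the Rsp (amd64) vs Sp (arm64) field difference.
    sp := stackPointer(&regs)
    _ = sp
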
diff --git a/pkg/sentry/platform/ptrace/stub_arm64.s b/pkg/sentry/platform/ptrace/stub_arm64.s
new file mode 100644
index 000000000..2c5e4d5cb
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/stub_arm64.s
@@ -0,0 +1,106 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+#define SYS_GETPID 172
+#define SYS_EXIT 93
+#define SYS_KILL 129
+#define SYS_GETPPID 173
+#define SYS_PRCTL 167
+
+#define SIGKILL 9
+#define SIGSTOP 19
+
+#define PR_SET_PDEATHSIG 1
+
+// stub bootstraps the child and sends itself SIGSTOP to wait for attach.
+//
+// R7 contains the expected PPID.
+//
+// This should not be used outside the context of a new ptrace child (as the
+// function is otherwise a bunch of nonsense).
+TEXT ·stub(SB),NOSPLIT,$0
+begin:
+ // N.B. This loop only executes in the context of a single-threaded
+ // fork child.
+
+ MOVD $SYS_PRCTL, R8
+ MOVD $PR_SET_PDEATHSIG, R0
+ MOVD $SIGKILL, R1
+ SVC
+
+ CMN $4095, R0
+ BCS error
+
+	// If the parent already died before we called PR_SET_PDEATHSIG then
+ // we'll have an unexpected PPID.
+ MOVD $SYS_GETPPID, R8
+ SVC
+
+ CMP R0, R7
+ BNE parent_dead
+
+ MOVD $SYS_GETPID, R8
+ SVC
+
+ CMP $0x0, R0
+ BLT error
+
+ // SIGSTOP to wait for attach.
+ //
+	// The SVC instruction will be used for future syscall injection by
+ // thread.syscall.
+ MOVD $SYS_KILL, R8
+ MOVD $SIGSTOP, R1
+ SVC
+ // The tracer may "detach" and/or allow code execution here in three cases:
+ //
+ // 1. New (traced) stub threads are explicitly detached by the
+ // goroutine in newSubprocess. However, they are detached while in
+ // group-stop, so they do not execute code here.
+ //
+ // 2. If a tracer thread exits, it implicitly detaches from the stub,
+ // potentially allowing code execution here. However, the Go runtime
+ // never exits individual threads, so this case never occurs.
+ //
+ // 3. subprocess.createStub clones a new stub process that is untraced,
+ // thus executing this code. We setup the PDEATHSIG before SIGSTOPing
+ // ourselves for attach by the tracer.
+ //
+ // R7 has been updated with the expected PPID.
+ B begin
+
+error:
+ // Exit with -errno.
+ NEG R0, R0
+ MOVD $SYS_EXIT, R8
+ SVC
+ HLT
+
+parent_dead:
+ MOVD $SYS_EXIT, R8
+ MOVD $1, R0
+ SVC
+ HLT
+
+// stubCall calls the stub function at the given address with the given PPID.
+//
+// This is a distinct function because stub, above, may be mapped at any
+// arbitrary location, and stub has a specific binary API (see above).
+TEXT ·stubCall(SB),NOSPLIT,$0-16
+ MOVD addr+0(FP), R0
+ MOVD pid+8(FP), R7
+ B (R0)
diff --git a/pkg/sentry/platform/ptrace/subprocess.go b/pkg/sentry/platform/ptrace/subprocess.go
index 15e84735e..6bf7cd097 100644
--- a/pkg/sentry/platform/ptrace/subprocess.go
+++ b/pkg/sentry/platform/ptrace/subprocess.go
@@ -28,6 +28,16 @@ import (
"gvisor.dev/gvisor/pkg/sentry/usermem"
)
+// Linux kernel errnos which "should never be seen by user programs", but will
+// be revealed to ptrace syscall exit tracing.
+//
+// These constants are only used in subprocess.go.
+const (
+ ERESTARTSYS = syscall.Errno(512)
+ ERESTARTNOINTR = syscall.Errno(513)
+ ERESTARTNOHAND = syscall.Errno(514)
+)
+
// globalPool exists to solve two distinct problems:
//
// 1) Subprocesses can't always be killed properly (see Release).
@@ -282,7 +292,7 @@ func (t *thread) grabInitRegs() {
if err := t.getRegs(&t.initRegs); err != nil {
panic(fmt.Sprintf("ptrace get regs failed: %v", err))
}
- t.initRegs.Rip -= initRegsRipAdjustment
+ t.adjustInitRegsRip()
}
// detach detaches from the thread.
@@ -344,6 +354,9 @@ func (t *thread) wait(outcome waitOutcome) syscall.Signal {
continue // Spurious stop.
}
if stopSig == syscall.SIGTRAP {
+ if status.TrapCause() == syscall.PTRACE_EVENT_EXIT {
+ t.dumpAndPanic("wait failed: the process exited")
+ }
// Re-encode the trap cause the way it's expected.
return stopSig | syscall.Signal(status.TrapCause()<<8)
}
diff --git a/pkg/sentry/platform/ptrace/subprocess_amd64.go b/pkg/sentry/platform/ptrace/subprocess_amd64.go
index a70512913..4649a94a7 100644
--- a/pkg/sentry/platform/ptrace/subprocess_amd64.go
+++ b/pkg/sentry/platform/ptrace/subprocess_amd64.go
@@ -28,20 +28,13 @@ const (
// maximumUserAddress is the largest possible user address.
maximumUserAddress = 0x7ffffffff000
+ // stubInitAddress is the initial attempt link address for the stub.
+ stubInitAddress = 0x7fffffff0000
+
// initRegsRipAdjustment is the size of the syscall instruction.
initRegsRipAdjustment = 2
)
-// Linux kernel errnos which "should never be seen by user programs", but will
-// be revealed to ptrace syscall exit tracing.
-//
-// These constants are used in subprocess.go.
-const (
- ERESTARTSYS = syscall.Errno(512)
- ERESTARTNOINTR = syscall.Errno(513)
- ERESTARTNOHAND = syscall.Errno(514)
-)
-
// resetSysemuRegs sets up emulation registers.
//
// This should be called prior to calling sysemu.
@@ -139,3 +132,14 @@ func dumpRegs(regs *syscall.PtraceRegs) string {
return m.String()
}
+
+// adjustInitRegsRip adjusts the current register RIP value to be just before
+// the system call instruction execution.
+func (t *thread) adjustInitRegsRip() {
+ t.initRegs.Rip -= initRegsRipAdjustment
+}
+
+// initChildProcessPPID passes the expected PPID to the child via R15 when
+// creating the stub process.
+func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+ initregs.R15 = uint64(ppid)
+}
diff --git a/pkg/sentry/platform/ptrace/subprocess_arm64.go b/pkg/sentry/platform/ptrace/subprocess_arm64.go
new file mode 100644
index 000000000..bec884ba5
--- /dev/null
+++ b/pkg/sentry/platform/ptrace/subprocess_arm64.go
@@ -0,0 +1,126 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build arm64
+
+package ptrace
+
+import (
+	"fmt"
+	"strings"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+)
+
+const (
+ // maximumUserAddress is the largest possible user address.
+ maximumUserAddress = 0xfffffffff000
+
+ // stubInitAddress is the initial attempt link address for the stub.
+	// Only 48-bit virtual addresses are currently supported.
+ stubInitAddress = 0xffffffff0000
+
+ // initRegsRipAdjustment is the size of the svc instruction.
+ initRegsRipAdjustment = 4
+)
+
+// resetSysemuRegs sets up emulation registers.
+//
+// This should be called prior to calling sysemu.
+func (s *subprocess) resetSysemuRegs(regs *syscall.PtraceRegs) {
+}
+
+// createSyscallRegs sets up syscall registers.
+//
+// This should be called to generate registers for a system call.
+func createSyscallRegs(initRegs *syscall.PtraceRegs, sysno uintptr, args ...arch.SyscallArgument) syscall.PtraceRegs {
+ // Copy initial registers (Pc, Sp, etc.).
+ regs := *initRegs
+
+ // Set our syscall number.
+ // r8 for the syscall number.
+ // r0-r6 is used to store the parameters.
+ regs.Regs[8] = uint64(sysno)
+ if len(args) >= 1 {
+ regs.Regs[0] = args[0].Uint64()
+ }
+ if len(args) >= 2 {
+ regs.Regs[1] = args[1].Uint64()
+ }
+ if len(args) >= 3 {
+ regs.Regs[2] = args[2].Uint64()
+ }
+ if len(args) >= 4 {
+ regs.Regs[3] = args[3].Uint64()
+ }
+ if len(args) >= 5 {
+ regs.Regs[4] = args[4].Uint64()
+ }
+ if len(args) >= 6 {
+ regs.Regs[5] = args[5].Uint64()
+ }
+
+ return regs
+}
+
+// isSingleStepping determines if the registers indicate single-stepping.
+func isSingleStepping(regs *syscall.PtraceRegs) bool {
+ // Refer to the ARM SDM D2.12.3: software step state machine
+ // return (regs.Pstate.SS == 1) && (MDSCR_EL1.SS == 1).
+ //
+ // Since the host Linux kernel will set MDSCR_EL1.SS on our behalf
+ // when we call a single-step ptrace command, we only need to check
+ // the Pstate.SS bit here.
+ return (regs.Pstate & arch.ARMTrapFlag) != 0
+}
+
+// updateSyscallRegs updates registers after finishing sysemu.
+func updateSyscallRegs(regs *syscall.PtraceRegs) {
+ // No special work is necessary.
+ return
+}
+
+// syscallReturnValue extracts a sensible return from registers.
+func syscallReturnValue(regs *syscall.PtraceRegs) (uintptr, error) {
+ rval := int64(regs.Regs[0])
+ if rval < 0 {
+ return 0, syscall.Errno(-rval)
+ }
+ return uintptr(rval), nil
+}
+
+func dumpRegs(regs *syscall.PtraceRegs) string {
+ var m strings.Builder
+
+ fmt.Fprintf(&m, "Registers:\n")
+
+ for i := 0; i < 31; i++ {
+ fmt.Fprintf(&m, "\tRegs[%d]\t = %016x\n", i, regs.Regs[i])
+ }
+ fmt.Fprintf(&m, "\tSp\t = %016x\n", regs.Sp)
+ fmt.Fprintf(&m, "\tPc\t = %016x\n", regs.Pc)
+ fmt.Fprintf(&m, "\tPstate\t = %016x\n", regs.Pstate)
+
+ return m.String()
+}
+
+// adjustInitRegsRip adjusts the current register PC value (the arm64
+// equivalent of RIP) to be just before the system call instruction execution.
+func (t *thread) adjustInitRegsRip() {
+ t.initRegs.Pc -= initRegsRipAdjustment
+}
+
+// initChildProcessPPID passes the expected PPID to the child via X7 (Regs[7])
+// when creating the stub process.
+func initChildProcessPPID(initregs *syscall.PtraceRegs, ppid int32) {
+ initregs.Regs[7] = uint64(ppid)
+}
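
createSyscallRegs mirrors its amd64 counterpart but places the syscall number in x8 and arguments in x0-x5. A minimal in-package sketch of preparing an injected syscall for a stopped stub thread; the chosen syscall, addr and length are illustrative and assumed in scope, and the rest of the injection path is unchanged and not shown:

    // Sketch only: build registers for an injected munmap(2) call.
    regs := createSyscallRegs(&t.initRegs, syscall.SYS_MUNMAP,
        arch.SyscallArgument{Value: addr},   // x0
        arch.SyscallArgument{Value: length}, // x1
    )
    if err := t.setRegs(&regs); err != nil {
        panic(fmt.Sprintf("ptrace set regs failed: %v", err))
    }
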
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux.go b/pkg/sentry/platform/ptrace/subprocess_linux.go
index 87ded0bbd..f09b0b3d0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux.go
@@ -284,7 +284,7 @@ func (s *subprocess) createStub() (*thread, error) {
// Pass the expected PPID to the child via R15.
regs := t.initRegs
- regs.R15 = uint64(t.tgid)
+ initChildProcessPPID(&regs, t.tgid)
// Call fork in a subprocess.
//
diff --git a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
index e977992f9..de6783fb0 100644
--- a/pkg/sentry/platform/ptrace/subprocess_linux_amd64_unsafe.go
+++ b/pkg/sentry/platform/ptrace/subprocess_linux_unsafe.go
@@ -12,7 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// +build amd64 linux
+// +build linux
+// +build amd64 arm64
package ptrace
diff --git a/pkg/sentry/safemem/io.go b/pkg/sentry/safemem/io.go
index 5c3d73eb7..f039a5c34 100644
--- a/pkg/sentry/safemem/io.go
+++ b/pkg/sentry/safemem/io.go
@@ -157,7 +157,8 @@ func (w ToIOWriter) Write(src []byte) (int, error) {
}
// FromIOReader implements Reader for an io.Reader by repeatedly invoking
-// io.Reader.Read until it returns an error or partial read.
+// io.Reader.Read until it returns an error or partial read. This is not
+// thread-safe.
//
// FromIOReader will return a successful partial read iff Reader.Read does so.
type FromIOReader struct {
@@ -206,6 +207,58 @@ func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) {
return wbn, buf, rerr
}
+// FromIOReaderAt implements Reader for an io.ReaderAt. It does not repeatedly
+// invoke io.ReaderAt.ReadAt because ReadAt is stricter than Read: a partial
+// read always indicates an error. This is not thread-safe.
+type FromIOReaderAt struct {
+ ReaderAt io.ReaderAt
+ Offset int64
+}
+
+// ReadToBlocks implements Reader.ReadToBlocks.
+func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) {
+ var buf []byte
+ var done uint64
+ for !dsts.IsEmpty() {
+ dst := dsts.Head()
+ var n int
+ var err error
+ n, buf, err = r.readToBlock(dst, buf)
+ done += uint64(n)
+ if n != dst.Len() {
+ return done, err
+ }
+ dsts = dsts.Tail()
+ if err != nil {
+ if dsts.IsEmpty() && err == io.EOF {
+ return done, nil
+ }
+ return done, err
+ }
+ }
+ return done, nil
+}
+
+func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) {
+	// io.ReaderAt isn't safecopy-aware, so we have to buffer Blocks that require
+ // safecopy.
+ if !dst.NeedSafecopy() {
+ n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset)
+ r.Offset += int64(n)
+ return n, buf, err
+ }
+ if len(buf) < dst.Len() {
+ buf = make([]byte, dst.Len())
+ }
+ rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset)
+ r.Offset += int64(rn)
+ wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn]))
+ if wberr != nil {
+ return wbn, buf, wberr
+ }
+ return wbn, buf, rerr
+}
+
// FromIOWriter implements Writer for an io.Writer by repeatedly invoking
// io.Writer.Write until it returns an error or partial write.
//
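
FromIOReaderAt is the offset-based sibling of FromIOReader, intended for backing stores that expose io.ReaderAt, such as a filesystem image read at a fixed position. A minimal usage sketch, with the destination BlockSeq dsts and the io.ReaderAt src assumed in scope:

    // Sketch only: read into dsts starting at byte offset 4096 of src.
    // An io.EOF on the final block is folded into a successful short read.
    r := safemem.FromIOReaderAt{ReaderAt: src, Offset: 4096}
    n, err := r.ReadToBlocks(dsts)
    _, _ = n, err
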
diff --git a/pkg/sentry/socket/BUILD b/pkg/sentry/socket/BUILD
index 2b03ea87c..3300f9a6b 100644
--- a/pkg/sentry/socket/BUILD
+++ b/pkg/sentry/socket/BUILD
@@ -9,6 +9,7 @@ go_library(
visibility = ["//pkg/sentry:internal"],
deps = [
"//pkg/abi/linux",
+ "//pkg/binary",
"//pkg/sentry/context",
"//pkg/sentry/device",
"//pkg/sentry/fs",
diff --git a/pkg/sentry/socket/epsocket/BUILD b/pkg/sentry/socket/epsocket/BUILD
index 1f014f399..e927821e1 100644
--- a/pkg/sentry/socket/epsocket/BUILD
+++ b/pkg/sentry/socket/epsocket/BUILD
@@ -31,6 +31,7 @@ go_library(
"//pkg/sentry/kernel/time",
"//pkg/sentry/safemem",
"//pkg/sentry/socket",
+ "//pkg/sentry/socket/netfilter",
"//pkg/sentry/unimpl",
"//pkg/sentry/usermem",
"//pkg/syserr",
@@ -38,6 +39,7 @@ go_library(
"//pkg/tcpip",
"//pkg/tcpip/buffer",
"//pkg/tcpip/header",
+ "//pkg/tcpip/iptables",
"//pkg/tcpip/network/ipv4",
"//pkg/tcpip/network/ipv6",
"//pkg/tcpip/stack",
diff --git a/pkg/sentry/socket/epsocket/epsocket.go b/pkg/sentry/socket/epsocket/epsocket.go
index e57aed927..635042263 100644
--- a/pkg/sentry/socket/epsocket/epsocket.go
+++ b/pkg/sentry/socket/epsocket/epsocket.go
@@ -43,6 +43,7 @@ import (
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/safemem"
"gvisor.dev/gvisor/pkg/sentry/socket"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/sentry/unimpl"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
@@ -290,18 +291,22 @@ func bytesToIPAddress(addr []byte) tcpip.Address {
return tcpip.Address(addr)
}
-// GetAddress reads an sockaddr struct from the given address and converts it
-// to the FullAddress format. It supports AF_UNIX, AF_INET and AF_INET6
-// addresses.
-func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syserr.Error) {
+// AddressAndFamily reads a sockaddr struct from the given address and
+// converts it to the FullAddress format. It supports AF_UNIX, AF_INET and
+// AF_INET6 addresses.
+//
+// strict indicates whether addresses with the AF_UNSPEC family are accepted or not.
+//
+// AddressAndFamily returns an address and its family.
+func AddressAndFamily(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, uint16, *syserr.Error) {
// Make sure we have at least 2 bytes for the address family.
if len(addr) < 2 {
- return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+ return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument
}
family := usermem.ByteOrder.Uint16(addr)
if family != uint16(sfamily) && (!strict && family != linux.AF_UNSPEC) {
- return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
+ return tcpip.FullAddress{}, family, syserr.ErrAddressFamilyNotSupported
}
// Get the rest of the fields based on the address family.
@@ -309,7 +314,7 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
case linux.AF_UNIX:
path := addr[2:]
if len(path) > linux.UnixPathMax {
- return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+ return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
// Drop the terminating NUL (if one exists) and everything after
// it for filesystem (non-abstract) addresses.
@@ -320,12 +325,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
}
return tcpip.FullAddress{
Addr: tcpip.Address(path),
- }, nil
+ }, family, nil
case linux.AF_INET:
var a linux.SockAddrInet
if len(addr) < sockAddrInetSize {
- return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+ return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInetSize], usermem.ByteOrder, &a)
@@ -333,12 +338,12 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
Addr: bytesToIPAddress(a.Addr[:]),
Port: ntohs(a.Port),
}
- return out, nil
+ return out, family, nil
case linux.AF_INET6:
var a linux.SockAddrInet6
if len(addr) < sockAddrInet6Size {
- return tcpip.FullAddress{}, syserr.ErrInvalidArgument
+ return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument
}
binary.Unmarshal(addr[:sockAddrInet6Size], usermem.ByteOrder, &a)
@@ -349,13 +354,13 @@ func GetAddress(sfamily int, addr []byte, strict bool) (tcpip.FullAddress, *syse
if isLinkLocal(out.Addr) {
out.NIC = tcpip.NICID(a.Scope_id)
}
- return out, nil
+ return out, family, nil
case linux.AF_UNSPEC:
- return tcpip.FullAddress{}, nil
+ return tcpip.FullAddress{}, family, nil
default:
- return tcpip.FullAddress{}, syserr.ErrAddressFamilyNotSupported
+ return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported
}
}
@@ -428,6 +433,11 @@ func (i *ioSequencePayload) Size() int {
return int(i.src.NumBytes())
}
+// DropFirst drops the first n bytes from underlying src.
+func (i *ioSequencePayload) DropFirst(n int) {
+ i.src = i.src.DropFirst(int(n))
+}
+
// Write implements fs.FileOperations.Write.
func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
f := &ioSequencePayload{ctx: ctx, src: src}
@@ -476,11 +486,18 @@ func (s *SocketOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
// Connect implements the linux syscall connect(2) for sockets backed by
// tpcip.Endpoint.
func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
- addr, err := GetAddress(s.family, sockaddr, false /* strict */)
+ addr, family, err := AddressAndFamily(s.family, sockaddr, false /* strict */)
if err != nil {
return err
}
+ if family == linux.AF_UNSPEC {
+ err := s.Endpoint.Disconnect()
+ if err == tcpip.ErrNotSupported {
+ return syserr.ErrAddressFamilyNotSupported
+ }
+ return syserr.TranslateNetstackError(err)
+ }
// Always return right away in the non-blocking case.
if !blocking {
return syserr.TranslateNetstackError(s.Endpoint.Connect(addr))
@@ -509,7 +526,7 @@ func (s *SocketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
// Bind implements the linux syscall bind(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
- addr, err := GetAddress(s.family, sockaddr, true /* strict */)
+ addr, _, err := AddressAndFamily(s.family, sockaddr, true /* strict */)
if err != nil {
return err
}
@@ -547,7 +564,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (tcpip.Endpoint, *wait
// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, wq, terr := s.Endpoint.Accept()
if terr != nil {
@@ -574,7 +591,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
ns.SetFlags(flags.Settable())
}
- var addr interface{}
+ var addr linux.SockAddr
var addrLen uint32
if peerRequested {
// Get address of the peer and write it to peer slice.
@@ -624,7 +641,7 @@ func (s *SocketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
// implemented specifically for epsocket.SocketOperations rather than
// commonEndpoint. commonEndpoint should be extended to support socket
@@ -655,6 +672,33 @@ func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (
return val, nil
}
+ if s.skType == linux.SOCK_RAW && level == linux.IPPROTO_IP {
+ switch name {
+ case linux.IPT_SO_GET_INFO:
+ if outLen < linux.SizeOfIPTGetinfo {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ info, err := netfilter.GetInfo(t, s.Endpoint, outPtr)
+ if err != nil {
+ return nil, err
+ }
+ return info, nil
+
+ case linux.IPT_SO_GET_ENTRIES:
+ if outLen < linux.SizeOfIPTGetEntries {
+ return nil, syserr.ErrInvalidArgument
+ }
+
+ entries, err := netfilter.GetEntries(t, s.Endpoint, outPtr, outLen)
+ if err != nil {
+ return nil, err
+ }
+ return entries, nil
+
+ }
+ }
+
return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outLen)
}
@@ -1028,7 +1072,7 @@ func getSockOptIP(t *kernel.Task, ep commonEndpoint, name, outLen int) (interfac
a, _ := ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr})
- return a.(linux.SockAddrInet).Addr, nil
+ return a.(*linux.SockAddrInet).Addr, nil
case linux.IP_MULTICAST_LOOP:
if outLen < sizeOfInt32 {
@@ -1658,7 +1702,7 @@ func isLinkLocal(addr tcpip.Address) bool {
}
// ConvertAddress converts the given address to a native format.
-func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
+func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) {
switch family {
case linux.AF_UNIX:
var out linux.SockAddrUnix
@@ -1674,15 +1718,15 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
// address length is the max. Abstract and empty paths always return
// the full exact length.
if l == 0 || out.Path[0] == 0 || l == len(out.Path) {
- return out, uint32(2 + l)
+ return &out, uint32(2 + l)
}
- return out, uint32(3 + l)
+ return &out, uint32(3 + l)
case linux.AF_INET:
var out linux.SockAddrInet
copy(out.Addr[:], addr.Addr)
out.Family = linux.AF_INET
out.Port = htons(addr.Port)
- return out, uint32(binary.Size(out))
+ return &out, uint32(binary.Size(out))
case linux.AF_INET6:
var out linux.SockAddrInet6
if len(addr.Addr) == 4 {
@@ -1698,7 +1742,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
if isLinkLocal(addr.Addr) {
out.Scope_id = uint32(addr.NIC)
}
- return out, uint32(binary.Size(out))
+ return &out, uint32(binary.Size(out))
default:
return nil, 0
}
@@ -1706,7 +1750,7 @@ func ConvertAddress(family int, addr tcpip.FullAddress) (interface{}, uint32) {
// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.Endpoint.GetLocalAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -1718,7 +1762,7 @@ func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy
// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.Endpoint.GetRemoteAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -1791,7 +1835,7 @@ func (s *SocketOperations) fillCmsgInq(cmsg *socket.ControlMessages) {
// nonBlockingRead issues a non-blocking read.
//
// TODO(b/78348848): Support timestamps for stream sockets.
-func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
isPacket := s.isPacketBased()
// Fast path for regular reads from stream (e.g., TCP) endpoints. Note
@@ -1839,7 +1883,7 @@ func (s *SocketOperations) nonBlockingRead(ctx context.Context, dst usermem.IOSe
if err == nil {
s.updateTimestamp()
}
- var addr interface{}
+ var addr linux.SockAddr
var addrLen uint32
if isPacket && senderRequested {
addr, addrLen = ConvertAddress(s.family, s.sender)
@@ -1914,7 +1958,7 @@ func (s *SocketOperations) updateTimestamp() {
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// tcpip.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
trunc := flags&linux.MSG_TRUNC != 0
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -1990,7 +2034,7 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
var addr *tcpip.FullAddress
if len(to) > 0 {
- addrBuf, err := GetAddress(s.family, to, true /* strict */)
+ addrBuf, _, err := AddressAndFamily(s.family, to, true /* strict */)
if err != nil {
return 0, err
}
@@ -1998,28 +2042,22 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
addr = &addrBuf
}
- v := buffer.NewView(int(src.NumBytes()))
-
- // Copy all the data into the buffer.
- if _, err := src.CopyIn(t, v); err != nil {
- return 0, syserr.FromError(err)
- }
-
opts := tcpip.WriteOptions{
To: addr,
More: flags&linux.MSG_MORE != 0,
EndOfRecord: flags&linux.MSG_EOR != 0,
}
- n, resCh, err := s.Endpoint.Write(tcpip.SlicePayload(v), opts)
+ v := &ioSequencePayload{t, src}
+ n, resCh, err := s.Endpoint.Write(v, opts)
if resCh != nil {
if err := t.Block(resCh); err != nil {
return 0, syserr.FromError(err)
}
- n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts)
+ n, _, err = s.Endpoint.Write(v, opts)
}
dontWait := flags&linux.MSG_DONTWAIT != 0
- if err == nil && (n >= uintptr(len(v)) || dontWait) {
+ if err == nil && (n >= int64(v.Size()) || dontWait) {
// Complete write.
return int(n), nil
}
@@ -2033,18 +2071,18 @@ func (s *SocketOperations) SendMsg(t *kernel.Task, src usermem.IOSequence, to []
s.EventRegister(&e, waiter.EventOut)
defer s.EventUnregister(&e)
- v.TrimFront(int(n))
+ v.DropFirst(int(n))
total := n
for {
- n, _, err = s.Endpoint.Write(tcpip.SlicePayload(v), opts)
- v.TrimFront(int(n))
+ n, _, err = s.Endpoint.Write(v, opts)
+ v.DropFirst(int(n))
total += n
if err != nil && err != tcpip.ErrWouldBlock && total == 0 {
return 0, syserr.TranslateNetstackError(err)
}
- if err == nil && len(v) == 0 || err != nil && err != tcpip.ErrWouldBlock {
+ if err == nil && v.Size() == 0 || err != nil && err != tcpip.ErrWouldBlock {
return int(total), nil
}
@@ -2252,19 +2290,19 @@ func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFRe
case syscall.SIOCGIFMAP:
// Gets the hardware parameters of the device.
- // TODO(b/71872867): Implement.
+ // TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFTXQLEN:
// Gets the transmit queue length of the device.
- // TODO(b/71872867): Implement.
+ // TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFDSTADDR:
// Gets the destination address of a point-to-point device.
- // TODO(b/71872867): Implement.
+ // TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFBRDADDR:
// Gets the broadcast address of a device.
- // TODO(b/71872867): Implement.
+ // TODO(gvisor.dev/issue/505): Implement.
case syscall.SIOCGIFNETMASK:
// Gets the network mask of a device.
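
Two exported epsocket signatures change above: GetAddress becomes AddressAndFamily, which additionally returns the raw address family, and ConvertAddress now returns linux.SockAddr pointers instead of bare structs. A minimal sketch of updated call sites, with sockaddrBytes assumed in scope:

    // Parsing: the extra family return lets callers detect AF_UNSPEC, which
    // connect(2) uses to dissolve a datagram socket's association.
    addr, family, err := epsocket.AddressAndFamily(linux.AF_INET, sockaddrBytes, false /* strict */)
    if err != nil {
        // handle *syserr.Error
    }
    _ = family

    // Formatting: the result is now a pointer implementing linux.SockAddr.
    sa, saLen := epsocket.ConvertAddress(linux.AF_INET, addr)
    inet := sa.(*linux.SockAddrInet)
    _, _ = inet, saLen
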
diff --git a/pkg/sentry/socket/epsocket/stack.go b/pkg/sentry/socket/epsocket/stack.go
index 8fe489c0e..7cf7ff735 100644
--- a/pkg/sentry/socket/epsocket/stack.go
+++ b/pkg/sentry/socket/epsocket/stack.go
@@ -18,7 +18,10 @@ import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/inet"
+ "gvisor.dev/gvisor/pkg/sentry/socket/netfilter"
"gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/iptables"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
@@ -143,3 +146,57 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
func (s *Stack) Statistics(stat interface{}, arg string) error {
return syserr.ErrEndpointOperation.ToError()
}
+
+// RouteTable implements inet.Stack.RouteTable.
+func (s *Stack) RouteTable() []inet.Route {
+ var routeTable []inet.Route
+
+ for _, rt := range s.Stack.GetRouteTable() {
+ var family uint8
+ switch len(rt.Destination.ID()) {
+ case header.IPv4AddressSize:
+ family = linux.AF_INET
+ case header.IPv6AddressSize:
+ family = linux.AF_INET6
+ default:
+ log.Warningf("Unknown network protocol in route %+v", rt)
+ continue
+ }
+
+ routeTable = append(routeTable, inet.Route{
+ Family: family,
+ DstLen: uint8(rt.Destination.Prefix()), // The CIDR prefix for the destination.
+
+ // Always return unspecified protocol since we have no notion of
+ // protocol for routes.
+ Protocol: linux.RTPROT_UNSPEC,
+ // Set statically to LINK scope for now.
+ //
+ // TODO(gvisor.dev/issue/595): Set scope for routes.
+ Scope: linux.RT_SCOPE_LINK,
+ Type: linux.RTN_UNICAST,
+
+ DstAddr: []byte(rt.Destination.ID()),
+ OutputInterface: int32(rt.NIC),
+ GatewayAddr: []byte(rt.Gateway),
+ })
+ }
+
+ return routeTable
+}
+
+// IPTables returns the stack's iptables.
+func (s *Stack) IPTables() (iptables.IPTables, error) {
+ return s.Stack.IPTables(), nil
+}
+
+// FillDefaultIPTables sets the stack's iptables to the default tables, which
+// allow all traffic and do not modify it.
+func (s *Stack) FillDefaultIPTables() {
+ netfilter.FillDefaultIPTables(s.Stack)
+}
+
+// Resume implements inet.Stack.Resume.
+func (s *Stack) Resume() {
+ s.Stack.Resume()
+}
diff --git a/pkg/sentry/socket/hostinet/socket.go b/pkg/sentry/socket/hostinet/socket.go
index 7f69406b7..92beb1bcf 100644
--- a/pkg/sentry/socket/hostinet/socket.go
+++ b/pkg/sentry/socket/hostinet/socket.go
@@ -189,15 +189,16 @@ func (s *socketOperations) Connect(t *kernel.Task, sockaddr []byte, blocking boo
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
- var peerAddr []byte
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
+ var peerAddr linux.SockAddr
+ var peerAddrBuf []byte
var peerAddrlen uint32
var peerAddrPtr *byte
var peerAddrlenPtr *uint32
if peerRequested {
- peerAddr = make([]byte, sizeofSockaddr)
- peerAddrlen = uint32(len(peerAddr))
- peerAddrPtr = &peerAddr[0]
+ peerAddrBuf = make([]byte, sizeofSockaddr)
+ peerAddrlen = uint32(len(peerAddrBuf))
+ peerAddrPtr = &peerAddrBuf[0]
peerAddrlenPtr = &peerAddrlen
}
@@ -222,7 +223,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
}
if peerRequested {
- peerAddr = peerAddr[:peerAddrlen]
+ peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen])
}
if syscallErr != nil {
return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
@@ -272,7 +273,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
if outLen < 0 {
return nil, syserr.ErrInvalidArgument
}
@@ -353,7 +354,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
}
// RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
// Whitelist flags.
//
// FIXME(jamieliu): We can't support MSG_ERRQUEUE because it uses ancillary
@@ -363,9 +364,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
}
- var senderAddr []byte
+ var senderAddr linux.SockAddr
+ var senderAddrBuf []byte
if senderRequested {
- senderAddr = make([]byte, sizeofSockaddr)
+ senderAddrBuf = make([]byte, sizeofSockaddr)
}
var msgFlags int
@@ -384,7 +386,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
if dsts.NumBlocks() == 1 {
// Skip allocating []syscall.Iovec.
- return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddr)
+ return recvfrom(s.fd, dsts.Head().ToSlice(), sysflags, &senderAddrBuf)
}
iovs := iovecsFromBlockSeq(dsts)
@@ -392,15 +394,15 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
Iov: &iovs[0],
Iovlen: uint64(len(iovs)),
}
- if len(senderAddr) != 0 {
- msg.Name = &senderAddr[0]
- msg.Namelen = uint32(len(senderAddr))
+ if len(senderAddrBuf) != 0 {
+ msg.Name = &senderAddrBuf[0]
+ msg.Namelen = uint32(len(senderAddrBuf))
}
n, err := recvmsg(s.fd, &msg, sysflags)
if err != nil {
return 0, err
}
- senderAddr = senderAddr[:msg.Namelen]
+ senderAddrBuf = senderAddrBuf[:msg.Namelen]
msgFlags = int(msg.Flags)
return n, nil
})
@@ -431,7 +433,10 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
// We don't allow control messages.
msgFlags &^= linux.MSG_CTRUNC
- return int(n), msgFlags, senderAddr, uint32(len(senderAddr)), socket.ControlMessages{}, syserr.FromError(err)
+ if senderRequested {
+ senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
+ }
+ return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), socket.ControlMessages{}, syserr.FromError(err)
}
// SendMsg implements socket.Socket.SendMsg.
diff --git a/pkg/sentry/socket/hostinet/socket_unsafe.go b/pkg/sentry/socket/hostinet/socket_unsafe.go
index 6c69ba9c7..e69ec38c2 100644
--- a/pkg/sentry/socket/hostinet/socket_unsafe.go
+++ b/pkg/sentry/socket/hostinet/socket_unsafe.go
@@ -18,10 +18,12 @@ import (
"syscall"
"unsafe"
+ "gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/socket"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserr"
"gvisor.dev/gvisor/pkg/syserror"
@@ -91,25 +93,25 @@ func getsockopt(fd int, level, name int, optlen int) ([]byte, error) {
}
// GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr := make([]byte, sizeofSockaddr)
addrlen := uint32(len(addr))
_, _, errno := syscall.Syscall(syscall.SYS_GETSOCKNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
if errno != 0 {
return nil, 0, syserr.FromError(errno)
}
- return addr[:addrlen], addrlen, nil
+ return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil
}
// GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr := make([]byte, sizeofSockaddr)
addrlen := uint32(len(addr))
_, _, errno := syscall.Syscall(syscall.SYS_GETPEERNAME, uintptr(s.fd), uintptr(unsafe.Pointer(&addr[0])), uintptr(unsafe.Pointer(&addrlen)))
if errno != 0 {
return nil, 0, syserr.FromError(errno)
}
- return addr[:addrlen], addrlen, nil
+ return socket.UnmarshalSockAddr(s.family, addr), addrlen, nil
}
func recvfrom(fd int, dst []byte, flags int, from *[]byte) (uint64, error) {
diff --git a/pkg/sentry/socket/hostinet/stack.go b/pkg/sentry/socket/hostinet/stack.go
index cc1f66fa1..3a4fdec47 100644
--- a/pkg/sentry/socket/hostinet/stack.go
+++ b/pkg/sentry/socket/hostinet/stack.go
@@ -46,6 +46,7 @@ type Stack struct {
// Stack is immutable.
interfaces map[int32]inet.Interface
interfaceAddrs map[int32][]inet.InterfaceAddr
+ routes []inet.Route
supportsIPv6 bool
tcpRecvBufSize inet.TCPBufferSize
tcpSendBufSize inet.TCPBufferSize
@@ -66,6 +67,10 @@ func (s *Stack) Configure() error {
return err
}
+ if err := addHostRoutes(s); err != nil {
+ return err
+ }
+
if _, err := os.Stat("/proc/net/if_inet6"); err == nil {
s.supportsIPv6 = true
}
@@ -161,6 +166,60 @@ func ExtractHostInterfaces(links []syscall.NetlinkMessage, addrs []syscall.Netli
return nil
}
+// ExtractHostRoutes parses the given netlink route messages and returns the
+// routes they describe.
+func ExtractHostRoutes(routeMsgs []syscall.NetlinkMessage) ([]inet.Route, error) {
+ var routes []inet.Route
+ for _, routeMsg := range routeMsgs {
+ if routeMsg.Header.Type != syscall.RTM_NEWROUTE {
+ continue
+ }
+
+ var ifRoute syscall.RtMsg
+ binary.Unmarshal(routeMsg.Data[:syscall.SizeofRtMsg], usermem.ByteOrder, &ifRoute)
+ inetRoute := inet.Route{
+ Family: ifRoute.Family,
+ DstLen: ifRoute.Dst_len,
+ SrcLen: ifRoute.Src_len,
+ TOS: ifRoute.Tos,
+ Table: ifRoute.Table,
+ Protocol: ifRoute.Protocol,
+ Scope: ifRoute.Scope,
+ Type: ifRoute.Type,
+ Flags: ifRoute.Flags,
+ }
+
+ // Not clearly documented: syscall.ParseNetlinkRouteAttr will check the
+ // syscall.NetlinkMessage.Header.Type and skip the struct rtmsg
+ // accordingly.
+ attrs, err := syscall.ParseNetlinkRouteAttr(&routeMsg)
+ if err != nil {
+ return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid rtattrs: %v", err)
+ }
+
+ for _, attr := range attrs {
+ switch attr.Attr.Type {
+ case syscall.RTA_DST:
+ inetRoute.DstAddr = attr.Value
+ case syscall.RTA_SRC:
+ inetRoute.SrcAddr = attr.Value
+ case syscall.RTA_GATEWAY:
+ inetRoute.GatewayAddr = attr.Value
+ case syscall.RTA_OIF:
+ expected := int(binary.Size(inetRoute.OutputInterface))
+ if len(attr.Value) != expected {
+ return nil, fmt.Errorf("RTM_GETROUTE returned RTM_NEWROUTE message with invalid attribute data length (%d bytes, expected %d bytes)", len(attr.Value), expected)
+ }
+ binary.Unmarshal(attr.Value, usermem.ByteOrder, &inetRoute.OutputInterface)
+ }
+ }
+
+ routes = append(routes, inetRoute)
+ }
+
+ return routes, nil
+}
+
func addHostInterfaces(s *Stack) error {
links, err := doNetlinkRouteRequest(syscall.RTM_GETLINK)
if err != nil {
@@ -175,6 +234,20 @@ func addHostInterfaces(s *Stack) error {
return ExtractHostInterfaces(links, addrs, s.interfaces, s.interfaceAddrs)
}
+func addHostRoutes(s *Stack) error {
+ routes, err := doNetlinkRouteRequest(syscall.RTM_GETROUTE)
+ if err != nil {
+ return fmt.Errorf("RTM_GETROUTE failed: %v", err)
+ }
+
+ s.routes, err = ExtractHostRoutes(routes)
+ if err != nil {
+ return err
+ }
+
+ return nil
+}
+
func doNetlinkRouteRequest(req int) ([]syscall.NetlinkMessage, error) {
data, err := syscall.NetlinkRIB(req, syscall.AF_UNSPEC)
if err != nil {
@@ -202,12 +275,20 @@ func readTCPBufferSizeFile(filename string) (inet.TCPBufferSize, error) {
// Interfaces implements inet.Stack.Interfaces.
func (s *Stack) Interfaces() map[int32]inet.Interface {
- return s.interfaces
+ interfaces := make(map[int32]inet.Interface)
+ for k, v := range s.interfaces {
+ interfaces[k] = v
+ }
+ return interfaces
}
// InterfaceAddrs implements inet.Stack.InterfaceAddrs.
func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
- return s.interfaceAddrs
+ addrs := make(map[int32][]inet.InterfaceAddr)
+ for k, v := range s.interfaceAddrs {
+ addrs[k] = append([]inet.InterfaceAddr(nil), v...)
+ }
+ return addrs
}
// SupportsIPv6 implements inet.Stack.SupportsIPv6.
@@ -249,3 +330,11 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
func (s *Stack) Statistics(stat interface{}, arg string) error {
return syserror.EOPNOTSUPP
}
+
+// RouteTable implements inet.Stack.RouteTable.
+func (s *Stack) RouteTable() []inet.Route {
+ return append([]inet.Route(nil), s.routes...)
+}
+
+// Resume implements inet.Stack.Resume.
+func (s *Stack) Resume() {}
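
Note: the Interfaces, InterfaceAddrs and RouteTable accessors above now return copies rather than the stack's internal map and slice. A minimal standalone sketch (hypothetical names, not part of this patch) of why the copy matters:

package main

import "fmt"

// toyStack mimics the copy-on-read accessor pattern used above.
type toyStack struct {
    routes []string
}

// RouteTable returns a fresh slice so callers cannot mutate internal state.
func (s *toyStack) RouteTable() []string {
    return append([]string(nil), s.routes...)
}

func main() {
    s := &toyStack{routes: []string{"default via 10.0.0.1"}}
    rt := s.RouteTable()
    rt[0] = "mangled"
    fmt.Println(s.routes[0]) // still "default via 10.0.0.1": the copy shields the stack
}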
diff --git a/pkg/sentry/socket/netfilter/BUILD b/pkg/sentry/socket/netfilter/BUILD
new file mode 100644
index 000000000..354a0d6ee
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/BUILD
@@ -0,0 +1,24 @@
+package(licenses = ["notice"])
+
+load("//tools/go_stateify:defs.bzl", "go_library")
+
+go_library(
+ name = "netfilter",
+ srcs = [
+ "netfilter.go",
+ ],
+ importpath = "gvisor.dev/gvisor/pkg/sentry/socket/netfilter",
+ # This target depends on netstack and should only be used by epsocket,
+ # which is allowed to depend on netstack.
+ visibility = ["//pkg/sentry:internal"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/binary",
+ "//pkg/sentry/kernel",
+ "//pkg/sentry/usermem",
+ "//pkg/syserr",
+ "//pkg/tcpip",
+ "//pkg/tcpip/iptables",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/sentry/socket/netfilter/netfilter.go b/pkg/sentry/socket/netfilter/netfilter.go
new file mode 100644
index 000000000..9f87c32f1
--- /dev/null
+++ b/pkg/sentry/socket/netfilter/netfilter.go
@@ -0,0 +1,286 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package netfilter helps the sentry interact with netstack's netfilter
+// capabilities.
+package netfilter
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserr"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/iptables"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// errorTargetName is used to mark targets as error targets. Error targets
+// shouldn't be reached - an error has occurred if we fall through to one.
+const errorTargetName = "ERROR"
+
+// metadata is opaque to netstack. It holds data that we need to translate
+// between Linux's and netstack's iptables representations.
+type metadata struct {
+ HookEntry [linux.NF_INET_NUMHOOKS]uint32
+ Underflow [linux.NF_INET_NUMHOOKS]uint32
+ NumEntries uint32
+ Size uint32
+}
+
+// GetInfo returns information about iptables.
+func GetInfo(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr) (linux.IPTGetinfo, *syserr.Error) {
+ // Read in the struct and table name.
+ var info linux.IPTGetinfo
+ if _, err := t.CopyIn(outPtr, &info); err != nil {
+ return linux.IPTGetinfo{}, syserr.FromError(err)
+ }
+
+ // Find the appropriate table.
+ table, err := findTable(ep, info.TableName())
+ if err != nil {
+ return linux.IPTGetinfo{}, err
+ }
+
+ // Get the hooks that apply to this table.
+ info.ValidHooks = table.ValidHooks()
+
+ // Grab the metadata struct, which is used to store information (e.g.
+ // the number of entries) that applies to the user's encoding of
+ // iptables, but not netstack's.
+ metadata := table.Metadata().(metadata)
+
+ // Set values from metadata.
+ info.HookEntry = metadata.HookEntry
+ info.Underflow = metadata.Underflow
+ info.NumEntries = metadata.NumEntries
+ info.Size = metadata.Size
+
+ return info, nil
+}
+
+// GetEntries returns netstack's iptables rules encoded for the iptables tool.
+func GetEntries(t *kernel.Task, ep tcpip.Endpoint, outPtr usermem.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) {
+ // Read in the struct and table name.
+ var userEntries linux.IPTGetEntries
+ if _, err := t.CopyIn(outPtr, &userEntries); err != nil {
+ return linux.KernelIPTGetEntries{}, syserr.FromError(err)
+ }
+
+ // Find the appropriate table.
+ table, err := findTable(ep, userEntries.TableName())
+ if err != nil {
+ return linux.KernelIPTGetEntries{}, err
+ }
+
+ // Convert netstack's iptables rules to something that the iptables
+ // tool can understand.
+ entries, _, err := convertNetstackToBinary(userEntries.TableName(), table)
+ if err != nil {
+ return linux.KernelIPTGetEntries{}, err
+ }
+ if binary.Size(entries) > uintptr(outLen) {
+ return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument
+ }
+
+ return entries, nil
+}
+
+func findTable(ep tcpip.Endpoint, tableName string) (iptables.Table, *syserr.Error) {
+ ipt, err := ep.IPTables()
+ if err != nil {
+ return iptables.Table{}, syserr.FromError(err)
+ }
+ table, ok := ipt.Tables[tableName]
+ if !ok {
+ return iptables.Table{}, syserr.ErrInvalidArgument
+ }
+ return table, nil
+}
+
+// FillDefaultIPTables sets the stack's IPTables to the default tables and
+// populates them with metadata.
+func FillDefaultIPTables(stack *stack.Stack) {
+ ipt := iptables.DefaultTables()
+
+ // In order to fill in the metadata, we have to translate ipt from its
+ // netstack format to Linux's giant-binary-blob format.
+ for name, table := range ipt.Tables {
+ _, metadata, err := convertNetstackToBinary(name, table)
+ if err != nil {
+ panic(fmt.Errorf("Unable to set default IP tables: %v", err))
+ }
+ table.SetMetadata(metadata)
+ ipt.Tables[name] = table
+ }
+
+ stack.SetIPTables(ipt)
+}
+
+// convertNetstackToBinary converts the iptables as stored in netstack to the
+// format expected by the iptables tool. Linux stores each table as a binary
+// blob that can only be traversed by parsing a bit, reading some offsets,
+// jumping to those offsets, parsing again, etc.
+func convertNetstackToBinary(name string, table iptables.Table) (linux.KernelIPTGetEntries, metadata, *syserr.Error) {
+ // Return values.
+ var entries linux.KernelIPTGetEntries
+ var meta metadata
+
+ // The table name has to fit in the struct.
+ if linux.XT_TABLE_MAXNAMELEN < len(name) {
+ return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+ }
+ copy(entries.Name[:], name)
+
+ // Deal with the built-in chains first (INPUT, OUTPUT, etc.). Each of
+ // these chains ends with an unconditional policy entry.
+ for hook := iptables.Prerouting; hook < iptables.NumHooks; hook++ {
+ chain, ok := table.BuiltinChains[hook]
+ if !ok {
+ // This table doesn't support this hook.
+ continue
+ }
+
+ // Sanity check.
+ if len(chain.Rules) < 1 {
+ return linux.KernelIPTGetEntries{}, metadata{}, syserr.ErrInvalidArgument
+ }
+
+ for ruleIdx, rule := range chain.Rules {
+ // If this is the first rule of a builtin chain, set
+ // the metadata hook entry point.
+ if ruleIdx == 0 {
+ meta.HookEntry[hook] = entries.Size
+ }
+
+ // Each rule corresponds to an entry.
+ entry := linux.KernelIPTEntry{
+ IPTEntry: linux.IPTEntry{
+ NextOffset: linux.SizeOfIPTEntry,
+ TargetOffset: linux.SizeOfIPTEntry,
+ },
+ }
+
+ for _, matcher := range rule.Matchers {
+ // Serialize the matcher and add it to the
+ // entry.
+ serialized := marshalMatcher(matcher)
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.NextOffset += uint16(len(serialized))
+ entry.TargetOffset += uint16(len(serialized))
+ }
+
+ // Serialize and append the target.
+ serialized := marshalTarget(rule.Target)
+ entry.Elems = append(entry.Elems, serialized...)
+ entry.NextOffset += uint16(len(serialized))
+
+ // The underflow rule is the last rule in the chain,
+ // and is an unconditional rule (i.e. it matches any
+ // packet). This is enforced when saving iptables.
+ if ruleIdx == len(chain.Rules)-1 {
+ meta.Underflow[hook] = entries.Size
+ }
+
+ entries.Size += uint32(entry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, entry)
+ meta.NumEntries++
+ }
+
+ }
+
+ // TODO(gvisor.dev/issue/170): Deal with the user chains here. Each of
+ // these starts with an error node holding the chain's name and ends
+ // with an unconditional return.
+
+ // Lastly, each table ends with an unconditional error target rule as
+ // its final entry.
+ errorEntry := linux.KernelIPTEntry{
+ IPTEntry: linux.IPTEntry{
+ NextOffset: linux.SizeOfIPTEntry,
+ TargetOffset: linux.SizeOfIPTEntry,
+ },
+ }
+ var errorTarget linux.XTErrorTarget
+ errorTarget.Target.TargetSize = linux.SizeOfXTErrorTarget
+ copy(errorTarget.ErrorName[:], errorTargetName)
+ copy(errorTarget.Target.Name[:], errorTargetName)
+
+ // Serialize and add it to the list of entries.
+ errorTargetBuf := make([]byte, 0, linux.SizeOfXTErrorTarget)
+ serializedErrorTarget := binary.Marshal(errorTargetBuf, usermem.ByteOrder, errorTarget)
+ errorEntry.Elems = append(errorEntry.Elems, serializedErrorTarget...)
+ errorEntry.NextOffset += uint16(len(serializedErrorTarget))
+
+ entries.Size += uint32(errorEntry.NextOffset)
+ entries.Entrytable = append(entries.Entrytable, errorEntry)
+ meta.NumEntries++
+ meta.Size = entries.Size
+
+ return entries, meta, nil
+}
+
+func marshalMatcher(matcher iptables.Matcher) []byte {
+ switch matcher.(type) {
+ default:
+ // TODO(gvisor.dev/issue/170): We don't support any matchers yet, so
+ // any call to marshalMatcher will panic.
+ panic(fmt.Errorf("unknown matcher of type %T", matcher))
+ }
+}
+
+func marshalTarget(target iptables.Target) []byte {
+ switch target.(type) {
+ case iptables.UnconditionalAcceptTarget:
+ return marshalUnconditionalAcceptTarget()
+ default:
+ panic(fmt.Errorf("unknown target of type %T", target))
+ }
+}
+
+func marshalUnconditionalAcceptTarget() []byte {
+ // The target's name will be the empty string.
+ target := linux.XTStandardTarget{
+ Target: linux.XTEntryTarget{
+ TargetSize: linux.SizeOfXTStandardTarget,
+ },
+ Verdict: translateStandardVerdict(iptables.Accept),
+ }
+
+ ret := make([]byte, 0, linux.SizeOfXTStandardTarget)
+ return binary.Marshal(ret, usermem.ByteOrder, target)
+}
+
+// translateStandardVerdict translates verdicts the same way as the iptables
+// tool.
+func translateStandardVerdict(verdict iptables.Verdict) int32 {
+ switch verdict {
+ case iptables.Accept:
+ return -linux.NF_ACCEPT - 1
+ case iptables.Drop:
+ return -linux.NF_DROP - 1
+ case iptables.Queue:
+ return -linux.NF_QUEUE - 1
+ case iptables.Return:
+ return linux.NF_RETURN
+ case iptables.Jump:
+ // TODO(gvisor.dev/issue/170): Support Jump.
+ panic("Jump isn't supported yet")
+ default:
+ panic(fmt.Sprintf("unknown standard verdict: %d", verdict))
+ }
+}
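
Note: translateStandardVerdict above follows the iptables tool's convention of encoding a built-in verdict v as -v - 1 in a standard target. A worked standalone example, assuming the conventional netfilter values NF_DROP = 0 and NF_ACCEPT = 1 (not shown in this patch):

package main

import "fmt"

func main() {
    // Assumed values of linux.NF_DROP and linux.NF_ACCEPT.
    const nfDrop, nfAccept = 0, 1
    fmt.Println(-nfAccept - 1) // -2: ACCEPT as stored in a standard target's Verdict
    fmt.Println(-nfDrop - 1)   // -1: DROP as stored in a standard target's Verdict
}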
diff --git a/pkg/sentry/socket/netlink/route/protocol.go b/pkg/sentry/socket/netlink/route/protocol.go
index fb1ff329c..cc70ac237 100644
--- a/pkg/sentry/socket/netlink/route/protocol.go
+++ b/pkg/sentry/socket/netlink/route/protocol.go
@@ -110,7 +110,7 @@ func (p *Protocol) dumpLinks(ctx context.Context, hdr linux.NetlinkMessageHeader
m.PutAttr(linux.IFLA_ADDRESS, mac)
m.PutAttr(linux.IFLA_BROADCAST, brd)
- // TODO(b/68878065): There are many more attributes.
+ // TODO(gvisor.dev/issue/578): There are many more attributes.
}
return nil
@@ -151,13 +151,69 @@ func (p *Protocol) dumpAddrs(ctx context.Context, hdr linux.NetlinkMessageHeader
m.PutAttr(linux.IFA_ADDRESS, []byte(a.Addr))
- // TODO(b/68878065): There are many more attributes.
+ // TODO(gvisor.dev/issue/578): There are many more attributes.
}
}
return nil
}
+// dumpRoutes handles RTM_GETROUTE + NLM_F_DUMP requests.
+func (p *Protocol) dumpRoutes(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
+ // RTM_GETROUTE dump requests need not contain anything more than the
+ // netlink header and 1 byte protocol family common to all
+ // NETLINK_ROUTE requests.
+
+ // We always send back an NLMSG_DONE.
+ ms.Multi = true
+
+ stack := inet.StackFromContext(ctx)
+ if stack == nil {
+ // No network routes.
+ return nil
+ }
+
+ for _, rt := range stack.RouteTable() {
+ m := ms.AddMessage(linux.NetlinkMessageHeader{
+ Type: linux.RTM_NEWROUTE,
+ })
+
+ m.Put(linux.RouteMessage{
+ Family: rt.Family,
+ DstLen: rt.DstLen,
+ SrcLen: rt.SrcLen,
+ TOS: rt.TOS,
+
+ // Always return the main table since we don't have multiple
+ // routing tables.
+ Table: linux.RT_TABLE_MAIN,
+ Protocol: rt.Protocol,
+ Scope: rt.Scope,
+ Type: rt.Type,
+
+ Flags: rt.Flags,
+ })
+
+ if rt.DstLen > 0 {
+ m.PutAttr(linux.RTA_DST, rt.DstAddr)
+ }
+ if rt.SrcLen > 0 {
+ m.PutAttr(linux.RTA_SRC, rt.SrcAddr)
+ }
+ if rt.OutputInterface != 0 {
+ m.PutAttr(linux.RTA_OIF, rt.OutputInterface)
+ }
+ if len(rt.GatewayAddr) > 0 {
+ m.PutAttr(linux.RTA_GATEWAY, rt.GatewayAddr)
+ }
+
+ // TODO(gvisor.dev/issue/578): There are many more attributes.
+ }
+
+ return nil
+}
+
// ProcessMessage implements netlink.Protocol.ProcessMessage.
func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, data []byte, ms *netlink.MessageSet) *syserr.Error {
// All messages start with a 1 byte protocol family.
@@ -186,6 +242,8 @@ func (p *Protocol) ProcessMessage(ctx context.Context, hdr linux.NetlinkMessageH
return p.dumpLinks(ctx, hdr, data, ms)
case linux.RTM_GETADDR:
return p.dumpAddrs(ctx, hdr, data, ms)
+ case linux.RTM_GETROUTE:
+ return p.dumpRoutes(ctx, hdr, data, ms)
default:
return syserr.ErrNotSupported
}
diff --git a/pkg/sentry/socket/netlink/socket.go b/pkg/sentry/socket/netlink/socket.go
index f3d6c1e9b..d0aab293d 100644
--- a/pkg/sentry/socket/netlink/socket.go
+++ b/pkg/sentry/socket/netlink/socket.go
@@ -271,7 +271,7 @@ func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr
}
// Accept implements socket.Socket.Accept.
-func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Netlink sockets never support accept.
return 0, nil, 0, syserr.ErrNotSupported
}
@@ -289,7 +289,7 @@ func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
switch level {
case linux.SOL_SOCKET:
switch name {
@@ -379,11 +379,11 @@ func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *sy
}
// GetSockName implements socket.Socket.GetSockName.
-func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
s.mu.Lock()
defer s.mu.Unlock()
- sa := linux.SockAddrNetlink{
+ sa := &linux.SockAddrNetlink{
Family: linux.AF_NETLINK,
PortID: uint32(s.portID),
}
@@ -391,8 +391,8 @@ func (s *Socket) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error
}
// GetPeerName implements socket.Socket.GetPeerName.
-func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
- sa := linux.SockAddrNetlink{
+func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
+ sa := &linux.SockAddrNetlink{
Family: linux.AF_NETLINK,
// TODO(b/68878065): Support non-kernel peers. For now the peer
// must be the kernel.
@@ -402,8 +402,8 @@ func (s *Socket) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error
}
// RecvMsg implements socket.Socket.RecvMsg.
-func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
- from := linux.SockAddrNetlink{
+func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
+ from := &linux.SockAddrNetlink{
Family: linux.AF_NETLINK,
PortID: 0,
}
@@ -511,6 +511,19 @@ func (s *Socket) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error
return nil
}
+func (s *Socket) dumpErrorMessage(ctx context.Context, hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) *syserr.Error {
+ m := ms.AddMessage(linux.NetlinkMessageHeader{
+ Type: linux.NLMSG_ERROR,
+ })
+
+ m.Put(linux.NetlinkErrorMessage{
+ Error: int32(-err.ToLinux().Number()),
+ Header: hdr,
+ })
+ return nil
+}
+
// processMessages handles each message in buf, passing it to the protocol
// handler for final handling.
func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
@@ -545,14 +558,20 @@ func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error
continue
}
+ ms := NewMessageSet(s.portID, hdr.Seq)
+ var err *syserr.Error
// TODO(b/68877377): ACKs not supported yet.
if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
- return syserr.ErrNotSupported
- }
+ err = syserr.ErrNotSupported
+ } else {
- ms := NewMessageSet(s.portID, hdr.Seq)
- if err := s.protocol.ProcessMessage(ctx, hdr, data, ms); err != nil {
- return err
+ err = s.protocol.ProcessMessage(ctx, hdr, data, ms)
+ }
+ if err != nil {
+ ms = NewMessageSet(s.portID, hdr.Seq)
+ if err := s.dumpErrorMessage(ctx, hdr, ms, err); err != nil {
+ return err
+ }
}
if err := s.sendResponse(ctx, ms); err != nil {
diff --git a/pkg/sentry/socket/rpcinet/notifier/BUILD b/pkg/sentry/socket/rpcinet/notifier/BUILD
index a536f2e44..a3585e10d 100644
--- a/pkg/sentry/socket/rpcinet/notifier/BUILD
+++ b/pkg/sentry/socket/rpcinet/notifier/BUILD
@@ -6,10 +6,11 @@ go_library(
name = "notifier",
srcs = ["notifier.go"],
importpath = "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/notifier",
- visibility = ["//pkg/sentry:internal"],
+ visibility = ["//:sandbox"],
deps = [
"//pkg/sentry/socket/rpcinet:syscall_rpc_go_proto",
"//pkg/sentry/socket/rpcinet/conn",
"//pkg/waiter",
+ "@org_golang_x_sys//unix:go_default_library",
],
)
diff --git a/pkg/sentry/socket/rpcinet/notifier/notifier.go b/pkg/sentry/socket/rpcinet/notifier/notifier.go
index aa157dd51..7efe4301f 100644
--- a/pkg/sentry/socket/rpcinet/notifier/notifier.go
+++ b/pkg/sentry/socket/rpcinet/notifier/notifier.go
@@ -20,6 +20,7 @@ import (
"sync"
"syscall"
+ "golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/conn"
pb "gvisor.dev/gvisor/pkg/sentry/socket/rpcinet/syscall_rpc_go_proto"
"gvisor.dev/gvisor/pkg/waiter"
@@ -76,7 +77,7 @@ func (n *Notifier) waitFD(fd uint32, fi *fdInfo, mask waiter.EventMask) error {
}
e := pb.EpollEvent{
- Events: mask.ToLinux() | -syscall.EPOLLET,
+ Events: mask.ToLinux() | unix.EPOLLET,
Fd: fd,
}
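
Note: the switch to unix.EPOLLET avoids a double negative: on linux/amd64 the standard library's syscall package defines EPOLLET as -0x80000000, which is why the old code negated it. A tiny standalone check (assuming linux/amd64) that both spellings name the same bit:

package main

import (
    "fmt"
    "syscall"

    "golang.org/x/sys/unix"
)

func main() {
    // syscall.EPOLLET is negative on linux/amd64; unix.EPOLLET is the raw bit.
    fmt.Println(uint32(-syscall.EPOLLET) == uint32(unix.EPOLLET)) // true
}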
diff --git a/pkg/sentry/socket/rpcinet/socket.go b/pkg/sentry/socket/rpcinet/socket.go
index ccaaddbfc..ddb76d9d4 100644
--- a/pkg/sentry/socket/rpcinet/socket.go
+++ b/pkg/sentry/socket/rpcinet/socket.go
@@ -285,7 +285,7 @@ func rpcAccept(t *kernel.Task, fd uint32, peer bool) (*pb.AcceptResponse_ResultP
}
// Accept implements socket.Socket.Accept.
-func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
payload, se := rpcAccept(t, s.fd, peerRequested)
// Check if we need to block.
@@ -328,6 +328,9 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
NonBlocking: flags&linux.SOCK_NONBLOCK != 0,
}
file := fs.NewFile(t, dirent, fileFlags, &socketOperations{
+ family: s.family,
+ stype: s.stype,
+ protocol: s.protocol,
wq: &wq,
fd: payload.Fd,
rpcConn: s.rpcConn,
@@ -344,7 +347,7 @@ func (s *socketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
t.Kernel().RecordSocket(file)
if peerRequested {
- return fd, payload.Address.Address, payload.Address.Length, nil
+ return fd, socket.UnmarshalSockAddr(s.family, payload.Address.Address), payload.Address.Length, nil
}
return fd, nil, 0, nil
@@ -395,7 +398,7 @@ func (s *socketOperations) Shutdown(t *kernel.Task, how int) *syserr.Error {
}
// GetSockOpt implements socket.Socket.GetSockOpt.
-func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error) {
+func (s *socketOperations) GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
// SO_RCVTIMEO and SO_SNDTIMEO are special because blocking is performed
// within the sentry.
if level == linux.SOL_SOCKET && name == linux.SO_RCVTIMEO {
@@ -469,7 +472,7 @@ func (s *socketOperations) SetSockOpt(t *kernel.Task, level int, name int, opt [
}
// GetPeerName implements socket.Socket.GetPeerName.
-func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
stack := t.NetworkContext().(*Stack)
id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetPeerName{&pb.GetPeerNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
<-c
@@ -480,11 +483,11 @@ func (s *socketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy
}
addr := res.(*pb.GetPeerNameResponse_Address).Address
- return addr.Address, addr.Length, nil
+ return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
}
// GetSockName implements socket.Socket.GetSockName.
-func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *socketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
stack := t.NetworkContext().(*Stack)
id, c := stack.rpcConn.NewRequest(pb.SyscallRequest{Args: &pb.SyscallRequest_GetSockName{&pb.GetSockNameRequest{Fd: s.fd}}}, false /* ignoreResult */)
<-c
@@ -495,7 +498,7 @@ func (s *socketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *sy
}
addr := res.(*pb.GetSockNameResponse_Address).Address
- return addr.Address, addr.Length, nil
+ return socket.UnmarshalSockAddr(s.family, addr.Address), addr.Length, nil
}
func rpcIoctl(t *kernel.Task, fd, cmd uint32, arg []byte) ([]byte, error) {
@@ -682,7 +685,7 @@ func (s *socketOperations) extractControlMessages(payload *pb.RecvmsgResponse_Re
}
// RecvMsg implements socket.Socket.RecvMsg.
-func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, interface{}, uint32, socket.ControlMessages, *syserr.Error) {
+func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
req := &pb.SyscallRequest_Recvmsg{&pb.RecvmsgRequest{
Fd: s.fd,
Length: uint32(dst.NumBytes()),
@@ -703,7 +706,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
}
c := s.extractControlMessages(res)
- return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e)
+ return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
}
if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain || flags&linux.MSG_DONTWAIT != 0 {
return 0, 0, nil, 0, socket.ControlMessages{}, err
@@ -727,7 +730,7 @@ func (s *socketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
}
c := s.extractControlMessages(res)
- return int(res.Length), 0, res.Address.GetAddress(), res.Address.GetLength(), c, syserr.FromError(e)
+ return int(res.Length), 0, socket.UnmarshalSockAddr(s.family, res.Address.GetAddress()), res.Address.GetLength(), c, syserr.FromError(e)
}
if err != syserr.ErrWouldBlock && err != syserr.ErrTryAgain {
return 0, 0, nil, 0, socket.ControlMessages{}, err
diff --git a/pkg/sentry/socket/rpcinet/stack.go b/pkg/sentry/socket/rpcinet/stack.go
index 49bd3a220..5dcb6b455 100644
--- a/pkg/sentry/socket/rpcinet/stack.go
+++ b/pkg/sentry/socket/rpcinet/stack.go
@@ -30,6 +30,7 @@ import (
type Stack struct {
interfaces map[int32]inet.Interface
interfaceAddrs map[int32][]inet.InterfaceAddr
+ routes []inet.Route
rpcConn *conn.RPCConnection
notifier *notifier.Notifier
}
@@ -69,6 +70,16 @@ func NewStack(fd int32) (*Stack, error) {
return nil, e
}
+ routes, err := stack.DoNetlinkRouteRequest(syscall.RTM_GETROUTE)
+ if err != nil {
+ return nil, fmt.Errorf("RTM_GETROUTE failed: %v", err)
+ }
+
+ stack.routes, e = hostinet.ExtractHostRoutes(routes)
+ if e != nil {
+ return nil, e
+ }
+
return stack, nil
}
@@ -89,12 +100,20 @@ func (s *Stack) RPCWriteFile(path string, data []byte) (int64, *syserr.Error) {
// Interfaces implements inet.Stack.Interfaces.
func (s *Stack) Interfaces() map[int32]inet.Interface {
- return s.interfaces
+ interfaces := make(map[int32]inet.Interface)
+ for k, v := range s.interfaces {
+ interfaces[k] = v
+ }
+ return interfaces
}
// InterfaceAddrs implements inet.Stack.InterfaceAddrs.
func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr {
- return s.interfaceAddrs
+ addrs := make(map[int32][]inet.InterfaceAddr)
+ for k, v := range s.interfaceAddrs {
+ addrs[k] = append([]inet.InterfaceAddr(nil), v...)
+ }
+ return addrs
}
// SupportsIPv6 implements inet.Stack.SupportsIPv6.
@@ -138,3 +157,11 @@ func (s *Stack) SetTCPSACKEnabled(enabled bool) error {
func (s *Stack) Statistics(stat interface{}, arg string) error {
return syserr.ErrEndpointOperation.ToError()
}
+
+// RouteTable implements inet.Stack.RouteTable.
+func (s *Stack) RouteTable() []inet.Route {
+ return append([]inet.Route(nil), s.routes...)
+}
+
+// Resume implements inet.Stack.Resume.
+func (s *Stack) Resume() {}
diff --git a/pkg/sentry/socket/socket.go b/pkg/sentry/socket/socket.go
index 0efa58a58..8c250c325 100644
--- a/pkg/sentry/socket/socket.go
+++ b/pkg/sentry/socket/socket.go
@@ -20,8 +20,10 @@ package socket
import (
"fmt"
"sync/atomic"
+ "syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -52,7 +54,7 @@ type Socket interface {
// Accept implements the accept4(2) linux syscall.
// Returns fd, real peer address length and error. Real peer address
// length is only set if len(peer) > 0.
- Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error)
+ Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error)
// Bind implements the bind(2) linux syscall.
Bind(t *kernel.Task, sockaddr []byte) *syserr.Error
@@ -64,7 +66,7 @@ type Socket interface {
Shutdown(t *kernel.Task, how int) *syserr.Error
// GetSockOpt implements the getsockopt(2) linux syscall.
- GetSockOpt(t *kernel.Task, level int, name int, outLen int) (interface{}, *syserr.Error)
+ GetSockOpt(t *kernel.Task, level int, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error)
// SetSockOpt implements the setsockopt(2) linux syscall.
SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error
@@ -73,13 +75,13 @@ type Socket interface {
//
// addrLen is the address length to be returned to the application, not
// necessarily the actual length of the address.
- GetSockName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)
+ GetSockName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)
// GetPeerName implements the getpeername(2) linux syscall.
//
// addrLen is the address length to be returned to the application, not
// necessarily the actual length of the address.
- GetPeerName(t *kernel.Task) (addr interface{}, addrLen uint32, err *syserr.Error)
+ GetPeerName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)
// RecvMsg implements the recvmsg(2) linux syscall.
//
@@ -92,7 +94,7 @@ type Socket interface {
// msgFlags. In that case, the caller should set MSG_CTRUNC appropriately.
//
// If err != nil, the recv was not successful.
- RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)
+ RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)
// SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take
// ownership of the ControlMessage on error.
@@ -340,3 +342,31 @@ func emitUnimplementedEvent(t *kernel.Task, name int) {
t.Kernel().EmitUnimplementedEvent(t)
}
}
+
+// UnmarshalSockAddr unmarshals memory representing a struct sockaddr to one of
+// the ABI socket address types.
+//
+// Precondition: data must be long enough to represent a socket address of the
+// given family.
+func UnmarshalSockAddr(family int, data []byte) linux.SockAddr {
+ switch family {
+ case syscall.AF_INET:
+ var addr linux.SockAddrInet
+ binary.Unmarshal(data[:syscall.SizeofSockaddrInet4], usermem.ByteOrder, &addr)
+ return &addr
+ case syscall.AF_INET6:
+ var addr linux.SockAddrInet6
+ binary.Unmarshal(data[:syscall.SizeofSockaddrInet6], usermem.ByteOrder, &addr)
+ return &addr
+ case syscall.AF_UNIX:
+ var addr linux.SockAddrUnix
+ binary.Unmarshal(data[:syscall.SizeofSockaddrUnix], usermem.ByteOrder, &addr)
+ return &addr
+ case syscall.AF_NETLINK:
+ var addr linux.SockAddrNetlink
+ binary.Unmarshal(data[:syscall.SizeofSockaddrNetlink], usermem.ByteOrder, &addr)
+ return &addr
+ default:
+ panic(fmt.Sprintf("Unsupported socket family %v", family))
+ }
+}
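
Note: a minimal usage sketch for UnmarshalSockAddr (hypothetical, not part of this patch), assuming the usual struct sockaddr_in layout with the family field first, in host byte order:

package main

import (
    "fmt"
    "syscall"

    "gvisor.dev/gvisor/pkg/sentry/socket"
    "gvisor.dev/gvisor/pkg/sentry/usermem"
)

func main() {
    // Build a zeroed AF_INET sockaddr buffer and decode it into the ABI type.
    buf := make([]byte, syscall.SizeofSockaddrInet4)
    usermem.ByteOrder.PutUint16(buf[0:2], uint16(syscall.AF_INET))
    sa := socket.UnmarshalSockAddr(syscall.AF_INET, buf)
    fmt.Printf("%T\n", sa) // *linux.SockAddrInet
}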
diff --git a/pkg/sentry/socket/unix/io.go b/pkg/sentry/socket/unix/io.go
index 760c7beab..2ec1a662d 100644
--- a/pkg/sentry/socket/unix/io.go
+++ b/pkg/sentry/socket/unix/io.go
@@ -62,7 +62,7 @@ type EndpointReader struct {
Creds bool
// NumRights is the number of SCM_RIGHTS FDs requested.
- NumRights uintptr
+ NumRights int
// Peek indicates that the data should not be consumed from the
// endpoint.
@@ -70,7 +70,7 @@ type EndpointReader struct {
// MsgSize is the size of the message that was read from. For stream
// sockets, it is the amount read.
- MsgSize uintptr
+ MsgSize int64
// From, if not nil, will be set with the address read from.
From *tcpip.FullAddress
diff --git a/pkg/sentry/socket/unix/transport/connectioned.go b/pkg/sentry/socket/unix/transport/connectioned.go
index 73d2df15d..4bd15808a 100644
--- a/pkg/sentry/socket/unix/transport/connectioned.go
+++ b/pkg/sentry/socket/unix/transport/connectioned.go
@@ -436,7 +436,7 @@ func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syser
// SendMsg writes data and a control message to the endpoint's peer.
// This method does not block if the data cannot be written.
-func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
// Stream sockets do not support specifying the endpoint. Seqpacket
// sockets ignore the passed endpoint.
if e.stype == linux.SOCK_STREAM && to != nil {
diff --git a/pkg/sentry/socket/unix/transport/connectionless.go b/pkg/sentry/socket/unix/transport/connectionless.go
index c7f7c5b16..0322dec0b 100644
--- a/pkg/sentry/socket/unix/transport/connectionless.go
+++ b/pkg/sentry/socket/unix/transport/connectionless.go
@@ -99,7 +99,7 @@ func (e *connectionlessEndpoint) UnidirectionalConnect(ctx context.Context) (Con
// SendMsg writes data and a control message to the specified endpoint.
// This method does not block if the data cannot be written.
-func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
if to == nil {
return e.baseEndpoint.SendMsg(ctx, data, c, nil)
}
diff --git a/pkg/sentry/socket/unix/transport/unix.go b/pkg/sentry/socket/unix/transport/unix.go
index 7fb9cb1e0..2b0ad6395 100644
--- a/pkg/sentry/socket/unix/transport/unix.go
+++ b/pkg/sentry/socket/unix/transport/unix.go
@@ -121,13 +121,13 @@ type Endpoint interface {
// CMTruncated indicates that the numRights hint was used to receive fewer
// than the total available SCM_RIGHTS FDs. Additional truncation may be
// required by the caller.
- RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, err *syserr.Error)
+ RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error)
// SendMsg writes data and a control message to the endpoint's peer.
// This method does not block if the data cannot be written.
//
// SendMsg does not take ownership of any of its arguments on error.
- SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (uintptr, *syserr.Error)
+ SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error)
// Connect connects this endpoint directly to another.
//
@@ -291,7 +291,7 @@ type Receiver interface {
// See Endpoint.RecvMsg for documentation on shared arguments.
//
// notify indicates if RecvNotify should be called.
- Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (recvLen, msgLen uintptr, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
+ Recv(data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)
// RecvNotify notifies the Receiver of a successful Recv. This must not be
// called while holding any endpoint locks.
@@ -331,7 +331,7 @@ type queueReceiver struct {
}
// Recv implements Receiver.Recv.
-func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
var m *message
var notify bool
var err *syserr.Error
@@ -344,13 +344,13 @@ func (q *queueReceiver) Recv(data [][]byte, creds bool, numRights uintptr, peek
return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
}
src := []byte(m.Data)
- var copied uintptr
+ var copied int64
for i := 0; i < len(data) && len(src) > 0; i++ {
n := copy(data[i], src)
- copied += uintptr(n)
+ copied += int64(n)
src = src[n:]
}
- return copied, uintptr(len(m.Data)), m.Control, false, m.Address, notify, nil
+ return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil
}
// RecvNotify implements Receiver.RecvNotify.
@@ -401,11 +401,11 @@ type streamQueueReceiver struct {
addr tcpip.FullAddress
}
-func vecCopy(data [][]byte, buf []byte) (uintptr, [][]byte, []byte) {
- var copied uintptr
+func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) {
+ var copied int64
for len(data) > 0 && len(buf) > 0 {
n := copy(data[0], buf)
- copied += uintptr(n)
+ copied += int64(n)
buf = buf[n:]
data[0] = data[0][n:]
if len(data[0]) == 0 {
@@ -443,7 +443,7 @@ func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
}
// Recv implements Receiver.Recv.
-func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uintptr, peek bool) (uintptr, uintptr, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
q.mu.Lock()
defer q.mu.Unlock()
@@ -464,7 +464,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint
q.addr = m.Address
}
- var copied uintptr
+ var copied int64
if peek {
// Don't consume control message if we are peeking.
c := q.control.Clone()
@@ -531,7 +531,7 @@ func (q *streamQueueReceiver) Recv(data [][]byte, wantCreds bool, numRights uint
break
}
- var cpd uintptr
+ var cpd int64
cpd, data, q.buffer = vecCopy(data, q.buffer)
copied += cpd
@@ -569,7 +569,7 @@ type ConnectedEndpoint interface {
//
// syserr.ErrWouldBlock can be returned along with a partial write if
// the caller should block to send the rest of the data.
- Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n uintptr, notify bool, err *syserr.Error)
+ Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)
// SendNotify notifies the ConnectedEndpoint of a successful Send. This
// must not be called while holding any endpoint locks.
@@ -637,7 +637,7 @@ func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error)
}
// Send implements ConnectedEndpoint.Send.
-func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (uintptr, bool, *syserr.Error) {
+func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
var l int64
for _, d := range data {
l += int64(len(d))
@@ -665,7 +665,7 @@ func (e *connectedEndpoint) Send(data [][]byte, controlMessages ControlMessages,
}
l, notify, err := e.writeQueue.Enqueue(&message{Data: buffer.View(v), Control: controlMessages, Address: from}, truncate)
- return uintptr(l), notify, err
+ return int64(l), notify, err
}
// SendNotify implements ConnectedEndpoint.SendNotify.
@@ -781,7 +781,7 @@ func (e *baseEndpoint) Connected() bool {
}
// RecvMsg reads data and a control message from the endpoint.
-func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights uintptr, peek bool, addr *tcpip.FullAddress) (uintptr, uintptr, ControlMessages, bool, *syserr.Error) {
+func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) {
e.Lock()
if e.receiver == nil {
@@ -807,7 +807,7 @@ func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, n
// SendMsg writes data and a control message to the endpoint's peer.
// This method does not block if the data cannot be written.
-func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (uintptr, *syserr.Error) {
+func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
e.Lock()
if !e.Connected() {
e.Unlock()
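
Note: vecCopy earlier in this file's diff now reports its count as int64. A standalone toy version (hypothetical names, not part of this patch) showing the scatter copy it performs across an iovec-like [][]byte:

package main

import "fmt"

// scatter mirrors vecCopy's logic: fill the [][]byte views in order from buf
// and return how many bytes were copied.
func scatter(data [][]byte, buf []byte) int64 {
    var copied int64
    for len(data) > 0 && len(buf) > 0 {
        n := copy(data[0], buf)
        copied += int64(n)
        buf = buf[n:]
        data[0] = data[0][n:]
        if len(data[0]) == 0 {
            data = data[1:]
        }
    }
    return copied
}

func main() {
    a, b := make([]byte, 3), make([]byte, 5)
    n := scatter([][]byte{a, b}, []byte("hello, world"))
    fmt.Println(n, string(a), string(b)) // 8 hel lo, w
}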
diff --git a/pkg/sentry/socket/unix/unix.go b/pkg/sentry/socket/unix/unix.go
index eb262ecaf..0d0cb68df 100644
--- a/pkg/sentry/socket/unix/unix.go
+++ b/pkg/sentry/socket/unix/unix.go
@@ -116,7 +116,7 @@ func (s *SocketOperations) Endpoint() transport.Endpoint {
// extractPath extracts and validates the address.
func extractPath(sockaddr []byte) (string, *syserr.Error) {
- addr, err := epsocket.GetAddress(linux.AF_UNIX, sockaddr, true /* strict */)
+ addr, _, err := epsocket.AddressAndFamily(linux.AF_UNIX, sockaddr, true /* strict */)
if err != nil {
return "", err
}
@@ -137,7 +137,7 @@ func extractPath(sockaddr []byte) (string, *syserr.Error) {
// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.ep.GetRemoteAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -149,7 +149,7 @@ func (s *SocketOperations) GetPeerName(t *kernel.Task) (interface{}, uint32, *sy
// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetSockName(t *kernel.Task) (interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
addr, err := s.ep.GetLocalAddress()
if err != nil {
return nil, 0, syserr.TranslateNetstackError(err)
@@ -166,7 +166,7 @@ func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO,
// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name, outLen int) (interface{}, *syserr.Error) {
+func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr usermem.Addr, outLen int) (interface{}, *syserr.Error) {
return epsocket.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outLen)
}
@@ -199,7 +199,7 @@ func (s *SocketOperations) blockingAccept(t *kernel.Task) (transport.Endpoint, *
// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, interface{}, uint32, *syserr.Error) {
+func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
// Issue the accept request to get the new endpoint.
ep, err := s.ep.Accept()
if err != nil {
@@ -223,7 +223,7 @@ func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int,
ns.SetFlags(flags.Settable())
}
- var addr interface{}
+ var addr linux.SockAddr
var addrLen uint32
if peerRequested {
// Get address of the peer.
@@ -505,7 +505,7 @@ func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOS
// RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
// a transport.Endpoint.
-func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr interface{}, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
+func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
trunc := flags&linux.MSG_TRUNC != 0
peek := flags&linux.MSG_PEEK != 0
dontWait := flags&linux.MSG_DONTWAIT != 0
@@ -535,7 +535,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
Ctx: t,
Endpoint: s.ep,
Creds: wantCreds,
- NumRights: uintptr(numRights),
+ NumRights: numRights,
Peek: peek,
}
if senderRequested {
@@ -543,7 +543,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
}
var total int64
if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock || dontWait {
- var from interface{}
+ var from linux.SockAddr
var fromLen uint32
if r.From != nil && len([]byte(r.From.Addr)) != 0 {
from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
@@ -578,7 +578,7 @@ func (s *SocketOperations) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags
for {
if n, err := dst.CopyOutFrom(t, &r); err != syserror.ErrWouldBlock {
- var from interface{}
+ var from linux.SockAddr
var fromLen uint32
if r.From != nil {
from, fromLen = epsocket.ConvertAddress(linux.AF_UNIX, *r.From)
diff --git a/pkg/sentry/state/BUILD b/pkg/sentry/state/BUILD
index f297ef3b7..88765f4d6 100644
--- a/pkg/sentry/state/BUILD
+++ b/pkg/sentry/state/BUILD
@@ -16,6 +16,7 @@ go_library(
"//pkg/log",
"//pkg/sentry/inet",
"//pkg/sentry/kernel",
+ "//pkg/sentry/time",
"//pkg/sentry/watchdog",
"//pkg/state/statefile",
"//pkg/syserror",
diff --git a/pkg/sentry/state/state.go b/pkg/sentry/state/state.go
index 026549756..9eb626b76 100644
--- a/pkg/sentry/state/state.go
+++ b/pkg/sentry/state/state.go
@@ -22,6 +22,7 @@ import (
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
+ "gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/state/statefile"
"gvisor.dev/gvisor/pkg/syserror"
@@ -104,7 +105,7 @@ type LoadOpts struct {
}
// Load loads the given kernel, setting the provided platform and stack.
-func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error {
+func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack, clocks time.Clocks) error {
// Open the file.
r, m, err := statefile.NewReader(opts.Source, opts.Key)
if err != nil {
@@ -114,5 +115,5 @@ func (opts LoadOpts) Load(k *kernel.Kernel, n inet.Stack) error {
previousMetadata = m
// Restore the Kernel object graph.
- return k.LoadFrom(r, n)
+ return k.LoadFrom(r, n, clocks)
}
diff --git a/pkg/sentry/strace/socket.go b/pkg/sentry/strace/socket.go
index 386b40af7..f779186ad 100644
--- a/pkg/sentry/strace/socket.go
+++ b/pkg/sentry/strace/socket.go
@@ -332,7 +332,7 @@ func sockAddr(t *kernel.Task, addr usermem.Addr, length uint32) string {
switch family {
case linux.AF_INET, linux.AF_INET6, linux.AF_UNIX:
- fa, err := epsocket.GetAddress(int(family), b, true /* strict */)
+ fa, _, err := epsocket.AddressAndFamily(int(family), b, true /* strict */)
if err != nil {
return fmt.Sprintf("%#x {Family: %s, error extracting address: %v}", addr, familyStr, err)
}
diff --git a/pkg/sentry/syscalls/linux/error.go b/pkg/sentry/syscalls/linux/error.go
index 264301bfa..1d9018c96 100644
--- a/pkg/sentry/syscalls/linux/error.go
+++ b/pkg/sentry/syscalls/linux/error.go
@@ -91,6 +91,10 @@ func handleIOError(t *kernel.Task, partialResult bool, err, intr error, op strin
// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
// also be sent to the application.
return nil
+ case syserror.ENOSPC:
+ // Similar to EPIPE. Return what we wrote this time, and let
+ // ENOSPC be returned on the next call.
+ return nil
case syserror.ECONNRESET:
// For TCP sendfile connections, we may have a reset. But we
// should just return n as the result.
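The new ENOSPC case above follows the same rule as EPIPE: a write that already made partial progress reports the bytes written, and the error is surfaced only on the next call, when no progress is possible. A minimal standalone Go sketch of that rule (reportWrite is illustrative, not the sentry's handleIOError):

package main

import (
	"fmt"
	"syscall"
)

// reportWrite mirrors the partial-result rule: if some bytes were written
// before EPIPE or ENOSPC, report the byte count now and let the error be
// returned on the next call, which will make no progress.
func reportWrite(n int, err error) (int, error) {
	if n > 0 && (err == syscall.EPIPE || err == syscall.ENOSPC) {
		return n, nil
	}
	return n, err
}

func main() {
	fmt.Println(reportWrite(512, syscall.ENOSPC)) // 512 <nil>
	fmt.Println(reportWrite(0, syscall.ENOSPC))   // 0 no space left on device
}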
diff --git a/pkg/sentry/syscalls/linux/linux64.go b/pkg/sentry/syscalls/linux/linux64.go
index 51db2d8f7..ed996ba51 100644
--- a/pkg/sentry/syscalls/linux/linux64.go
+++ b/pkg/sentry/syscalls/linux/linux64.go
@@ -30,8 +30,7 @@ import (
const _AUDIT_ARCH_X86_64 = 0xc000003e
// AMD64 is a table of Linux amd64 syscall API with the corresponding syscall
-// numbers from Linux 4.4. The entries commented out are those syscalls we
-// don't currently support.
+// numbers from Linux 4.4.
var AMD64 = &kernel.SyscallTable{
OS: abi.Linux,
Arch: arch.AMD64,
diff --git a/pkg/sentry/syscalls/linux/sys_epoll.go b/pkg/sentry/syscalls/linux/sys_epoll.go
index 4a2b9f061..65b4a227b 100644
--- a/pkg/sentry/syscalls/linux/sys_epoll.go
+++ b/pkg/sentry/syscalls/linux/sys_epoll.go
@@ -107,19 +107,20 @@ func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
// copyOutEvents copies epoll events from the kernel to user memory.
func copyOutEvents(t *kernel.Task, addr usermem.Addr, e []epoll.Event) error {
const itemLen = 12
- if _, ok := addr.AddLength(uint64(len(e)) * itemLen); !ok {
+ buffLen := len(e) * itemLen
+ if _, ok := addr.AddLength(uint64(buffLen)); !ok {
return syserror.EFAULT
}
- b := t.CopyScratchBuffer(itemLen)
+ b := t.CopyScratchBuffer(buffLen)
for i := range e {
- usermem.ByteOrder.PutUint32(b[0:], e[i].Events)
- usermem.ByteOrder.PutUint32(b[4:], uint32(e[i].Data[0]))
- usermem.ByteOrder.PutUint32(b[8:], uint32(e[i].Data[1]))
- if _, err := t.CopyOutBytes(addr, b); err != nil {
- return err
- }
- addr += itemLen
+ usermem.ByteOrder.PutUint32(b[i*itemLen:], e[i].Events)
+ usermem.ByteOrder.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0]))
+ usermem.ByteOrder.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1]))
+ }
+
+ if _, err := t.CopyOutBytes(addr, b); err != nil {
+ return err
}
return nil
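The rewritten copyOutEvents serializes every event into one scratch buffer of len(e) * itemLen bytes and then performs a single CopyOutBytes, instead of one copy-out per event. A standalone sketch of the same packing, assuming the 12-byte packed epoll_event layout that itemLen encodes (the event type and packEvents are illustrative names, not sentry code):

package main

import (
	"encoding/binary"
	"fmt"
)

// event mirrors the assumed packed layout: 4 bytes of event flags plus
// 8 bytes of user data.
type event struct {
	Events uint32
	Data   [2]int32
}

// packEvents writes every event into one buffer so a single copy-out suffices.
func packEvents(e []event) []byte {
	const itemLen = 12
	b := make([]byte, len(e)*itemLen)
	for i := range e {
		binary.LittleEndian.PutUint32(b[i*itemLen:], e[i].Events)
		binary.LittleEndian.PutUint32(b[i*itemLen+4:], uint32(e[i].Data[0]))
		binary.LittleEndian.PutUint32(b[i*itemLen+8:], uint32(e[i].Data[1]))
	}
	return b
}

func main() {
	fmt.Println(len(packEvents(make([]event, 3)))) // 36 bytes for 3 events
}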
diff --git a/pkg/sentry/syscalls/linux/sys_getdents.go b/pkg/sentry/syscalls/linux/sys_getdents.go
index 63e2c5a5d..912cbe4ff 100644
--- a/pkg/sentry/syscalls/linux/sys_getdents.go
+++ b/pkg/sentry/syscalls/linux/sys_getdents.go
@@ -120,7 +120,7 @@ func newDirent(width uint, name string, attr fs.DentAttr, offset uint64) *dirent
Ino: attr.InodeID,
Off: offset,
},
- Typ: toType(attr.Type),
+ Typ: fs.ToDirentType(attr.Type),
},
Name: []byte(name),
}
@@ -142,28 +142,6 @@ func smallestDirent64(a arch.Context) uint {
return uint(binary.Size(d.Hdr)) + a.Width()
}
-// toType converts an fs.InodeOperationsInfo to a linux dirent typ field.
-func toType(nodeType fs.InodeType) uint8 {
- switch nodeType {
- case fs.RegularFile, fs.SpecialFile:
- return linux.DT_REG
- case fs.Symlink:
- return linux.DT_LNK
- case fs.Directory, fs.SpecialDirectory:
- return linux.DT_DIR
- case fs.Pipe:
- return linux.DT_FIFO
- case fs.CharacterDevice:
- return linux.DT_CHR
- case fs.BlockDevice:
- return linux.DT_BLK
- case fs.Socket:
- return linux.DT_SOCK
- default:
- return linux.DT_UNKNOWN
- }
-}
-
// padRec pads the name field until the rec length is a multiple of the width,
// which must be a power of 2. It returns the padded rec length.
func (d *dirent) padRec(width int) uint16 {
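padRec's body is not part of this hunk; per the comment above, it pads the name field until the record length is a multiple of width, which must be a power of 2. A minimal sketch of that rounding, with an illustrative padTo helper:

package main

import "fmt"

// padTo rounds recLen up to the next multiple of width, where width is a
// power of 2, using the usual mask trick.
func padTo(recLen, width int) int {
	return (recLen + width - 1) &^ (width - 1)
}

func main() {
	fmt.Println(padTo(19, 8)) // 24
	fmt.Println(padTo(24, 8)) // 24 (already aligned)
}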
diff --git a/pkg/sentry/syscalls/linux/sys_mount.go b/pkg/sentry/syscalls/linux/sys_mount.go
index 9080a10c3..8c13e2d82 100644
--- a/pkg/sentry/syscalls/linux/sys_mount.go
+++ b/pkg/sentry/syscalls/linux/sys_mount.go
@@ -109,9 +109,17 @@ func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return 0, nil, syserror.EINVAL
}
- return 0, nil, fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ if err := fileOpOn(t, linux.AT_FDCWD, targetPath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent, _ uint) error {
+ // Mount will take a reference on rootInode if successful.
return t.MountNamespace().Mount(t, d, rootInode)
- })
+ }); err != nil {
+ // Something went wrong. Drop our ref on rootInode before
+ // returning the error.
+ rootInode.DecRef()
+ return 0, nil, err
+ }
+
+ return 0, nil, nil
}
// Umount2 implements Linux syscall umount2(2).
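The comments in the Mount hunk above spell out the reference-ownership rule the fix depends on: the caller holds a reference on rootInode, Mount keeps it only on success, and on failure the caller must drop it before returning. A toy sketch of that rule (ref and mount are illustrative, not sentry types):

package main

import (
	"errors"
	"fmt"
)

// ref is a toy reference counter standing in for an inode reference.
type ref struct{ n int }

func (r *ref) IncRef() { r.n++ }
func (r *ref) DecRef() { r.n-- }

// mount keeps the caller's reference only when it succeeds.
func mount(r *ref, ok bool) error {
	if !ok {
		return errors.New("mount failed")
	}
	return nil // the mount namespace now owns the reference
}

func main() {
	r := &ref{}
	r.IncRef()
	if err := mount(r, false); err != nil {
		r.DecRef() // error path: drop our reference so it is not leaked
	}
	fmt.Println(r.n) // 0
}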
diff --git a/pkg/sentry/syscalls/linux/sys_read.go b/pkg/sentry/syscalls/linux/sys_read.go
index b2474e60d..3ab54271c 100644
--- a/pkg/sentry/syscalls/linux/sys_read.go
+++ b/pkg/sentry/syscalls/linux/sys_read.go
@@ -191,7 +191,6 @@ func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
}
// Preadv2 implements linux syscall preadv2(2).
-// TODO(b/120162627): Implement RWF_HIPRI functionality.
func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
// While the syscall is
// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
@@ -228,6 +227,8 @@ func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Check flags field.
+ // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+ // accepted as a valid flag argument for preadv2.
if flags&^linux.RWF_VALID != 0 {
return 0, nil, syserror.EOPNOTSUPP
}
diff --git a/pkg/sentry/syscalls/linux/sys_socket.go b/pkg/sentry/syscalls/linux/sys_socket.go
index fa568a660..3bac4d90d 100644
--- a/pkg/sentry/syscalls/linux/sys_socket.go
+++ b/pkg/sentry/syscalls/linux/sys_socket.go
@@ -460,7 +460,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
}
// Call syscall implementation then copy both value and value len out.
- v, e := getSockOpt(t, s, int(level), int(name), int(optLen))
+ v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
if e != nil {
return 0, nil, e.ToError()
}
@@ -483,7 +483,7 @@ func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
// getSockOpt tries to handle common socket options, or dispatches to a specific
// socket implementation.
-func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interface{}, *syserr.Error) {
+func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr usermem.Addr, len int) (interface{}, *syserr.Error) {
if level == linux.SOL_SOCKET {
switch name {
case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
@@ -505,7 +505,7 @@ func getSockOpt(t *kernel.Task, s socket.Socket, level, name, len int) (interfac
}
}
- return s.GetSockOpt(t, level, name, len)
+ return s.GetSockOpt(t, level, name, optValAddr, len)
}
// SetSockOpt implements the linux syscall setsockopt(2).
diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go
index a7c98efcb..8a98fedcb 100644
--- a/pkg/sentry/syscalls/linux/sys_splice.go
+++ b/pkg/sentry/syscalls/linux/sys_splice.go
@@ -91,22 +91,29 @@ func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
}
// Get files.
+ inFile := t.GetFile(inFD)
+ if inFile == nil {
+ return 0, nil, syserror.EBADF
+ }
+ defer inFile.DecRef()
+
+ if !inFile.Flags().Read {
+ return 0, nil, syserror.EBADF
+ }
+
outFile := t.GetFile(outFD)
if outFile == nil {
return 0, nil, syserror.EBADF
}
defer outFile.DecRef()
- inFile := t.GetFile(inFD)
- if inFile == nil {
+ if !outFile.Flags().Write {
return 0, nil, syserror.EBADF
}
- defer inFile.DecRef()
- // Verify that the outfile Append flag is not set. Note that fs.Splice
- // itself validates that the output file is writable.
+ // Verify that the outfile Append flag is not set.
if outFile.Flags().Append {
- return 0, nil, syserror.EBADF
+ return 0, nil, syserror.EINVAL
}
// Verify that we have a regular infile. This is a requirement; the
@@ -207,6 +214,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if outOffset != 0 {
+ if !outFile.Flags().Pwrite {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(outOffset, &offset); err != nil {
return 0, nil, err
@@ -220,6 +231,10 @@ func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, syserror.ESPIPE
}
if inOffset != 0 {
+ if !inFile.Flags().Pread {
+ return 0, nil, syserror.EINVAL
+ }
+
var offset int64
if _, err := t.CopyIn(inOffset, &offset); err != nil {
return 0, nil, err
diff --git a/pkg/sentry/syscalls/linux/sys_thread.go b/pkg/sentry/syscalls/linux/sys_thread.go
index 595eb9155..8ab7ffa25 100644
--- a/pkg/sentry/syscalls/linux/sys_thread.go
+++ b/pkg/sentry/syscalls/linux/sys_thread.go
@@ -96,7 +96,7 @@ func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
// Load the new TaskContext.
maxTraversals := uint(linux.MaxSymlinkTraversals)
- tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, argv, envv, t.Arch().FeatureSet())
+ tc, se := t.Kernel().LoadTaskImage(t, t.MountNamespace(), root, wd, &maxTraversals, filename, nil, argv, envv, t.Arch().FeatureSet())
if se != nil {
return 0, nil, se.ToError()
}
diff --git a/pkg/sentry/syscalls/linux/sys_write.go b/pkg/sentry/syscalls/linux/sys_write.go
index 5278c96a6..27cd2c336 100644
--- a/pkg/sentry/syscalls/linux/sys_write.go
+++ b/pkg/sentry/syscalls/linux/sys_write.go
@@ -191,7 +191,6 @@ func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysca
}
// Pwritev2 implements linux syscall pwritev2(2).
-// TODO(b/120162627): Implement RWF_HIPRI functionality.
// TODO(b/120161091): Implement O_SYNC and D_SYNC functionality.
func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
// While the syscall is
@@ -227,6 +226,8 @@ func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, syserror.ESPIPE
}
+ // Note: gVisor does not implement the RWF_HIPRI feature, but the flag is
+ // accepted as a valid flag argument for pwritev2.
if flags&^linux.RWF_VALID != 0 {
return uintptr(flags), nil, syserror.EOPNOTSUPP
}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 4de6c41cf..0f247bf77 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -18,6 +18,7 @@ go_library(
"permissions.go",
"resolving_path.go",
"syscalls.go",
+ "testutil.go",
"vfs.go",
],
importpath = "gvisor.dev/gvisor/pkg/sentry/vfs",
@@ -40,7 +41,16 @@ go_test(
name = "vfs_test",
size = "small",
srcs = [
+ "file_description_impl_util_test.go",
"mount_test.go",
],
embed = [":vfs"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/sentry/context",
+ "//pkg/sentry/context/contexttest",
+ "//pkg/sentry/kernel/auth",
+ "//pkg/sentry/usermem",
+ "//pkg/syserror",
+ ],
)
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index 486893e70..ba230da72 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -15,6 +15,10 @@
package vfs
import (
+ "bytes"
+ "io"
+ "sync"
+
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/context"
@@ -24,6 +28,16 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
+// The following design pattern is strongly recommended for filesystem
+// implementations to adopt:
+// - Have a local fileDescription struct (containing FileDescription) which
+// embeds FileDescriptionDefaultImpl and overrides the default methods
+// which are common to all fd implementations for that filesystem,
+// like StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
+// - This should be embedded in all file description implementations as the
+// first field by value.
+// - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl.
+
// FileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl to obtain implementations of many FileDescriptionImpl
// methods with default behavior analogous to Linux's.
@@ -115,11 +129,8 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
// DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
// FileDescriptionImpl that always represent directories to obtain
-// implementations of non-directory I/O methods that return EISDIR, and
-// implementations of other methods consistent with FileDescriptionDefaultImpl.
-type DirectoryFileDescriptionDefaultImpl struct {
- FileDescriptionDefaultImpl
-}
+// implementations of non-directory I/O methods that return EISDIR.
+type DirectoryFileDescriptionDefaultImpl struct{}
// PRead implements FileDescriptionImpl.PRead.
func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
@@ -140,3 +151,104 @@ func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src userm
func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
return 0, syserror.EISDIR
}
+
+// DynamicBytesFileDescriptionImpl may be embedded by implementations of
+// FileDescriptionImpl that represent read-only regular files whose contents
+// are backed by a bytes.Buffer that is regenerated when necessary, consistent
+// with Linux's fs/seq_file.c:single_open().
+//
+// DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first
+// use.
+type DynamicBytesFileDescriptionImpl struct {
+ data DynamicBytesSource // immutable
+ mu sync.Mutex // protects the following fields
+ buf bytes.Buffer
+ off int64
+ lastRead int64 // offset at which the last Read, PRead, or Seek ended
+}
+
+// DynamicBytesSource represents a data source for a
+// DynamicBytesFileDescriptionImpl.
+type DynamicBytesSource interface {
+ // Generate writes the file's contents to buf.
+ Generate(ctx context.Context, buf *bytes.Buffer) error
+}
+
+// SetDataSource must be called exactly once on fd before first use.
+func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) {
+ fd.data = data
+}
+
+// Preconditions: fd.mu must be locked.
+func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) {
+ // Regenerate the buffer if it's empty, or before pread() at a new offset.
+ // Compare fs/seq_file.c:seq_read() => traverse().
+ switch {
+ case offset != fd.lastRead:
+ fd.buf.Reset()
+ fallthrough
+ case fd.buf.Len() == 0:
+ if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+ fd.buf.Reset()
+ // fd.off is not updated in this case.
+ fd.lastRead = 0
+ return 0, err
+ }
+ }
+ bs := fd.buf.Bytes()
+ if offset >= int64(len(bs)) {
+ return 0, io.EOF
+ }
+ n, err := dst.CopyOut(ctx, bs[offset:])
+ fd.lastRead = offset + int64(n)
+ return int64(n), err
+}
+
+// PRead implements FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.preadLocked(ctx, dst, offset, &opts)
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+ fd.mu.Lock()
+ n, err := fd.preadLocked(ctx, dst, fd.off, &opts)
+ fd.off += n
+ fd.mu.Unlock()
+ return n, err
+}
+
+// Seek implements FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+ fd.mu.Lock()
+ defer fd.mu.Unlock()
+ switch whence {
+ case linux.SEEK_SET:
+ // Use offset as given.
+ case linux.SEEK_CUR:
+ offset += fd.off
+ default:
+ // fs/seq_file.c:seq_lseek() rejects SEEK_END etc.
+ return 0, syserror.EINVAL
+ }
+ if offset < 0 {
+ return 0, syserror.EINVAL
+ }
+ if offset != fd.lastRead {
+ // Regenerate the file's contents immediately. Compare
+ // fs/seq_file.c:seq_lseek() => traverse().
+ fd.buf.Reset()
+ if err := fd.data.Generate(ctx, &fd.buf); err != nil {
+ fd.buf.Reset()
+ fd.off = 0
+ fd.lastRead = 0
+ return 0, err
+ }
+ fd.lastRead = offset
+ }
+ fd.off = offset
+ return offset, nil
+}
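The DynamicBytesSource contract added above is exercised end to end by the new test file below; in isolation, a generator only needs to rebuild its contents on every Generate call. A hypothetical, standalone sketch (hostnameData is illustrative, and the context.Context parameter of the real interface is omitted for brevity):

package main

import (
	"bytes"
	"fmt"
)

// hostnameData regenerates its contents on every call, the way a
// DynamicBytesSource's Generate is expected to.
type hostnameData struct{ name string }

func (d *hostnameData) Generate(buf *bytes.Buffer) error {
	_, err := fmt.Fprintf(buf, "%s\n", d.name)
	return err
}

func main() {
	var buf bytes.Buffer
	d := &hostnameData{name: "sandbox"}
	if err := d.Generate(&buf); err != nil {
		panic(err)
	}
	fmt.Print(buf.String()) // sandbox
}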
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
new file mode 100644
index 000000000..511b829fc
--- /dev/null
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -0,0 +1,141 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "sync/atomic"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/sentry/usermem"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// fileDescription is the common fd struct which a filesystem implementation
+// embeds in all of its file description implementations as required.
+type fileDescription struct {
+ vfsfd FileDescription
+ FileDescriptionDefaultImpl
+}
+
+// genCountFD is a read-only FileDescriptionImpl representing a regular file
+// that contains the number of times its DynamicBytesSource.Generate()
+// implementation has been called.
+type genCountFD struct {
+ fileDescription
+ DynamicBytesFileDescriptionImpl
+
+ count uint64 // accessed using atomic memory ops
+}
+
+func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
+ var fd genCountFD
+ fd.vfsfd.Init(&fd, mnt, vfsd)
+ fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
+ return &fd.vfsfd
+}
+
+// Release implements FileDescriptionImpl.Release.
+func (fd *genCountFD) Release() {
+}
+
+// StatusFlags implements FileDescriptionImpl.StatusFlags.
+func (fd *genCountFD) StatusFlags(ctx context.Context) (uint32, error) {
+ return 0, nil
+}
+
+// SetStatusFlags implements FileDescriptionImpl.SetStatusFlags.
+func (fd *genCountFD) SetStatusFlags(ctx context.Context, flags uint32) error {
+ return syserror.EPERM
+}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (fd *genCountFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+ // Note that Statx.Mask == 0 in the return value.
+ return linux.Statx{}, nil
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (fd *genCountFD) SetStat(ctx context.Context, opts SetStatOptions) error {
+ return syserror.EPERM
+}
+
+// Generate implements DynamicBytesSource.Generate.
+func (fd *genCountFD) Generate(ctx context.Context, buf *bytes.Buffer) error {
+ fmt.Fprintf(buf, "%d", atomic.AddUint64(&fd.count, 1))
+ return nil
+}
+
+func TestGenCountFD(t *testing.T) {
+ ctx := contexttest.Context(t)
+ creds := auth.CredentialsFromContext(ctx)
+
+ vfsObj := New() // vfs.New()
+ vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+ mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{})
+ if err != nil {
+ t.Fatalf("failed to create testfs root mount: %v", err)
+ }
+ vd := mntns.Root()
+ defer vd.DecRef()
+
+ fd := newGenCountFD(vd.Mount(), vd.Dentry())
+ defer fd.DecRef()
+
+ // The first read causes Generate to be called to fill the FD's buffer.
+ buf := make([]byte, 2)
+ ioseq := usermem.BytesIOSequence(buf)
+ n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ if n != 1 || (err != nil && err != io.EOF) {
+ t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
+ }
+ if want := byte('1'); buf[0] != want {
+ t.Errorf("first Read: got byte %c, wanted %c", buf[0], want)
+ }
+
+ // A second read without seeking is still at EOF.
+ n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ if n != 0 || err != io.EOF {
+ t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
+ }
+
+ // Seeking to the beginning of the file causes it to be regenerated.
+ n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+ if n != 0 || err != nil {
+ t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
+ }
+ n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+ if n != 1 || (err != nil && err != io.EOF) {
+ t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
+ }
+ if want := byte('2'); buf[0] != want {
+ t.Errorf("Read after Seek: got byte %c, wanted %c", buf[0], want)
+ }
+
+ // PRead at the beginning of the file also causes it to be regenerated.
+ n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+ if n != 1 || (err != nil && err != io.EOF) {
+ t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
+ }
+ if want := byte('3'); buf[0] != want {
+ t.Errorf("PRead: got byte %c, wanted %c", buf[0], want)
+ }
+}
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
new file mode 100644
index 000000000..70b192ece
--- /dev/null
+++ b/pkg/sentry/vfs/testutil.go
@@ -0,0 +1,139 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/sentry/context"
+ "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+ "gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FDTestFilesystemType is a test-only FilesystemType that produces Filesystems
+// for which all FilesystemImpl methods taking a path return EPERM. It is used
+// to produce Mounts and Dentries for testing of FileDescriptionImpls that do
+// not depend on their originating Filesystem.
+type FDTestFilesystemType struct{}
+
+// FDTestFilesystem is a test-only FilesystemImpl produced by
+// FDTestFilesystemType.
+type FDTestFilesystem struct {
+ vfsfs Filesystem
+}
+
+// NewFilesystem implements FilesystemType.NewFilesystem.
+func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) {
+ var fs FDTestFilesystem
+ fs.vfsfs.Init(&fs)
+ return &fs.vfsfs, fs.NewDentry(), nil
+}
+
+// Release implements FilesystemImpl.Release.
+func (fs *FDTestFilesystem) Release() {
+}
+
+// Sync implements FilesystemImpl.Sync.
+func (fs *FDTestFilesystem) Sync(ctx context.Context) error {
+ return nil
+}
+
+// GetDentryAt implements FilesystemImpl.GetDentryAt.
+func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) {
+ return nil, syserror.EPERM
+}
+
+// LinkAt implements FilesystemImpl.LinkAt.
+func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
+ return syserror.EPERM
+}
+
+// MkdirAt implements FilesystemImpl.MkdirAt.
+func (fs *FDTestFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error {
+ return syserror.EPERM
+}
+
+// MknodAt implements FilesystemImpl.MknodAt.
+func (fs *FDTestFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error {
+ return syserror.EPERM
+}
+
+// OpenAt implements FilesystemImpl.OpenAt.
+func (fs *FDTestFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) {
+ return nil, syserror.EPERM
+}
+
+// ReadlinkAt implements FilesystemImpl.ReadlinkAt.
+func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) {
+ return "", syserror.EPERM
+}
+
+// RenameAt implements FilesystemImpl.RenameAt.
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+ return syserror.EPERM
+}
+
+// RmdirAt implements FilesystemImpl.RmdirAt.
+func (fs *FDTestFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error {
+ return syserror.EPERM
+}
+
+// SetStatAt implements FilesystemImpl.SetStatAt.
+func (fs *FDTestFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error {
+ return syserror.EPERM
+}
+
+// StatAt implements FilesystemImpl.StatAt.
+func (fs *FDTestFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) {
+ return linux.Statx{}, syserror.EPERM
+}
+
+// StatFSAt implements FilesystemImpl.StatFSAt.
+func (fs *FDTestFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) {
+ return linux.Statfs{}, syserror.EPERM
+}
+
+// SymlinkAt implements FilesystemImpl.SymlinkAt.
+func (fs *FDTestFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error {
+ return syserror.EPERM
+}
+
+// UnlinkAt implements FilesystemImpl.UnlinkAt.
+func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error {
+ return syserror.EPERM
+}
+
+type fdTestDentry struct {
+ vfsd Dentry
+}
+
+// NewDentry returns a new Dentry.
+func (fs *FDTestFilesystem) NewDentry() *Dentry {
+ var d fdTestDentry
+ d.vfsd.Init(&d)
+ return &d.vfsd
+}
+
+// IncRef implements DentryImpl.IncRef.
+func (d *fdTestDentry) IncRef(vfsfs *Filesystem) {
+}
+
+// TryIncRef implements DentryImpl.TryIncRef.
+func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool {
+ return true
+}
+
+// DecRef implements DentryImpl.DecRef.
+func (d *fdTestDentry) DecRef(vfsfs *Filesystem) {
+}